commit 34061c7ab797e44678c16711bd0a34fa4e8f5f0c Author: ModelHub XC Date: Sat May 30 10:31:46 2026 +0800 初始化项目,由ModelHub XC社区提供模型 Model: Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.10 Source: Original Platform diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..52373fe --- /dev/null +++ b/.gitattributes @@ -0,0 +1,36 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tar filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text +tokenizer.json filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md 
new file mode 100644 index 0000000..f83486f --- /dev/null +++ b/README.md @@ -0,0 +1,60 @@ +--- +base_model: meta-llama/Llama-3.1-8B-Instruct +datasets: Neelectric/OpenR1-Math-220k_all_Llama3_4096toks +library_name: transformers +model_name: Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.10 +tags: +- generated_from_trainer +- sft +- trl +- open-r1 +license: llama3.1 +--- + +# Model Card for Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.10 + +This model is a fine-tuned version of [meta-llama/Llama-3.1-8B-Instruct](https://huggingface.co/meta-llama/Llama-3.1-8B-Instruct) on the [Neelectric/OpenR1-Math-220k_all_Llama3_4096toks](https://huggingface.co/datasets/Neelectric/OpenR1-Math-220k_all_Llama3_4096toks) dataset. +It has been trained using [TRL](https://github.com/huggingface/trl). + +## Quick start + +```python +from transformers import pipeline + +question = "If you had a time machine, but could only go to the past or the future once and never return, which would you choose and why?" +generator = pipeline("text-generation", model="Neelectric/Llama-3.1-8B-Instruct_SFT_mathsp_ewc_v00.10", device="cuda") +output = generator([{"role": "user", "content": question}], max_new_tokens=128, return_full_text=False)[0] +print(output["generated_text"]) +``` + +## Training procedure + +[Visualize in Weights & Biases](https://wandb.ai/neelectric/open-r1_math/runs/46l7kegv) + + + +This model was trained with SFT. 
+ +### Framework versions + +- TRL: 1.1.0.dev0 +- Transformers: 4.57.6 +- Pytorch: 2.9.0 +- Datasets: 4.8.5 +- Tokenizers: 0.22.2 + +## Citations + + + +Cite TRL as: + +```bibtex +@software{vonwerra2020trl, + title = {{TRL: Transformers Reinforcement Learning}}, + author = {von Werra, Leandro and Belkada, Younes and Tunstall, Lewis and Beeching, Edward and Thrush, Tristan and Lambert, Nathan and Huang, Shengyi and Rasul, Kashif and Gallouédec, Quentin}, + license = {Apache-2.0}, + url = {https://github.com/huggingface/trl}, + year = {2020} +} +``` \ No newline at end of file diff --git a/all_results.json b/all_results.json new file mode 100644 index 0000000..c07f092 --- /dev/null +++ b/all_results.json @@ -0,0 +1,11 @@ +{ + "ewc_loss": 0.052088044583797455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6044022888527252e-05, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.0, + "train_runtime": 852.4232, + "train_samples": 125770, + "train_samples_per_second": 442.632, + "train_steps_per_second": 27.666 +} \ No newline at end of file diff --git a/chat_template.jinja b/chat_template.jinja new file mode 100644 index 0000000..0ab931a --- /dev/null +++ b/chat_template.jinja @@ -0,0 +1,121 @@ +{{- bos_token }} +{%- if custom_tools is defined %} + {%- set tools = custom_tools %} +{%- endif %} +{%- if not tools_in_user_message is defined %} + {%- set tools_in_user_message = true %} +{%- endif %} +{%- if not date_string is defined %} + {%- set date_string = "26 Jul 2024" %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} + +{#- This block extracts the system message, so we can slot it into the right place. #} +{%- if messages[0]['role'] == 'system' %} + {%- set system_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} +{%- else %} + {%- set system_message = "You are a helpful AI Assistant that provides well-reasoned and detailed responses. 
You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think> +... +</think> +<answer> +... +</answer>" %} +{%- endif %} + +{#- System message + builtin tools #} +{{- "<|start_header_id|>system<|end_header_id|>\n\n" }} +{%- if builtin_tools is defined or tools is not none %} + {{- "Environment: ipython\n" }} +{%- endif %} +{%- if builtin_tools is defined %} + {{- "Tools: " + builtin_tools | reject('equalto', 'code_interpreter') | join(", ") + "\n\n"}} +{%- endif %} +{{- "Cutting Knowledge Date: December 2023\n" }} +{{- "Today Date: " + date_string + "\n\n" }} +{%- if tools is not none and not tools_in_user_message %} + {{- "You have access to the following functions. To call a function, please respond with JSON for a function call." }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' }} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} +{%- endif %} +{{- system_message }} +{{- "<|eot_id|>" }} + +{#- Custom tools are passed in a user message with some extra guidance #} +{%- if tools_in_user_message and not tools is none %} + {#- Extract the first user message so we can plug it in here #} + {%- if messages | length != 0 %} + {%- set first_user_message = messages[0]['content']|trim %} + {%- set messages = messages[1:] %} + {%- else %} + {{- raise_exception("Cannot put tools in the first user message when there's no first user message!") }} + {%- endif %} + {{- '<|start_header_id|>user<|end_header_id|>\n\n' -}} + {{- "Given the following functions, please respond with a JSON for a function call " }} + {{- "with its proper arguments that best answers the given prompt.\n\n" }} + {{- 'Respond in the format {"name": function name, "parameters": dictionary of argument name and its value}.' 
}} + {{- "Do not use variables.\n\n" }} + {%- for t in tools %} + {{- t | tojson(indent=4) }} + {{- "\n\n" }} + {%- endfor %} + {{- first_user_message + "<|eot_id|>"}} +{%- endif %} + +{%- for message in messages %} + {%- if not (message.role == 'ipython' or message.role == 'tool' or 'tool_calls' in message) %} + {%- if message['role'] == 'assistant' %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} + {% generation %} + {{- message['content'] | trim + '<|eot_id|>' }} + {% endgeneration %} + {%- else %} + {{- '<|start_header_id|>' + message['role'] + '<|end_header_id|>\n\n'+ message['content'] | trim + '<|eot_id|>' }} + {%- endif %} + {%- elif 'tool_calls' in message %} + {%- if not message.tool_calls|length == 1 %} + {{- raise_exception("This model only supports single tool-calls at once!") }} + {%- endif %} + {%- set tool_call = message.tool_calls[0].function %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' -}} + {% generation %} + {%- if builtin_tools is defined and tool_call.name in builtin_tools %} + {{- "<|python_tag|>" + tool_call.name + ".call(" }} + {%- for arg_name, arg_val in tool_call.arguments | items %} + {{- arg_name + '="' + arg_val + '"' }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- else %} + {{- '{"name": "' + tool_call.name + '", ' }} + {{- '"parameters": ' }} + {{- tool_call.arguments | tojson }} + {{- "}" }} + {%- endif %} + {%- if builtin_tools is defined %} + {{- "<|eom_id|>" }} + {%- else %} + {{- "<|eot_id|>" }} + {%- endif %} + {% endgeneration %} + {%- elif message.role == "tool" or message.role == "ipython" %} + {{- "<|start_header_id|>ipython<|end_header_id|>\n\n" }} + {%- if message.content is mapping or message.content is iterable %} + {{- message.content | tojson }} + {%- else %} + {{- message.content }} + {%- endif %} + {{- "<|eot_id|>" }} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|start_header_id|>assistant<|end_header_id|>\n\n' }} 
+{%- endif %} \ No newline at end of file diff --git a/config.json b/config.json new file mode 100644 index 0000000..06df27b --- /dev/null +++ b/config.json @@ -0,0 +1,36 @@ +{ + "architectures": [ + "LlamaForCausalLM" + ], + "attention_bias": false, + "attention_dropout": 0.0, + "bos_token_id": 128000, + "dtype": "bfloat16", + "eos_token_id": 128009, + "head_dim": 128, + "hidden_act": "silu", + "hidden_size": 4096, + "initializer_range": 0.02, + "intermediate_size": 14336, + "max_position_embeddings": 131072, + "mlp_bias": false, + "model_type": "llama", + "num_attention_heads": 32, + "num_hidden_layers": 32, + "num_key_value_heads": 8, + "pad_token_id": 128009, + "pretraining_tp": 1, + "rms_norm_eps": 1e-05, + "rope_scaling": { + "factor": 8.0, + "high_freq_factor": 4.0, + "low_freq_factor": 1.0, + "original_max_position_embeddings": 8192, + "rope_type": "llama3" + }, + "rope_theta": 500000.0, + "tie_word_embeddings": false, + "transformers_version": "4.57.6", + "use_cache": true, + "vocab_size": 128256 +} diff --git a/generation_config.json b/generation_config.json new file mode 100644 index 0000000..50f6077 --- /dev/null +++ b/generation_config.json @@ -0,0 +1,9 @@ +{ + "bos_token_id": 128000, + "do_sample": true, + "eos_token_id": 128009, + "pad_token_id": 128009, + "temperature": 0.6, + "top_p": 0.9, + "transformers_version": "4.57.6" +} diff --git a/model-00001-of-00004.safetensors b/model-00001-of-00004.safetensors new file mode 100644 index 0000000..9d48e39 --- /dev/null +++ b/model-00001-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1a6e28bd559748ccf91c290b5c1a1df8215b216a54b5008de5f5e2aa5ca9598 +size 4976698672 diff --git a/model-00002-of-00004.safetensors b/model-00002-of-00004.safetensors new file mode 100644 index 0000000..0f119a5 --- /dev/null +++ b/model-00002-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:12d00a8d92445a79c95c4fcce4031af884e27ed9da7278530ff5382969e05e68 +size 4999802720 diff --git a/model-00003-of-00004.safetensors b/model-00003-of-00004.safetensors new file mode 100644 index 0000000..cfa18b2 --- /dev/null +++ b/model-00003-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fae07bf2ea123a76a15e122c8dfc72bed77b852b21e03a22db19f960c58d98f6 +size 4915916176 diff --git a/model-00004-of-00004.safetensors b/model-00004-of-00004.safetensors new file mode 100644 index 0000000..0c344d6 --- /dev/null +++ b/model-00004-of-00004.safetensors @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e2fbf277a700ecd3ea58cd99ad308f18b7eedb4ed7dae04626b45200607af9c +size 1168138808 diff --git a/model.safetensors.index.json b/model.safetensors.index.json new file mode 100644 index 0000000..5c64f1e --- /dev/null +++ b/model.safetensors.index.json @@ -0,0 +1,299 @@ +{ + "metadata": { + "total_parameters": 8030261248, + "total_size": 16060522496 + }, + "weight_map": { + "lm_head.weight": "model-00004-of-00004.safetensors", + "model.embed_tokens.weight": "model-00001-of-00004.safetensors", + "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.down_proj.weight": 
"model-00001-of-00004.safetensors", + "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.k_proj.weight": 
"model-00002-of-00004.safetensors", + "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.2.input_layernorm.weight": 
"model-00001-of-00004.safetensors", + "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.post_attention_layernorm.weight": 
"model-00003-of-00004.safetensors", + "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.29.self_attn.v_proj.weight": 
"model-00003-of-00004.safetensors", + "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors", + "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors", + "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors", + 
"model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors", + "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + 
"model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors", + "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors", + 
"model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors", + "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors", + "model.norm.weight": "model-00004-of-00004.safetensors" + } +} diff --git a/special_tokens_map.json b/special_tokens_map.json new file mode 100644 index 0000000..9d4773c --- /dev/null +++ b/special_tokens_map.json @@ -0,0 +1,11 @@ +{ + "bos_token": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false + }, + "eos_token": "<|eot_id|>", + "pad_token": "<|eot_id|>" +} diff --git a/tokenizer.json b/tokenizer.json new file mode 100644 index 0000000..1c1d8d5 --- /dev/null +++ b/tokenizer.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9e4e7fb171f92fd137b777cc2714bf87d11576700a1dcd7a399e7bbe39537b +size 17209920 diff --git a/tokenizer_config.json b/tokenizer_config.json new file mode 100644 index 0000000..3beeacc --- /dev/null +++ b/tokenizer_config.json @@ -0,0 +1,2063 @@ +{ + "added_tokens_decoder": { + "128000": { + "content": "<|begin_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128001": { + "content": "<|end_of_text|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128002": { + "content": "<|reserved_special_token_0|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128003": { + "content": 
"<|reserved_special_token_1|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128004": { + "content": "<|finetune_right_pad_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128005": { + "content": "<|reserved_special_token_2|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128006": { + "content": "<|start_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128007": { + "content": "<|end_header_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128008": { + "content": "<|eom_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128009": { + "content": "<|eot_id|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128010": { + "content": "<|python_tag|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128011": { + "content": "<|reserved_special_token_3|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128012": { + "content": "<|reserved_special_token_4|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128013": { + "content": "<|reserved_special_token_5|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128014": { + "content": "<|reserved_special_token_6|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128015": { + "content": "<|reserved_special_token_7|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128016": { + "content": "<|reserved_special_token_8|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128017": { + "content": "<|reserved_special_token_9|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128018": { + "content": "<|reserved_special_token_10|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128019": { + "content": "<|reserved_special_token_11|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128020": { + "content": "<|reserved_special_token_12|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128021": { + "content": "<|reserved_special_token_13|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128022": { + "content": "<|reserved_special_token_14|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128023": { + "content": "<|reserved_special_token_15|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128024": { + "content": "<|reserved_special_token_16|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128025": { + "content": "<|reserved_special_token_17|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128026": { + "content": "<|reserved_special_token_18|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128027": { + "content": "<|reserved_special_token_19|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128028": { + "content": "<|reserved_special_token_20|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128029": { + "content": "<|reserved_special_token_21|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128030": { + "content": "<|reserved_special_token_22|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128031": { + "content": "<|reserved_special_token_23|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128032": { + "content": "<|reserved_special_token_24|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128033": { + "content": "<|reserved_special_token_25|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128034": { + "content": "<|reserved_special_token_26|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128035": { + "content": "<|reserved_special_token_27|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128036": { + "content": "<|reserved_special_token_28|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128037": { + "content": "<|reserved_special_token_29|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128038": { + "content": "<|reserved_special_token_30|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128039": { + "content": "<|reserved_special_token_31|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128040": { + "content": "<|reserved_special_token_32|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128041": { + "content": "<|reserved_special_token_33|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128042": { + "content": "<|reserved_special_token_34|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128043": { + "content": "<|reserved_special_token_35|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128044": { + "content": "<|reserved_special_token_36|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128045": { + "content": "<|reserved_special_token_37|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128046": { + "content": "<|reserved_special_token_38|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128047": { + "content": "<|reserved_special_token_39|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128048": { + "content": "<|reserved_special_token_40|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128049": { + "content": "<|reserved_special_token_41|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128050": { + "content": "<|reserved_special_token_42|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128051": { + "content": 
"<|reserved_special_token_43|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128052": { + "content": "<|reserved_special_token_44|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128053": { + "content": "<|reserved_special_token_45|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128054": { + "content": "<|reserved_special_token_46|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128055": { + "content": "<|reserved_special_token_47|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128056": { + "content": "<|reserved_special_token_48|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128057": { + "content": "<|reserved_special_token_49|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128058": { + "content": "<|reserved_special_token_50|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128059": { + "content": "<|reserved_special_token_51|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128060": { + "content": "<|reserved_special_token_52|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128061": { + "content": "<|reserved_special_token_53|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128062": { + "content": "<|reserved_special_token_54|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + 
"128063": { + "content": "<|reserved_special_token_55|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128064": { + "content": "<|reserved_special_token_56|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128065": { + "content": "<|reserved_special_token_57|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128066": { + "content": "<|reserved_special_token_58|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128067": { + "content": "<|reserved_special_token_59|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128068": { + "content": "<|reserved_special_token_60|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128069": { + "content": "<|reserved_special_token_61|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128070": { + "content": "<|reserved_special_token_62|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128071": { + "content": "<|reserved_special_token_63|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128072": { + "content": "<|reserved_special_token_64|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128073": { + "content": "<|reserved_special_token_65|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128074": { + "content": "<|reserved_special_token_66|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + 
"special": true + }, + "128075": { + "content": "<|reserved_special_token_67|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128076": { + "content": "<|reserved_special_token_68|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128077": { + "content": "<|reserved_special_token_69|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128078": { + "content": "<|reserved_special_token_70|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128079": { + "content": "<|reserved_special_token_71|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128080": { + "content": "<|reserved_special_token_72|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128081": { + "content": "<|reserved_special_token_73|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128082": { + "content": "<|reserved_special_token_74|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128083": { + "content": "<|reserved_special_token_75|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128084": { + "content": "<|reserved_special_token_76|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128085": { + "content": "<|reserved_special_token_77|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128086": { + "content": "<|reserved_special_token_78|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128087": { + "content": "<|reserved_special_token_79|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128088": { + "content": "<|reserved_special_token_80|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128089": { + "content": "<|reserved_special_token_81|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128090": { + "content": "<|reserved_special_token_82|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128091": { + "content": "<|reserved_special_token_83|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128092": { + "content": "<|reserved_special_token_84|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128093": { + "content": "<|reserved_special_token_85|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128094": { + "content": "<|reserved_special_token_86|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128095": { + "content": "<|reserved_special_token_87|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128096": { + "content": "<|reserved_special_token_88|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128097": { + "content": "<|reserved_special_token_89|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128098": { + "content": "<|reserved_special_token_90|>", + "lstrip": false, + "normalized": false, + 
"rstrip": false, + "single_word": false, + "special": true + }, + "128099": { + "content": "<|reserved_special_token_91|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128100": { + "content": "<|reserved_special_token_92|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128101": { + "content": "<|reserved_special_token_93|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128102": { + "content": "<|reserved_special_token_94|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128103": { + "content": "<|reserved_special_token_95|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128104": { + "content": "<|reserved_special_token_96|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128105": { + "content": "<|reserved_special_token_97|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128106": { + "content": "<|reserved_special_token_98|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128107": { + "content": "<|reserved_special_token_99|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128108": { + "content": "<|reserved_special_token_100|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128109": { + "content": "<|reserved_special_token_101|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128110": { + "content": "<|reserved_special_token_102|>", + "lstrip": false, + 
"normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128111": { + "content": "<|reserved_special_token_103|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128112": { + "content": "<|reserved_special_token_104|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128113": { + "content": "<|reserved_special_token_105|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128114": { + "content": "<|reserved_special_token_106|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128115": { + "content": "<|reserved_special_token_107|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128116": { + "content": "<|reserved_special_token_108|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128117": { + "content": "<|reserved_special_token_109|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128118": { + "content": "<|reserved_special_token_110|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128119": { + "content": "<|reserved_special_token_111|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128120": { + "content": "<|reserved_special_token_112|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128121": { + "content": "<|reserved_special_token_113|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128122": { + "content": 
"<|reserved_special_token_114|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128123": { + "content": "<|reserved_special_token_115|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128124": { + "content": "<|reserved_special_token_116|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128125": { + "content": "<|reserved_special_token_117|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128126": { + "content": "<|reserved_special_token_118|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128127": { + "content": "<|reserved_special_token_119|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128128": { + "content": "<|reserved_special_token_120|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128129": { + "content": "<|reserved_special_token_121|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128130": { + "content": "<|reserved_special_token_122|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128131": { + "content": "<|reserved_special_token_123|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128132": { + "content": "<|reserved_special_token_124|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128133": { + "content": "<|reserved_special_token_125|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128134": { + "content": "<|reserved_special_token_126|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128135": { + "content": "<|reserved_special_token_127|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128136": { + "content": "<|reserved_special_token_128|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128137": { + "content": "<|reserved_special_token_129|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128138": { + "content": "<|reserved_special_token_130|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128139": { + "content": "<|reserved_special_token_131|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128140": { + "content": "<|reserved_special_token_132|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128141": { + "content": "<|reserved_special_token_133|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128142": { + "content": "<|reserved_special_token_134|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128143": { + "content": "<|reserved_special_token_135|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128144": { + "content": "<|reserved_special_token_136|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128145": { + "content": "<|reserved_special_token_137|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128146": { + "content": "<|reserved_special_token_138|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128147": { + "content": "<|reserved_special_token_139|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128148": { + "content": "<|reserved_special_token_140|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128149": { + "content": "<|reserved_special_token_141|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128150": { + "content": "<|reserved_special_token_142|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128151": { + "content": "<|reserved_special_token_143|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128152": { + "content": "<|reserved_special_token_144|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128153": { + "content": "<|reserved_special_token_145|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128154": { + "content": "<|reserved_special_token_146|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128155": { + "content": "<|reserved_special_token_147|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128156": { + "content": "<|reserved_special_token_148|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128157": { + "content": "<|reserved_special_token_149|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128158": { + "content": "<|reserved_special_token_150|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128159": { + "content": "<|reserved_special_token_151|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128160": { + "content": "<|reserved_special_token_152|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128161": { + "content": "<|reserved_special_token_153|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128162": { + "content": "<|reserved_special_token_154|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128163": { + "content": "<|reserved_special_token_155|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128164": { + "content": "<|reserved_special_token_156|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128165": { + "content": "<|reserved_special_token_157|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128166": { + "content": "<|reserved_special_token_158|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128167": { + "content": "<|reserved_special_token_159|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128168": { + "content": "<|reserved_special_token_160|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128169": { + "content": "<|reserved_special_token_161|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128170": { + "content": "<|reserved_special_token_162|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128171": { + "content": "<|reserved_special_token_163|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128172": { + "content": "<|reserved_special_token_164|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128173": { + "content": "<|reserved_special_token_165|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128174": { + "content": "<|reserved_special_token_166|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128175": { + "content": "<|reserved_special_token_167|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128176": { + "content": "<|reserved_special_token_168|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128177": { + "content": "<|reserved_special_token_169|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128178": { + "content": "<|reserved_special_token_170|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128179": { + "content": "<|reserved_special_token_171|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128180": { + "content": "<|reserved_special_token_172|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128181": { + "content": 
"<|reserved_special_token_173|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128182": { + "content": "<|reserved_special_token_174|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128183": { + "content": "<|reserved_special_token_175|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128184": { + "content": "<|reserved_special_token_176|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128185": { + "content": "<|reserved_special_token_177|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128186": { + "content": "<|reserved_special_token_178|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128187": { + "content": "<|reserved_special_token_179|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128188": { + "content": "<|reserved_special_token_180|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128189": { + "content": "<|reserved_special_token_181|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128190": { + "content": "<|reserved_special_token_182|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128191": { + "content": "<|reserved_special_token_183|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128192": { + "content": "<|reserved_special_token_184|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128193": { + "content": "<|reserved_special_token_185|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128194": { + "content": "<|reserved_special_token_186|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128195": { + "content": "<|reserved_special_token_187|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128196": { + "content": "<|reserved_special_token_188|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128197": { + "content": "<|reserved_special_token_189|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128198": { + "content": "<|reserved_special_token_190|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128199": { + "content": "<|reserved_special_token_191|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128200": { + "content": "<|reserved_special_token_192|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128201": { + "content": "<|reserved_special_token_193|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128202": { + "content": "<|reserved_special_token_194|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128203": { + "content": "<|reserved_special_token_195|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128204": { + "content": "<|reserved_special_token_196|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + 
"single_word": false, + "special": true + }, + "128205": { + "content": "<|reserved_special_token_197|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128206": { + "content": "<|reserved_special_token_198|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128207": { + "content": "<|reserved_special_token_199|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128208": { + "content": "<|reserved_special_token_200|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128209": { + "content": "<|reserved_special_token_201|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128210": { + "content": "<|reserved_special_token_202|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128211": { + "content": "<|reserved_special_token_203|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128212": { + "content": "<|reserved_special_token_204|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128213": { + "content": "<|reserved_special_token_205|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128214": { + "content": "<|reserved_special_token_206|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128215": { + "content": "<|reserved_special_token_207|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128216": { + "content": "<|reserved_special_token_208|>", + "lstrip": false, + "normalized": 
false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128217": { + "content": "<|reserved_special_token_209|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128218": { + "content": "<|reserved_special_token_210|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128219": { + "content": "<|reserved_special_token_211|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128220": { + "content": "<|reserved_special_token_212|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128221": { + "content": "<|reserved_special_token_213|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128222": { + "content": "<|reserved_special_token_214|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128223": { + "content": "<|reserved_special_token_215|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128224": { + "content": "<|reserved_special_token_216|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128225": { + "content": "<|reserved_special_token_217|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128226": { + "content": "<|reserved_special_token_218|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128227": { + "content": "<|reserved_special_token_219|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128228": { + "content": "<|reserved_special_token_220|>", + 
"lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128229": { + "content": "<|reserved_special_token_221|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128230": { + "content": "<|reserved_special_token_222|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128231": { + "content": "<|reserved_special_token_223|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128232": { + "content": "<|reserved_special_token_224|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128233": { + "content": "<|reserved_special_token_225|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128234": { + "content": "<|reserved_special_token_226|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128235": { + "content": "<|reserved_special_token_227|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128236": { + "content": "<|reserved_special_token_228|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128237": { + "content": "<|reserved_special_token_229|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128238": { + "content": "<|reserved_special_token_230|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128239": { + "content": "<|reserved_special_token_231|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128240": { + "content": 
"<|reserved_special_token_232|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128241": { + "content": "<|reserved_special_token_233|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128242": { + "content": "<|reserved_special_token_234|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128243": { + "content": "<|reserved_special_token_235|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128244": { + "content": "<|reserved_special_token_236|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128245": { + "content": "<|reserved_special_token_237|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128246": { + "content": "<|reserved_special_token_238|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128247": { + "content": "<|reserved_special_token_239|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128248": { + "content": "<|reserved_special_token_240|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128249": { + "content": "<|reserved_special_token_241|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128250": { + "content": "<|reserved_special_token_242|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128251": { + "content": "<|reserved_special_token_243|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + 
}, + "128252": { + "content": "<|reserved_special_token_244|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128253": { + "content": "<|reserved_special_token_245|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128254": { + "content": "<|reserved_special_token_246|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + }, + "128255": { + "content": "<|reserved_special_token_247|>", + "lstrip": false, + "normalized": false, + "rstrip": false, + "single_word": false, + "special": true + } + }, + "bos_token": "<|begin_of_text|>", + "clean_up_tokenization_spaces": true, + "eos_token": "<|eot_id|>", + "extra_special_tokens": {}, + "model_input_names": [ + "input_ids", + "attention_mask" + ], + "model_max_length": 131072, + "pad_token": "<|eot_id|>", + "tokenizer_class": "PreTrainedTokenizerFast" +} diff --git a/train_results.json b/train_results.json new file mode 100644 index 0000000..30d02f3 --- /dev/null +++ b/train_results.json @@ -0,0 +1,8 @@ +{ + "total_flos": 5.62815163329864e+19, + "train_loss": 0.0, + "train_runtime": 852.4232, + "train_samples": 125770, + "train_samples_per_second": 442.632, + "train_steps_per_second": 27.666 +} \ No newline at end of file diff --git a/trainer_state.json b/trainer_state.json new file mode 100644 index 0000000..7747533 --- /dev/null +++ b/trainer_state.json @@ -0,0 +1,283039 @@ +{ + "best_global_step": null, + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 3.0, + "eval_steps": 500, + "global_step": 23583, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.00012721027859051011, + "ewc_loss": 0.0, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 0.0, + "grad_norm": 4.835111141204834, + "learning_rate": 0.0, + "loss": 0.7982, + 
"mean_token_accuracy": 0.7762961387634277, + "num_tokens": 38493.0, + "step": 1 + }, + { + "epoch": 0.00025442055718102023, + "ewc_loss": 0.0, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 0.0, + "grad_norm": 4.588393211364746, + "learning_rate": 4.2390843577787196e-10, + "loss": 0.8329, + "mean_token_accuracy": 0.765798807144165, + "num_tokens": 80419.0, + "step": 2 + }, + { + "epoch": 0.0003816308357715303, + "ewc_loss": 2.0678296856931067e-14, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0339148851982007e-17, + "grad_norm": 4.7259440422058105, + "learning_rate": 8.478168715557439e-10, + "loss": 0.7225, + "mean_token_accuracy": 0.7960126996040344, + "num_tokens": 118717.0, + "step": 3 + }, + { + "epoch": 0.0005088411143620405, + "ewc_loss": 8.271776276089216e-13, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1358882396885614e-16, + "grad_norm": 5.365794658660889, + "learning_rate": 1.271725307333616e-09, + "loss": 0.8139, + "mean_token_accuracy": 0.7711150050163269, + "num_tokens": 150155.0, + "step": 4 + }, + { + "epoch": 0.0006360513929525506, + "ewc_loss": 6.2729764958857626e-12, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.136488253025079e-15, + "grad_norm": 4.307551860809326, + "learning_rate": 1.6956337431114878e-09, + "loss": 0.7919, + "mean_token_accuracy": 0.7746071815490723, + "num_tokens": 193616.0, + "step": 5 + }, + { + "epoch": 0.0007632616715430606, + "ewc_loss": 3.414657445688363e-11, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7073287363967085e-14, + "grad_norm": 5.210720062255859, + "learning_rate": 2.1195421788893596e-09, + "loss": 0.7894, + "mean_token_accuracy": 0.7784130573272705, + "num_tokens": 227640.0, + "step": 6 + }, + { + "epoch": 0.0008904719501335708, + "ewc_loss": 6.519375317370901e-11, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.259687712895559e-14, + "grad_norm": 4.8318772315979, + "learning_rate": 2.543450614667232e-09, + "loss": 0.816, + "mean_token_accuracy": 0.7749419212341309, + "num_tokens": 265114.0, 
+ "step": 7 + }, + { + "epoch": 0.001017682228724081, + "ewc_loss": 2.945293453571196e-10, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.472646738711822e-13, + "grad_norm": 4.940948009490967, + "learning_rate": 2.967359050445104e-09, + "loss": 0.7585, + "mean_token_accuracy": 0.7885839343070984, + "num_tokens": 299865.0, + "step": 8 + }, + { + "epoch": 0.001144892507314591, + "ewc_loss": 4.560396227315522e-10, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.280198057279248e-13, + "grad_norm": 4.572490215301514, + "learning_rate": 3.3912674862229757e-09, + "loss": 0.8138, + "mean_token_accuracy": 0.773703932762146, + "num_tokens": 342063.0, + "step": 9 + }, + { + "epoch": 0.0012721027859051012, + "ewc_loss": 8.475235513394352e-10, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.23761774368675e-13, + "grad_norm": 5.454471588134766, + "learning_rate": 3.815175922000847e-09, + "loss": 0.8642, + "mean_token_accuracy": 0.7641021609306335, + "num_tokens": 374864.0, + "step": 10 + }, + { + "epoch": 0.0013993130644956112, + "ewc_loss": 2.557431377780972e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2787157400648286e-12, + "grad_norm": 4.302767753601074, + "learning_rate": 4.239084357778719e-09, + "loss": 0.7764, + "mean_token_accuracy": 0.7753841280937195, + "num_tokens": 416605.0, + "step": 11 + }, + { + "epoch": 0.0015265233430861213, + "ewc_loss": 3.3536615706708517e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6768307662534676e-12, + "grad_norm": 5.457753658294678, + "learning_rate": 4.662992793556591e-09, + "loss": 0.8345, + "mean_token_accuracy": 0.7695369720458984, + "num_tokens": 448798.0, + "step": 12 + }, + { + "epoch": 0.0016537336216766315, + "ewc_loss": 4.20076506912892e-09, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.100382584871441e-12, + "grad_norm": 5.451271057128906, + "learning_rate": 5.086901229334464e-09, + "loss": 0.8737, + "mean_token_accuracy": 0.7533203959465027, + "num_tokens": 480084.0, + "step": 13 + }, + { + "epoch": 
0.0017809439002671415, + "ewc_loss": 1.2839114660323503e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.419557160158851e-12, + "grad_norm": 4.386342525482178, + "learning_rate": 5.510809665112336e-09, + "loss": 0.8058, + "mean_token_accuracy": 0.7791518568992615, + "num_tokens": 524543.0, + "step": 14 + }, + { + "epoch": 0.0019081541788576518, + "ewc_loss": 1.9428966879786458e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.714483557854425e-12, + "grad_norm": 4.78865909576416, + "learning_rate": 5.934718100890208e-09, + "loss": 0.7519, + "mean_token_accuracy": 0.7885907292366028, + "num_tokens": 563314.0, + "step": 15 + }, + { + "epoch": 0.002035364457448162, + "ewc_loss": 2.3057246778535045e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1528623541923189e-11, + "grad_norm": 5.076462268829346, + "learning_rate": 6.3586265366680796e-09, + "loss": 0.8425, + "mean_token_accuracy": 0.7622871398925781, + "num_tokens": 598421.0, + "step": 16 + }, + { + "epoch": 0.002162574736038672, + "ewc_loss": 2.7678215275273033e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3839107811108864e-11, + "grad_norm": 5.0403265953063965, + "learning_rate": 6.782534972445951e-09, + "loss": 0.8322, + "mean_token_accuracy": 0.7626286149024963, + "num_tokens": 634690.0, + "step": 17 + }, + { + "epoch": 0.002289785014629182, + "ewc_loss": 3.094926626090455e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.547463372719715e-11, + "grad_norm": 5.12692403793335, + "learning_rate": 7.206443408223823e-09, + "loss": 0.8688, + "mean_token_accuracy": 0.7597525119781494, + "num_tokens": 674653.0, + "step": 18 + }, + { + "epoch": 0.0024169952932196924, + "ewc_loss": 8.378476223924736e-08, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.189238192453537e-11, + "grad_norm": 5.059974670410156, + "learning_rate": 7.630351844001695e-09, + "loss": 0.8006, + "mean_token_accuracy": 0.7778322696685791, + "num_tokens": 708238.0, + "step": 19 + }, + { + "epoch": 0.0025442055718102024, + 
"ewc_loss": 1.257342461258304e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.286712123104721e-11, + "grad_norm": 4.612924098968506, + "learning_rate": 8.054260279779567e-09, + "loss": 0.7823, + "mean_token_accuracy": 0.7783422470092773, + "num_tokens": 749312.0, + "step": 20 + }, + { + "epoch": 0.0026714158504007124, + "ewc_loss": 1.481610638620623e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.408053337432108e-11, + "grad_norm": 5.426064968109131, + "learning_rate": 8.478168715557438e-09, + "loss": 0.8527, + "mean_token_accuracy": 0.76656174659729, + "num_tokens": 783532.0, + "step": 21 + }, + { + "epoch": 0.0027986261289912225, + "ewc_loss": 1.6189754603601614e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094877546049872e-11, + "grad_norm": 5.091753005981445, + "learning_rate": 8.902077151335311e-09, + "loss": 0.83, + "mean_token_accuracy": 0.7735406160354614, + "num_tokens": 817429.0, + "step": 22 + }, + { + "epoch": 0.0029258364075817325, + "ewc_loss": 1.818037418388485e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.090186936511202e-11, + "grad_norm": 4.821273326873779, + "learning_rate": 9.325985587113182e-09, + "loss": 0.7635, + "mean_token_accuracy": 0.7884499430656433, + "num_tokens": 853964.0, + "step": 23 + }, + { + "epoch": 0.0030530466861722425, + "ewc_loss": 1.978437040861536e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.89218498781419e-11, + "grad_norm": 5.4560980796813965, + "learning_rate": 9.749894022891054e-09, + "loss": 0.8277, + "mean_token_accuracy": 0.7668163776397705, + "num_tokens": 885070.0, + "step": 24 + }, + { + "epoch": 0.003180256964762753, + "ewc_loss": 2.2193621873611846e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1096810970112614e-10, + "grad_norm": 4.818403244018555, + "learning_rate": 1.0173802458668929e-08, + "loss": 0.8799, + "mean_token_accuracy": 0.7506303787231445, + "num_tokens": 926893.0, + "step": 25 + }, + { + "epoch": 0.003307467243353263, + "ewc_loss": 
5.413052122094086e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7065261143377484e-10, + "grad_norm": 4.8876633644104, + "learning_rate": 1.05977108944468e-08, + "loss": 0.8162, + "mean_token_accuracy": 0.7673000693321228, + "num_tokens": 964773.0, + "step": 26 + }, + { + "epoch": 0.003434677521943773, + "ewc_loss": 7.975836524565239e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9879183288960007e-10, + "grad_norm": 4.753345489501953, + "learning_rate": 1.1021619330224672e-08, + "loss": 0.7691, + "mean_token_accuracy": 0.7851336598396301, + "num_tokens": 1002725.0, + "step": 27 + }, + { + "epoch": 0.003561887800534283, + "ewc_loss": 9.225842063642631e-07, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.612921156166294e-10, + "grad_norm": 4.988358020782471, + "learning_rate": 1.1445527766002543e-08, + "loss": 0.8731, + "mean_token_accuracy": 0.7570006251335144, + "num_tokens": 1040296.0, + "step": 28 + }, + { + "epoch": 0.003689098079124793, + "ewc_loss": 1.0009927109422279e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.004963665733442e-10, + "grad_norm": 4.444891452789307, + "learning_rate": 1.1869436201780416e-08, + "loss": 0.7655, + "mean_token_accuracy": 0.7850395441055298, + "num_tokens": 1081711.0, + "step": 29 + }, + { + "epoch": 0.0038163083577153036, + "ewc_loss": 1.0557930636423407e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.278965597987906e-10, + "grad_norm": 4.818784236907959, + "learning_rate": 1.2293344637558287e-08, + "loss": 0.812, + "mean_token_accuracy": 0.7752916812896729, + "num_tokens": 1120556.0, + "step": 30 + }, + { + "epoch": 0.003943518636305814, + "ewc_loss": 1.117682700169098e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.588413620749577e-10, + "grad_norm": 4.781866550445557, + "learning_rate": 1.2717253073336159e-08, + "loss": 0.7639, + "mean_token_accuracy": 0.7898972034454346, + "num_tokens": 1157723.0, + "step": 31 + }, + { + "epoch": 0.004070728914896324, + "ewc_loss": 1.2113181355744018e-06, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.056590784453419e-10, + "grad_norm": 4.6953935623168945, + "learning_rate": 1.314116150911403e-08, + "loss": 0.8101, + "mean_token_accuracy": 0.7737487554550171, + "num_tokens": 1197879.0, + "step": 32 + }, + { + "epoch": 0.004197939193486834, + "ewc_loss": 1.2875572110715439e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.437785859958467e-10, + "grad_norm": 4.747684001922607, + "learning_rate": 1.3565069944891903e-08, + "loss": 0.8106, + "mean_token_accuracy": 0.7750945687294006, + "num_tokens": 1237342.0, + "step": 33 + }, + { + "epoch": 0.004325149472077344, + "ewc_loss": 1.3824010238749906e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.912004857362319e-10, + "grad_norm": 4.502740383148193, + "learning_rate": 1.3988978380669775e-08, + "loss": 0.7823, + "mean_token_accuracy": 0.7822189331054688, + "num_tokens": 1280197.0, + "step": 34 + }, + { + "epoch": 0.004452359750667854, + "ewc_loss": 1.7153181488538394e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.576590548869945e-10, + "grad_norm": 4.671781539916992, + "learning_rate": 1.4412886816447646e-08, + "loss": 0.7642, + "mean_token_accuracy": 0.7849018573760986, + "num_tokens": 1318625.0, + "step": 35 + }, + { + "epoch": 0.004579570029258364, + "ewc_loss": 3.2522027595405234e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6261013646712286e-09, + "grad_norm": 4.854300022125244, + "learning_rate": 1.4836795252225519e-08, + "loss": 0.8219, + "mean_token_accuracy": 0.7728296518325806, + "num_tokens": 1356868.0, + "step": 36 + }, + { + "epoch": 0.004706780307848874, + "ewc_loss": 4.497211193665862e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2486055240023006e-09, + "grad_norm": 4.886027812957764, + "learning_rate": 1.526070368800339e-08, + "loss": 0.7937, + "mean_token_accuracy": 0.7813162803649902, + "num_tokens": 1394696.0, + "step": 37 + }, + { + "epoch": 0.004833990586439385, + "ewc_loss": 5.240749942458933e-06, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 2.6203750280728855e-09, + "grad_norm": 4.4254584312438965, + "learning_rate": 1.5684612123781262e-08, + "loss": 0.7826, + "mean_token_accuracy": 0.7823439240455627, + "num_tokens": 1438738.0, + "step": 38 + }, + { + "epoch": 0.004961200865029895, + "ewc_loss": 5.710315690521384e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.855157887893256e-09, + "grad_norm": 4.809905529022217, + "learning_rate": 1.6108520559559135e-08, + "loss": 0.7484, + "mean_token_accuracy": 0.7921462059020996, + "num_tokens": 1475089.0, + "step": 39 + }, + { + "epoch": 0.005088411143620405, + "ewc_loss": 6.111950824561063e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.055975472676664e-09, + "grad_norm": 4.78151798248291, + "learning_rate": 1.6532428995337004e-08, + "loss": 0.8086, + "mean_token_accuracy": 0.7743784785270691, + "num_tokens": 1514566.0, + "step": 40 + }, + { + "epoch": 0.005215621422210915, + "ewc_loss": 6.4440423557243776e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2220210943734173e-09, + "grad_norm": 4.988718509674072, + "learning_rate": 1.6956337431114877e-08, + "loss": 0.8233, + "mean_token_accuracy": 0.7731878757476807, + "num_tokens": 1552560.0, + "step": 41 + }, + { + "epoch": 0.005342831700801425, + "ewc_loss": 6.688650501018856e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.344325261167569e-09, + "grad_norm": 5.419968605041504, + "learning_rate": 1.738024586689275e-08, + "loss": 0.8006, + "mean_token_accuracy": 0.777315616607666, + "num_tokens": 1584759.0, + "step": 42 + }, + { + "epoch": 0.005470041979391935, + "ewc_loss": 6.887244580866536e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4436222762224133e-09, + "grad_norm": 4.881171226501465, + "learning_rate": 1.7804154302670622e-08, + "loss": 0.802, + "mean_token_accuracy": 0.7751519680023193, + "num_tokens": 1621825.0, + "step": 43 + }, + { + "epoch": 0.005597252257982445, + "ewc_loss": 7.175601240305696e-06, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 3.5878007231815445e-09, + "grad_norm": 4.8338494300842285, + "learning_rate": 1.8228062738448494e-08, + "loss": 0.8276, + "mean_token_accuracy": 0.7677674293518066, + "num_tokens": 1662946.0, + "step": 44 + }, + { + "epoch": 0.005724462536572955, + "ewc_loss": 7.4458530434640124e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.722926411597882e-09, + "grad_norm": 4.703912258148193, + "learning_rate": 1.8651971174226364e-08, + "loss": 0.8153, + "mean_token_accuracy": 0.7729322910308838, + "num_tokens": 1699433.0, + "step": 45 + }, + { + "epoch": 0.005851672815163465, + "ewc_loss": 7.682784598728176e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.841392093306695e-09, + "grad_norm": 4.389205455780029, + "learning_rate": 1.9075879610004236e-08, + "loss": 0.7465, + "mean_token_accuracy": 0.7884572148323059, + "num_tokens": 1742812.0, + "step": 46 + }, + { + "epoch": 0.005978883093753975, + "ewc_loss": 7.953307431307621e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9766536730212465e-09, + "grad_norm": 4.89954948425293, + "learning_rate": 1.949978804578211e-08, + "loss": 0.8017, + "mean_token_accuracy": 0.7784188985824585, + "num_tokens": 1778725.0, + "step": 47 + }, + { + "epoch": 0.006106093372344485, + "ewc_loss": 8.605506081948988e-06, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.302753264795456e-09, + "grad_norm": 4.865689754486084, + "learning_rate": 1.9923696481559985e-08, + "loss": 0.8479, + "mean_token_accuracy": 0.7620370388031006, + "num_tokens": 1816592.0, + "step": 48 + }, + { + "epoch": 0.006233303650934996, + "ewc_loss": 1.0697298421291634e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.348649079195411e-09, + "grad_norm": 4.470602512359619, + "learning_rate": 2.0347604917337857e-08, + "loss": 0.7398, + "mean_token_accuracy": 0.7899926900863647, + "num_tokens": 1859907.0, + "step": 49 + }, + { + "epoch": 0.006360513929525506, + "ewc_loss": 1.6880450857570395e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.440225407468915e-09, + "grad_norm": 4.787687301635742, + "learning_rate": 2.0771513353115727e-08, + "loss": 0.7295, + "mean_token_accuracy": 0.7996907234191895, + "num_tokens": 1896627.0, + "step": 50 + }, + { + "epoch": 0.006487724208116016, + "ewc_loss": 2.2946403987589292e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1473201944056655e-08, + "grad_norm": 5.357914447784424, + "learning_rate": 2.11954217888936e-08, + "loss": 0.8445, + "mean_token_accuracy": 0.7668433785438538, + "num_tokens": 1934041.0, + "step": 51 + }, + { + "epoch": 0.006614934486706526, + "ewc_loss": 2.7276308173895814e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3638153717465684e-08, + "grad_norm": 4.695926666259766, + "learning_rate": 2.1619330224671472e-08, + "loss": 0.7738, + "mean_token_accuracy": 0.7846387624740601, + "num_tokens": 1976482.0, + "step": 52 + }, + { + "epoch": 0.006742144765297036, + "ewc_loss": 3.003784513566643e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5018922994158856e-08, + "grad_norm": 5.398840427398682, + "learning_rate": 2.2043238660449344e-08, + "loss": 0.8031, + "mean_token_accuracy": 0.771857500076294, + "num_tokens": 2009224.0, + "step": 53 + }, + { + "epoch": 0.006869355043887546, + "ewc_loss": 3.208108319086023e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.604054133963473e-08, + "grad_norm": 5.018485069274902, + "learning_rate": 2.2467147096227214e-08, + "loss": 0.8476, + "mean_token_accuracy": 0.7621122598648071, + "num_tokens": 2049235.0, + "step": 54 + }, + { + "epoch": 0.006996565322478056, + "ewc_loss": 3.344025753904134e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.672012928111144e-08, + "grad_norm": 4.731117248535156, + "learning_rate": 2.2891055532005086e-08, + "loss": 0.8603, + "mean_token_accuracy": 0.7615601420402527, + "num_tokens": 2090260.0, + "step": 55 + }, + { + "epoch": 0.007123775601068566, + "ewc_loss": 3.434641257626936e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7173206856568868e-08, 
+ "grad_norm": 5.126198768615723, + "learning_rate": 2.331496396778296e-08, + "loss": 0.8659, + "mean_token_accuracy": 0.7603456377983093, + "num_tokens": 2126686.0, + "step": 56 + }, + { + "epoch": 0.007250985879659076, + "ewc_loss": 3.490435119601898e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.745217481641248e-08, + "grad_norm": 4.5326690673828125, + "learning_rate": 2.373887240356083e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7851545810699463, + "num_tokens": 2171355.0, + "step": 57 + }, + { + "epoch": 0.007378196158249586, + "ewc_loss": 3.521793769323267e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7608968505555822e-08, + "grad_norm": 4.74222993850708, + "learning_rate": 2.4162780839338704e-08, + "loss": 0.7408, + "mean_token_accuracy": 0.7923159003257751, + "num_tokens": 2211660.0, + "step": 58 + }, + { + "epoch": 0.007505406436840096, + "ewc_loss": 3.534569259500131e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7672846297500655e-08, + "grad_norm": 5.646154880523682, + "learning_rate": 2.4586689275116573e-08, + "loss": 0.8709, + "mean_token_accuracy": 0.7595604658126831, + "num_tokens": 2244411.0, + "step": 59 + }, + { + "epoch": 0.007632616715430607, + "ewc_loss": 3.535337600624189e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7676688557344278e-08, + "grad_norm": 4.500057220458984, + "learning_rate": 2.5010597710894446e-08, + "loss": 0.7974, + "mean_token_accuracy": 0.7711610794067383, + "num_tokens": 2285253.0, + "step": 60 + }, + { + "epoch": 0.007759826994021117, + "ewc_loss": 3.534814095473848e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7674070207363002e-08, + "grad_norm": 4.63735294342041, + "learning_rate": 2.5434506146672318e-08, + "loss": 0.7623, + "mean_token_accuracy": 0.7854529619216919, + "num_tokens": 2328577.0, + "step": 61 + }, + { + "epoch": 0.007887037272611627, + "ewc_loss": 3.568383544916287e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.784191816511793e-08, + "grad_norm": 
4.830782413482666, + "learning_rate": 2.585841458245019e-08, + "loss": 0.8424, + "mean_token_accuracy": 0.7697674036026001, + "num_tokens": 2366524.0, + "step": 62 + }, + { + "epoch": 0.008014247551202136, + "ewc_loss": 3.620553979999386e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.81027690615565e-08, + "grad_norm": 4.370166301727295, + "learning_rate": 2.628232301822806e-08, + "loss": 0.7556, + "mean_token_accuracy": 0.7896984815597534, + "num_tokens": 2408628.0, + "step": 63 + }, + { + "epoch": 0.008141457829792647, + "ewc_loss": 3.6748446291312575e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.837422303196945e-08, + "grad_norm": 4.428055763244629, + "learning_rate": 2.6706231454005933e-08, + "loss": 0.7952, + "mean_token_accuracy": 0.7758893370628357, + "num_tokens": 2451800.0, + "step": 64 + }, + { + "epoch": 0.008268668108383158, + "ewc_loss": 3.7557641917373985e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8778820276565966e-08, + "grad_norm": 5.802945613861084, + "learning_rate": 2.7130139889783805e-08, + "loss": 0.866, + "mean_token_accuracy": 0.7547186017036438, + "num_tokens": 2481448.0, + "step": 65 + }, + { + "epoch": 0.008395878386973667, + "ewc_loss": 3.972240665461868e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9861204236804042e-08, + "grad_norm": 4.447017192840576, + "learning_rate": 2.7554048325561678e-08, + "loss": 0.7928, + "mean_token_accuracy": 0.7756186723709106, + "num_tokens": 2526339.0, + "step": 66 + }, + { + "epoch": 0.008523088665564178, + "ewc_loss": 4.362030085758306e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.181015013036358e-08, + "grad_norm": 4.620953559875488, + "learning_rate": 2.797795676133955e-08, + "loss": 0.8179, + "mean_token_accuracy": 0.7701829671859741, + "num_tokens": 2570691.0, + "step": 67 + }, + { + "epoch": 0.008650298944154687, + "ewc_loss": 5.225126733421348e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.612563321235939e-08, + "grad_norm": 4.811750888824463, + 
"learning_rate": 2.840186519711742e-08, + "loss": 0.8045, + "mean_token_accuracy": 0.7750339508056641, + "num_tokens": 2609207.0, + "step": 68 + }, + { + "epoch": 0.008777509222745198, + "ewc_loss": 7.328209176193923e-05, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.664104752942876e-08, + "grad_norm": 5.205588340759277, + "learning_rate": 2.8825773632895292e-08, + "loss": 0.7929, + "mean_token_accuracy": 0.7776550054550171, + "num_tokens": 2645494.0, + "step": 69 + }, + { + "epoch": 0.008904719501335707, + "ewc_loss": 0.00010782559547806159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.391279600530652e-08, + "grad_norm": 4.892863750457764, + "learning_rate": 2.9249682068673165e-08, + "loss": 0.7502, + "mean_token_accuracy": 0.7921037077903748, + "num_tokens": 2686258.0, + "step": 70 + }, + { + "epoch": 0.009031929779926218, + "ewc_loss": 0.00013258808758109808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.629404225577673e-08, + "grad_norm": 5.468591690063477, + "learning_rate": 2.9673590504451037e-08, + "loss": 0.8049, + "mean_token_accuracy": 0.7780673503875732, + "num_tokens": 2720544.0, + "step": 71 + }, + { + "epoch": 0.009159140058516728, + "ewc_loss": 0.00014559754345100373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.279876967913879e-08, + "grad_norm": 5.2715888023376465, + "learning_rate": 3.0097498940228907e-08, + "loss": 0.7681, + "mean_token_accuracy": 0.7857587337493896, + "num_tokens": 2758068.0, + "step": 72 + }, + { + "epoch": 0.009286350337107238, + "ewc_loss": 0.00015166714729275554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583357586327111e-08, + "grad_norm": 5.452653408050537, + "learning_rate": 3.052140737600678e-08, + "loss": 0.8518, + "mean_token_accuracy": 0.7631019353866577, + "num_tokens": 2793342.0, + "step": 73 + }, + { + "epoch": 0.009413560615697748, + "ewc_loss": 0.0001549808803247288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749044073079858e-08, + "grad_norm": 6.2212724685668945, + "learning_rate": 
3.094531581178465e-08, + "loss": 0.889, + "mean_token_accuracy": 0.7555886507034302, + "num_tokens": 2828003.0, + "step": 74 + }, + { + "epoch": 0.009540770894288259, + "ewc_loss": 0.00015816176892258227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90808840633872e-08, + "grad_norm": 4.725070953369141, + "learning_rate": 3.1369224247562524e-08, + "loss": 0.7642, + "mean_token_accuracy": 0.7842235565185547, + "num_tokens": 2874755.0, + "step": 75 + }, + { + "epoch": 0.00966798117287877, + "ewc_loss": 0.00015816134691704065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.908067090056647e-08, + "grad_norm": 5.041355609893799, + "learning_rate": 3.17931326833404e-08, + "loss": 0.7862, + "mean_token_accuracy": 0.777023434638977, + "num_tokens": 2909703.0, + "step": 76 + }, + { + "epoch": 0.009795191451469279, + "ewc_loss": 0.0001578392111696303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.891960507322437e-08, + "grad_norm": 4.615080833435059, + "learning_rate": 3.221704111911827e-08, + "loss": 0.7164, + "mean_token_accuracy": 0.7989314198493958, + "num_tokens": 2954020.0, + "step": 77 + }, + { + "epoch": 0.00992240173005979, + "ewc_loss": 0.00015650350542273372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.825175174502874e-08, + "grad_norm": 5.594666481018066, + "learning_rate": 3.264094955489614e-08, + "loss": 0.8112, + "mean_token_accuracy": 0.7756859064102173, + "num_tokens": 2984894.0, + "step": 78 + }, + { + "epoch": 0.010049612008650299, + "ewc_loss": 0.00015668600099161267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.834299964315505e-08, + "grad_norm": 5.344277858734131, + "learning_rate": 3.306485799067401e-08, + "loss": 0.7186, + "mean_token_accuracy": 0.7955904006958008, + "num_tokens": 3017773.0, + "step": 79 + }, + { + "epoch": 0.01017682228724081, + "ewc_loss": 0.00015658636402804404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829318349195091e-08, + "grad_norm": 4.718110084533691, + "learning_rate": 3.348876642645188e-08, + "loss": 
0.718, + "mean_token_accuracy": 0.7979786992073059, + "num_tokens": 3059739.0, + "step": 80 + }, + { + "epoch": 0.010304032565831319, + "ewc_loss": 0.0001551485329400748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75742634573362e-08, + "grad_norm": 5.22985315322876, + "learning_rate": 3.391267486222975e-08, + "loss": 0.81, + "mean_token_accuracy": 0.7719138860702515, + "num_tokens": 3094910.0, + "step": 81 + }, + { + "epoch": 0.01043124284442183, + "ewc_loss": 0.00015381924458779395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.69096217823062e-08, + "grad_norm": 4.996803283691406, + "learning_rate": 3.4336583298007626e-08, + "loss": 0.8139, + "mean_token_accuracy": 0.7736663818359375, + "num_tokens": 3133884.0, + "step": 82 + }, + { + "epoch": 0.010558453123012339, + "ewc_loss": 0.00015192235878203064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.596118223318626e-08, + "grad_norm": 5.528860569000244, + "learning_rate": 3.47604917337855e-08, + "loss": 0.8466, + "mean_token_accuracy": 0.7610335350036621, + "num_tokens": 3166755.0, + "step": 83 + }, + { + "epoch": 0.01068566340160285, + "ewc_loss": 0.00015003071166574955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.501535748133392e-08, + "grad_norm": 5.137533187866211, + "learning_rate": 3.518440016956337e-08, + "loss": 0.8153, + "mean_token_accuracy": 0.7706539034843445, + "num_tokens": 3204825.0, + "step": 84 + }, + { + "epoch": 0.010812873680193359, + "ewc_loss": 0.00014765950618311763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.382975297787198e-08, + "grad_norm": 5.14057731628418, + "learning_rate": 3.5608308605341244e-08, + "loss": 0.851, + "mean_token_accuracy": 0.7629127502441406, + "num_tokens": 3244036.0, + "step": 85 + }, + { + "epoch": 0.01094008395878387, + "ewc_loss": 0.000146108926855959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.305446558802942e-08, + "grad_norm": 4.9245171546936035, + "learning_rate": 3.6032217041119116e-08, + "loss": 0.8381, + "mean_token_accuracy": 
0.7672082185745239, + "num_tokens": 3285969.0, + "step": 86 + }, + { + "epoch": 0.01106729423737438, + "ewc_loss": 0.00014508047024719417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254023159930512e-08, + "grad_norm": 4.881991386413574, + "learning_rate": 3.645612547689699e-08, + "loss": 0.8095, + "mean_token_accuracy": 0.7749202251434326, + "num_tokens": 3327648.0, + "step": 87 + }, + { + "epoch": 0.01119450451596489, + "ewc_loss": 0.00014487578300759196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.243789212907359e-08, + "grad_norm": 5.010996341705322, + "learning_rate": 3.6880033912674855e-08, + "loss": 0.8122, + "mean_token_accuracy": 0.7726508975028992, + "num_tokens": 3367399.0, + "step": 88 + }, + { + "epoch": 0.0113217147945554, + "ewc_loss": 0.0001452831638744101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.264158341513394e-08, + "grad_norm": 5.222996711730957, + "learning_rate": 3.730394234845273e-08, + "loss": 0.7956, + "mean_token_accuracy": 0.7802305221557617, + "num_tokens": 3405402.0, + "step": 89 + }, + { + "epoch": 0.01144892507314591, + "ewc_loss": 0.00014662495232187212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.331247786623862e-08, + "grad_norm": 5.31928014755249, + "learning_rate": 3.77278507842306e-08, + "loss": 0.8788, + "mean_token_accuracy": 0.7537456750869751, + "num_tokens": 3441791.0, + "step": 90 + }, + { + "epoch": 0.01157613535173642, + "ewc_loss": 0.00014885605196468532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.442802285595462e-08, + "grad_norm": 5.25593900680542, + "learning_rate": 3.815175922000847e-08, + "loss": 0.8384, + "mean_token_accuracy": 0.7692067623138428, + "num_tokens": 3480151.0, + "step": 91 + }, + { + "epoch": 0.01170334563032693, + "ewc_loss": 0.0001525337138446048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626685771811026e-08, + "grad_norm": 5.180698871612549, + "learning_rate": 3.8575667655786345e-08, + "loss": 0.8021, + "mean_token_accuracy": 0.7759397029876709, + "num_tokens": 
3516867.0, + "step": 92 + }, + { + "epoch": 0.01183055590891744, + "ewc_loss": 0.00015936109411995858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.968054660523194e-08, + "grad_norm": 4.812746047973633, + "learning_rate": 3.899957609156422e-08, + "loss": 0.7782, + "mean_token_accuracy": 0.7824602127075195, + "num_tokens": 3557466.0, + "step": 93 + }, + { + "epoch": 0.01195776618750795, + "ewc_loss": 0.00017097398813348264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.548699526045311e-08, + "grad_norm": 5.033514976501465, + "learning_rate": 3.94234845273421e-08, + "loss": 0.8075, + "mean_token_accuracy": 0.773303210735321, + "num_tokens": 3596009.0, + "step": 94 + }, + { + "epoch": 0.012084976466098461, + "ewc_loss": 0.00019274068472441286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.637034281695378e-08, + "grad_norm": 5.304544925689697, + "learning_rate": 3.984739296311997e-08, + "loss": 0.8001, + "mean_token_accuracy": 0.7748421430587769, + "num_tokens": 3631582.0, + "step": 95 + }, + { + "epoch": 0.01221218674468897, + "ewc_loss": 0.00023364051594398916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.168202601320445e-07, + "grad_norm": 5.319375038146973, + "learning_rate": 4.027130139889784e-08, + "loss": 0.8264, + "mean_token_accuracy": 0.7678883075714111, + "num_tokens": 3672855.0, + "step": 96 + }, + { + "epoch": 0.012339397023279481, + "ewc_loss": 0.00030852702911943197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5426351751557377e-07, + "grad_norm": 5.288572788238525, + "learning_rate": 4.0695209834675715e-08, + "loss": 0.8186, + "mean_token_accuracy": 0.7726658582687378, + "num_tokens": 3714100.0, + "step": 97 + }, + { + "epoch": 0.012466607301869992, + "ewc_loss": 0.0004246046009939164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1230229663160571e-07, + "grad_norm": 5.5618205070495605, + "learning_rate": 4.111911827045358e-08, + "loss": 0.7705, + "mean_token_accuracy": 0.7847996950149536, + "num_tokens": 3752834.0, + "step": 98 + 
}, + { + "epoch": 0.012593817580460501, + "ewc_loss": 0.0005458933301270008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.729466643813794e-07, + "grad_norm": 5.800991058349609, + "learning_rate": 4.154302670623145e-08, + "loss": 0.7815, + "mean_token_accuracy": 0.7812479734420776, + "num_tokens": 3791513.0, + "step": 99 + }, + { + "epoch": 0.012721027859051012, + "ewc_loss": 0.0006228438578546047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1142192824518133e-07, + "grad_norm": 6.3036651611328125, + "learning_rate": 4.1966935142009326e-08, + "loss": 0.8049, + "mean_token_accuracy": 0.7739890813827515, + "num_tokens": 3825085.0, + "step": 100 + }, + { + "epoch": 0.012848238137641521, + "ewc_loss": 0.0006558759487234056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.279379825471551e-07, + "grad_norm": 5.752474308013916, + "learning_rate": 4.23908435777872e-08, + "loss": 0.8042, + "mean_token_accuracy": 0.7761634588241577, + "num_tokens": 3866492.0, + "step": 101 + }, + { + "epoch": 0.012975448416232032, + "ewc_loss": 0.0006562311900779605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.281155898093857e-07, + "grad_norm": 5.998240947723389, + "learning_rate": 4.281475201356507e-08, + "loss": 0.8313, + "mean_token_accuracy": 0.7629861831665039, + "num_tokens": 3904053.0, + "step": 102 + }, + { + "epoch": 0.013102658694822541, + "ewc_loss": 0.0006508126971311867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.254063472013513e-07, + "grad_norm": 6.006903648376465, + "learning_rate": 4.3238660449342943e-08, + "loss": 0.74, + "mean_token_accuracy": 0.7944260835647583, + "num_tokens": 3935118.0, + "step": 103 + }, + { + "epoch": 0.013229868973413052, + "ewc_loss": 0.0006485642516054213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.242821264848317e-07, + "grad_norm": 5.58139181137085, + "learning_rate": 4.3662568885120816e-08, + "loss": 0.7895, + "mean_token_accuracy": 0.7761110067367554, + "num_tokens": 3977899.0, + "step": 104 + }, + { + "epoch": 
0.013357079252003561, + "ewc_loss": 0.0006471734959632158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.235867609419074e-07, + "grad_norm": 5.610540866851807, + "learning_rate": 4.408647732089869e-08, + "loss": 0.8095, + "mean_token_accuracy": 0.773040771484375, + "num_tokens": 4019416.0, + "step": 105 + }, + { + "epoch": 0.013484289530594072, + "ewc_loss": 0.000646617787424475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2330888188880635e-07, + "grad_norm": 5.565205097198486, + "learning_rate": 4.451038575667656e-08, + "loss": 0.7984, + "mean_token_accuracy": 0.7730852365493774, + "num_tokens": 4063887.0, + "step": 106 + }, + { + "epoch": 0.013611499809184581, + "ewc_loss": 0.0006431643269024789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.21582177775781e-07, + "grad_norm": 5.688173770904541, + "learning_rate": 4.493429419245443e-08, + "loss": 0.8299, + "mean_token_accuracy": 0.7674654722213745, + "num_tokens": 4101309.0, + "step": 107 + }, + { + "epoch": 0.013738710087775092, + "ewc_loss": 0.0006374709191732109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1873545935923175e-07, + "grad_norm": 5.774162769317627, + "learning_rate": 4.53582026282323e-08, + "loss": 0.8139, + "mean_token_accuracy": 0.7706881761550903, + "num_tokens": 4136669.0, + "step": 108 + }, + { + "epoch": 0.013865920366365603, + "ewc_loss": 0.000631371745839715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1568586678076827e-07, + "grad_norm": 5.398986339569092, + "learning_rate": 4.578211106401017e-08, + "loss": 0.7447, + "mean_token_accuracy": 0.7898049354553223, + "num_tokens": 4178583.0, + "step": 109 + }, + { + "epoch": 0.013993130644956112, + "ewc_loss": 0.000618415477219969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.092077349720057e-07, + "grad_norm": 5.381132125854492, + "learning_rate": 4.6206019499788045e-08, + "loss": 0.7751, + "mean_token_accuracy": 0.7769235372543335, + "num_tokens": 4219552.0, + "step": 110 + }, + { + "epoch": 0.014120340923546623, + 
"ewc_loss": 0.0006028703064657748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0143516482894483e-07, + "grad_norm": 5.3857316970825195, + "learning_rate": 4.662992793556592e-08, + "loss": 0.7679, + "mean_token_accuracy": 0.7855036854743958, + "num_tokens": 4264487.0, + "step": 111 + }, + { + "epoch": 0.014247551202137132, + "ewc_loss": 0.0005875825881958008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9379128818618483e-07, + "grad_norm": 5.4725751876831055, + "learning_rate": 4.705383637134379e-08, + "loss": 0.7345, + "mean_token_accuracy": 0.790213942527771, + "num_tokens": 4303015.0, + "step": 112 + }, + { + "epoch": 0.014374761480727643, + "ewc_loss": 0.0005740186315961182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8700932830361126e-07, + "grad_norm": 5.412577152252197, + "learning_rate": 4.747774480712166e-08, + "loss": 0.7888, + "mean_token_accuracy": 0.7775403261184692, + "num_tokens": 4345446.0, + "step": 113 + }, + { + "epoch": 0.014501971759318152, + "ewc_loss": 0.0005601425655186176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.800712763928459e-07, + "grad_norm": 5.494870662689209, + "learning_rate": 4.7901653242899535e-08, + "loss": 0.7416, + "mean_token_accuracy": 0.7911067008972168, + "num_tokens": 4383854.0, + "step": 114 + }, + { + "epoch": 0.014629182037908663, + "ewc_loss": 0.0005483757704496384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.74187897275624e-07, + "grad_norm": 5.551680088043213, + "learning_rate": 4.832556167867741e-08, + "loss": 0.8105, + "mean_token_accuracy": 0.7728090286254883, + "num_tokens": 4420750.0, + "step": 115 + }, + { + "epoch": 0.014756392316499172, + "ewc_loss": 0.0005372232408262789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.686116147287976e-07, + "grad_norm": 5.668928146362305, + "learning_rate": 4.8749470114455274e-08, + "loss": 0.8073, + "mean_token_accuracy": 0.7727006673812866, + "num_tokens": 4458761.0, + "step": 116 + }, + { + "epoch": 0.014883602595089683, + "ewc_loss": 
0.0005265817744657397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6329090019316936e-07, + "grad_norm": 5.473377227783203, + "learning_rate": 4.9173378550233146e-08, + "loss": 0.7675, + "mean_token_accuracy": 0.7807177901268005, + "num_tokens": 4496547.0, + "step": 117 + }, + { + "epoch": 0.015010812873680193, + "ewc_loss": 0.0005141801084391773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5709005058160983e-07, + "grad_norm": 5.508200168609619, + "learning_rate": 4.959728698601102e-08, + "loss": 0.7656, + "mean_token_accuracy": 0.789251446723938, + "num_tokens": 4533357.0, + "step": 118 + }, + { + "epoch": 0.015138023152270703, + "ewc_loss": 0.0005024926504120231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512463197490433e-07, + "grad_norm": 5.334718227386475, + "learning_rate": 5.002119542178889e-08, + "loss": 0.7399, + "mean_token_accuracy": 0.7908957004547119, + "num_tokens": 4573570.0, + "step": 119 + }, + { + "epoch": 0.015265233430861214, + "ewc_loss": 0.0004904319648630917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452159719723568e-07, + "grad_norm": 5.503737449645996, + "learning_rate": 5.0445103857566764e-08, + "loss": 0.7953, + "mean_token_accuracy": 0.7742894291877747, + "num_tokens": 4613195.0, + "step": 120 + }, + { + "epoch": 0.015392443709451724, + "ewc_loss": 0.000481808849144727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409044270734739e-07, + "grad_norm": 5.279105186462402, + "learning_rate": 5.0869012293344637e-08, + "loss": 0.716, + "mean_token_accuracy": 0.7978960871696472, + "num_tokens": 4657230.0, + "step": 121 + }, + { + "epoch": 0.015519653988042234, + "ewc_loss": 0.0004712633090093732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.356316599616548e-07, + "grad_norm": 5.379126071929932, + "learning_rate": 5.129292072912251e-08, + "loss": 0.7898, + "mean_token_accuracy": 0.780325174331665, + "num_tokens": 4701436.0, + "step": 122 + }, + { + "epoch": 0.015646864266632744, + "ewc_loss": 0.00046314403880387545, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3157201667345362e-07, + "grad_norm": 5.425112724304199, + "learning_rate": 5.171682916490038e-08, + "loss": 0.7774, + "mean_token_accuracy": 0.780068039894104, + "num_tokens": 4741806.0, + "step": 123 + }, + { + "epoch": 0.015774074545223254, + "ewc_loss": 0.00045682015479542315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2841007307761174e-07, + "grad_norm": 5.601383686065674, + "learning_rate": 5.2140737600678254e-08, + "loss": 0.8146, + "mean_token_accuracy": 0.7652926445007324, + "num_tokens": 4780671.0, + "step": 124 + }, + { + "epoch": 0.015901284823813765, + "ewc_loss": 0.00045270920963957906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.263546008407502e-07, + "grad_norm": 5.58080530166626, + "learning_rate": 5.256464603645612e-08, + "loss": 0.7634, + "mean_token_accuracy": 0.7860082387924194, + "num_tokens": 4816879.0, + "step": 125 + }, + { + "epoch": 0.016028495102404273, + "ewc_loss": 0.0004475752648431808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.237876373101244e-07, + "grad_norm": 5.473385810852051, + "learning_rate": 5.298855447223399e-08, + "loss": 0.8159, + "mean_token_accuracy": 0.7668724656105042, + "num_tokens": 4858704.0, + "step": 126 + }, + { + "epoch": 0.016155705380994784, + "ewc_loss": 0.0004431757843121886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2158789647619415e-07, + "grad_norm": 5.450045108795166, + "learning_rate": 5.3412462908011865e-08, + "loss": 0.7731, + "mean_token_accuracy": 0.782326340675354, + "num_tokens": 4896780.0, + "step": 127 + }, + { + "epoch": 0.016282915659585295, + "ewc_loss": 0.00044134820927865803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2067411009629723e-07, + "grad_norm": 5.648962497711182, + "learning_rate": 5.383637134378974e-08, + "loss": 0.803, + "mean_token_accuracy": 0.7719708681106567, + "num_tokens": 4932108.0, + "step": 128 + }, + { + "epoch": 0.016410125938175806, + "ewc_loss": 0.0004413727729115635, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 2.2068638827477116e-07, + "grad_norm": 5.405695915222168, + "learning_rate": 5.426027977956761e-08, + "loss": 0.779, + "mean_token_accuracy": 0.7817366123199463, + "num_tokens": 4970384.0, + "step": 129 + }, + { + "epoch": 0.016537336216766316, + "ewc_loss": 0.00044412954594008625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2206477012787218e-07, + "grad_norm": 5.671617031097412, + "learning_rate": 5.468418821534548e-08, + "loss": 0.7878, + "mean_token_accuracy": 0.7766879796981812, + "num_tokens": 5005996.0, + "step": 130 + }, + { + "epoch": 0.016664546495356824, + "ewc_loss": 0.000450467923656106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.252339612596188e-07, + "grad_norm": 5.508945941925049, + "learning_rate": 5.5108096651123356e-08, + "loss": 0.768, + "mean_token_accuracy": 0.7808712720870972, + "num_tokens": 5044208.0, + "step": 131 + }, + { + "epoch": 0.016791756773947335, + "ewc_loss": 0.0004608878225553781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3044391639359674e-07, + "grad_norm": 5.438352584838867, + "learning_rate": 5.553200508690123e-08, + "loss": 0.783, + "mean_token_accuracy": 0.7759688496589661, + "num_tokens": 5085173.0, + "step": 132 + }, + { + "epoch": 0.016918967052537846, + "ewc_loss": 0.0004759839503094554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3799196924301214e-07, + "grad_norm": 5.757871627807617, + "learning_rate": 5.59559135226791e-08, + "loss": 0.8006, + "mean_token_accuracy": 0.7732002139091492, + "num_tokens": 5118440.0, + "step": 133 + }, + { + "epoch": 0.017046177331128357, + "ewc_loss": 0.0005010722670704126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505361464955058e-07, + "grad_norm": 5.85996675491333, + "learning_rate": 5.637982195845697e-08, + "loss": 0.8052, + "mean_token_accuracy": 0.7737536430358887, + "num_tokens": 5155202.0, + "step": 134 + }, + { + "epoch": 0.017173387609718864, + "ewc_loss": 0.0005359504139050841, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.6797519581123197e-07, + "grad_norm": 5.697122573852539, + "learning_rate": 5.680373039423484e-08, + "loss": 0.7915, + "mean_token_accuracy": 0.7790801525115967, + "num_tokens": 5193240.0, + "step": 135 + }, + { + "epoch": 0.017300597888309375, + "ewc_loss": 0.0005826152628287673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9130762868589954e-07, + "grad_norm": 5.889211177825928, + "learning_rate": 5.722763883001271e-08, + "loss": 0.7804, + "mean_token_accuracy": 0.7799608707427979, + "num_tokens": 5230975.0, + "step": 136 + }, + { + "epoch": 0.017427808166899886, + "ewc_loss": 0.000645283202175051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2264159699479933e-07, + "grad_norm": 6.284303665161133, + "learning_rate": 5.7651547265790585e-08, + "loss": 0.8444, + "mean_token_accuracy": 0.7630558013916016, + "num_tokens": 5264786.0, + "step": 137 + }, + { + "epoch": 0.017555018445490397, + "ewc_loss": 0.0007287401240319014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.643700665634242e-07, + "grad_norm": 6.089518070220947, + "learning_rate": 5.807545570156846e-08, + "loss": 0.8073, + "mean_token_accuracy": 0.7726304531097412, + "num_tokens": 5301554.0, + "step": 138 + }, + { + "epoch": 0.017682228724080904, + "ewc_loss": 0.000829501950647682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1475098555565637e-07, + "grad_norm": 6.090585708618164, + "learning_rate": 5.849936413734633e-08, + "loss": 0.7474, + "mean_token_accuracy": 0.7885740995407104, + "num_tokens": 5337356.0, + "step": 139 + }, + { + "epoch": 0.017809439002671415, + "ewc_loss": 0.0009510786039754748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7553930926369503e-07, + "grad_norm": 6.222638130187988, + "learning_rate": 5.89232725731242e-08, + "loss": 0.8188, + "mean_token_accuracy": 0.7724970579147339, + "num_tokens": 5375767.0, + "step": 140 + }, + { + "epoch": 0.017936649281261926, + "ewc_loss": 0.0010869276011362672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.43463784197229e-07, + "grad_norm": 6.363585948944092, + "learning_rate": 5.9347181008902075e-08, + "loss": 0.7698, + "mean_token_accuracy": 0.7820302248001099, + "num_tokens": 5413683.0, + "step": 141 + }, + { + "epoch": 0.018063859559852437, + "ewc_loss": 0.0012216399190947413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.108199386289925e-07, + "grad_norm": 6.232363700866699, + "learning_rate": 5.977108944467995e-08, + "loss": 0.7501, + "mean_token_accuracy": 0.7863543033599854, + "num_tokens": 5457750.0, + "step": 142 + }, + { + "epoch": 0.018191069838442948, + "ewc_loss": 0.0013312050141394138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.65602499339002e-07, + "grad_norm": 6.426016330718994, + "learning_rate": 6.019499788045781e-08, + "loss": 0.7459, + "mean_token_accuracy": 0.7877301573753357, + "num_tokens": 5495743.0, + "step": 143 + }, + { + "epoch": 0.018318280117033455, + "ewc_loss": 0.0014244462363421917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.122231409084634e-07, + "grad_norm": 6.52103328704834, + "learning_rate": 6.061890631623569e-08, + "loss": 0.7592, + "mean_token_accuracy": 0.7802367210388184, + "num_tokens": 5532198.0, + "step": 144 + }, + { + "epoch": 0.018445490395623966, + "ewc_loss": 0.0015120564494282007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.560282142549113e-07, + "grad_norm": 6.666462421417236, + "learning_rate": 6.104281475201356e-08, + "loss": 0.7722, + "mean_token_accuracy": 0.7810252904891968, + "num_tokens": 5568977.0, + "step": 145 + }, + { + "epoch": 0.018572700674214477, + "ewc_loss": 0.001595314359292388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.976571509971109e-07, + "grad_norm": 6.66953706741333, + "learning_rate": 6.146672318779143e-08, + "loss": 0.7838, + "mean_token_accuracy": 0.7728227972984314, + "num_tokens": 5606229.0, + "step": 146 + }, + { + "epoch": 0.018699910952804988, + "ewc_loss": 0.00167158676777035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.35793400710827e-07, + 
"grad_norm": 6.567051887512207, + "learning_rate": 6.18906316235693e-08, + "loss": 0.7835, + "mean_token_accuracy": 0.7746384143829346, + "num_tokens": 5649828.0, + "step": 147 + }, + { + "epoch": 0.018827121231395495, + "ewc_loss": 0.001726049347780645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.630246952634479e-07, + "grad_norm": 6.69411039352417, + "learning_rate": 6.231454005934718e-08, + "loss": 0.7307, + "mean_token_accuracy": 0.7949666976928711, + "num_tokens": 5687433.0, + "step": 148 + }, + { + "epoch": 0.018954331509986006, + "ewc_loss": 0.0017701677279546857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.850838639773428e-07, + "grad_norm": 6.811391830444336, + "learning_rate": 6.273844849512505e-08, + "loss": 0.7614, + "mean_token_accuracy": 0.7814013957977295, + "num_tokens": 5723247.0, + "step": 149 + }, + { + "epoch": 0.019081541788576517, + "ewc_loss": 0.0018066199263557792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.033099672706157e-07, + "grad_norm": 6.787527084350586, + "learning_rate": 6.316235693090292e-08, + "loss": 0.7751, + "mean_token_accuracy": 0.7772716879844666, + "num_tokens": 5760305.0, + "step": 150 + }, + { + "epoch": 0.019208752067167028, + "ewc_loss": 0.0018268122803419828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.134061542681593e-07, + "grad_norm": 6.733615398406982, + "learning_rate": 6.35862653666808e-08, + "loss": 0.775, + "mean_token_accuracy": 0.7759841680526733, + "num_tokens": 5800586.0, + "step": 151 + }, + { + "epoch": 0.01933596234575754, + "ewc_loss": 0.0018294819165021181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.147409514298488e-07, + "grad_norm": 6.731024265289307, + "learning_rate": 6.401017380245867e-08, + "loss": 0.7661, + "mean_token_accuracy": 0.7782683968544006, + "num_tokens": 5840351.0, + "step": 152 + }, + { + "epoch": 0.019463172624348046, + "ewc_loss": 0.001820682198740542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.103411002797657e-07, + "grad_norm": 6.827009677886963, 
+ "learning_rate": 6.443408223823654e-08, + "loss": 0.8067, + "mean_token_accuracy": 0.7684017419815063, + "num_tokens": 5880162.0, + "step": 153 + }, + { + "epoch": 0.019590382902938557, + "ewc_loss": 0.0018090122612193227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.045061233337037e-07, + "grad_norm": 6.7971930503845215, + "learning_rate": 6.485799067401441e-08, + "loss": 0.862, + "mean_token_accuracy": 0.7535910606384277, + "num_tokens": 5922213.0, + "step": 154 + }, + { + "epoch": 0.019717593181529068, + "ewc_loss": 0.0017946363659575582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.973181593319168e-07, + "grad_norm": 6.800468921661377, + "learning_rate": 6.528189910979228e-08, + "loss": 0.7437, + "mean_token_accuracy": 0.784450352191925, + "num_tokens": 5957461.0, + "step": 155 + }, + { + "epoch": 0.01984480346011958, + "ewc_loss": 0.0017760907066985965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.880453492565721e-07, + "grad_norm": 6.7372355461120605, + "learning_rate": 6.570580754557016e-08, + "loss": 0.7285, + "mean_token_accuracy": 0.7849030494689941, + "num_tokens": 5994675.0, + "step": 156 + }, + { + "epoch": 0.019972013738710086, + "ewc_loss": 0.0017559853149577975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.77992647474457e-07, + "grad_norm": 6.60902214050293, + "learning_rate": 6.612971598134802e-08, + "loss": 0.691, + "mean_token_accuracy": 0.8000175952911377, + "num_tokens": 6032514.0, + "step": 157 + }, + { + "epoch": 0.020099224017300597, + "ewc_loss": 0.00172799255233258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.639962629786169e-07, + "grad_norm": 6.696821689605713, + "learning_rate": 6.655362441712589e-08, + "loss": 0.7635, + "mean_token_accuracy": 0.7789120674133301, + "num_tokens": 6069160.0, + "step": 158 + }, + { + "epoch": 0.020226434295891108, + "ewc_loss": 0.0017063528066501021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.531764024155564e-07, + "grad_norm": 6.727567672729492, + "learning_rate": 
6.697753285290376e-08, + "loss": 0.8177, + "mean_token_accuracy": 0.7632681131362915, + "num_tokens": 6106652.0, + "step": 159 + }, + { + "epoch": 0.02035364457448162, + "ewc_loss": 0.0016804023180156946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.402011530961317e-07, + "grad_norm": 6.64253568649292, + "learning_rate": 6.740144128868163e-08, + "loss": 0.752, + "mean_token_accuracy": 0.7845339775085449, + "num_tokens": 6143070.0, + "step": 160 + }, + { + "epoch": 0.020480854853072127, + "ewc_loss": 0.0016547980485484004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.273990488305571e-07, + "grad_norm": 6.454069137573242, + "learning_rate": 6.78253497244595e-08, + "loss": 0.7434, + "mean_token_accuracy": 0.7853793501853943, + "num_tokens": 6186495.0, + "step": 161 + }, + { + "epoch": 0.020608065131662637, + "ewc_loss": 0.0016200457466766238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.10022868336091e-07, + "grad_norm": 6.587480545043945, + "learning_rate": 6.824925816023738e-08, + "loss": 0.7448, + "mean_token_accuracy": 0.7850009202957153, + "num_tokens": 6222454.0, + "step": 162 + }, + { + "epoch": 0.02073527541025315, + "ewc_loss": 0.0015899117570370436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.949558948894264e-07, + "grad_norm": 6.835123062133789, + "learning_rate": 6.867316659601525e-08, + "loss": 0.7581, + "mean_token_accuracy": 0.7839377522468567, + "num_tokens": 6253260.0, + "step": 163 + }, + { + "epoch": 0.02086248568884366, + "ewc_loss": 0.0015736088389530778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.868044349379488e-07, + "grad_norm": 6.400243759155273, + "learning_rate": 6.909707503179312e-08, + "loss": 0.737, + "mean_token_accuracy": 0.7885956764221191, + "num_tokens": 6294773.0, + "step": 164 + }, + { + "epoch": 0.02098969596743417, + "ewc_loss": 0.0015410594642162323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.705297093707486e-07, + "grad_norm": 6.4774885177612305, + "learning_rate": 6.9520983467571e-08, + "loss": 
0.7815, + "mean_token_accuracy": 0.7744121551513672, + "num_tokens": 6335334.0, + "step": 165 + }, + { + "epoch": 0.021116906246024678, + "ewc_loss": 0.0015103736659511924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.551868179689336e-07, + "grad_norm": 6.493871688842773, + "learning_rate": 6.994489190334887e-08, + "loss": 0.717, + "mean_token_accuracy": 0.7898973226547241, + "num_tokens": 6372255.0, + "step": 166 + }, + { + "epoch": 0.02124411652461519, + "ewc_loss": 0.0014861891977488995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.430945743180928e-07, + "grad_norm": 6.366494655609131, + "learning_rate": 7.036880033912674e-08, + "loss": 0.6925, + "mean_token_accuracy": 0.7988641262054443, + "num_tokens": 6411971.0, + "step": 167 + }, + { + "epoch": 0.0213713268032057, + "ewc_loss": 0.0014589346246793866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.294673309843347e-07, + "grad_norm": 6.647204399108887, + "learning_rate": 7.079270877490461e-08, + "loss": 0.7648, + "mean_token_accuracy": 0.7782291173934937, + "num_tokens": 6450366.0, + "step": 168 + }, + { + "epoch": 0.02149853708179621, + "ewc_loss": 0.0014441150706261396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.220575071187341e-07, + "grad_norm": 6.744596481323242, + "learning_rate": 7.121661721068249e-08, + "loss": 0.7283, + "mean_token_accuracy": 0.7879616618156433, + "num_tokens": 6482199.0, + "step": 169 + }, + { + "epoch": 0.021625747360386718, + "ewc_loss": 0.0014320079935714602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160040240705712e-07, + "grad_norm": 6.4784979820251465, + "learning_rate": 7.164052564646036e-08, + "loss": 0.7078, + "mean_token_accuracy": 0.796673059463501, + "num_tokens": 6519697.0, + "step": 170 + }, + { + "epoch": 0.02175295763897723, + "ewc_loss": 0.001405288465321064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.026442290225532e-07, + "grad_norm": 6.5804901123046875, + "learning_rate": 7.206443408223823e-08, + "loss": 0.7695, + 
"mean_token_accuracy": 0.7768174409866333, + "num_tokens": 6554687.0, + "step": 171 + }, + { + "epoch": 0.02188016791756774, + "ewc_loss": 0.0013857368612661958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.928684115337091e-07, + "grad_norm": 6.500015735626221, + "learning_rate": 7.24883425180161e-08, + "loss": 0.732, + "mean_token_accuracy": 0.7810563445091248, + "num_tokens": 6593066.0, + "step": 172 + }, + { + "epoch": 0.02200737819615825, + "ewc_loss": 0.0013646670849993825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.823335638728167e-07, + "grad_norm": 6.297686576843262, + "learning_rate": 7.291225095379398e-08, + "loss": 0.6893, + "mean_token_accuracy": 0.8007980585098267, + "num_tokens": 6635484.0, + "step": 173 + }, + { + "epoch": 0.02213458847474876, + "ewc_loss": 0.0013437039451673627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.718519784953969e-07, + "grad_norm": 6.400178909301758, + "learning_rate": 7.333615938957185e-08, + "loss": 0.7819, + "mean_token_accuracy": 0.7740365266799927, + "num_tokens": 6677090.0, + "step": 174 + }, + { + "epoch": 0.02226179875333927, + "ewc_loss": 0.0013281635474413633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.640817673542188e-07, + "grad_norm": 6.498749732971191, + "learning_rate": 7.376006782534971e-08, + "loss": 0.6979, + "mean_token_accuracy": 0.7948863506317139, + "num_tokens": 6710250.0, + "step": 175 + }, + { + "epoch": 0.02238900903192978, + "ewc_loss": 0.0013195194769650698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.597597348445561e-07, + "grad_norm": 6.315557479858398, + "learning_rate": 7.418397626112758e-08, + "loss": 0.7237, + "mean_token_accuracy": 0.7867454290390015, + "num_tokens": 6752998.0, + "step": 176 + }, + { + "epoch": 0.02251621931052029, + "ewc_loss": 0.001303833327256143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.519166504403984e-07, + "grad_norm": 6.458813190460205, + "learning_rate": 7.460788469690545e-08, + "loss": 0.7257, + "mean_token_accuracy": 
0.7913997173309326, + "num_tokens": 6789568.0, + "step": 177 + }, + { + "epoch": 0.0226434295891108, + "ewc_loss": 0.0012941674795001745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.470837661254336e-07, + "grad_norm": 6.605006694793701, + "learning_rate": 7.503179313268333e-08, + "loss": 0.7664, + "mean_token_accuracy": 0.7758868336677551, + "num_tokens": 6822810.0, + "step": 178 + }, + { + "epoch": 0.02277063986770131, + "ewc_loss": 0.0012908712960779667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.454356480389833e-07, + "grad_norm": 6.385477066040039, + "learning_rate": 7.54557015684612e-08, + "loss": 0.7052, + "mean_token_accuracy": 0.7917125225067139, + "num_tokens": 6861598.0, + "step": 179 + }, + { + "epoch": 0.02289785014629182, + "ewc_loss": 0.001276632072404027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.383160098266671e-07, + "grad_norm": 6.437620639801025, + "learning_rate": 7.587961000423907e-08, + "loss": 0.728, + "mean_token_accuracy": 0.788697361946106, + "num_tokens": 6900220.0, + "step": 180 + }, + { + "epoch": 0.02302506042488233, + "ewc_loss": 0.001265979022718966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.329894972623151e-07, + "grad_norm": 6.469874858856201, + "learning_rate": 7.630351844001694e-08, + "loss": 0.8365, + "mean_token_accuracy": 0.7587976455688477, + "num_tokens": 6946983.0, + "step": 181 + }, + { + "epoch": 0.02315227070347284, + "ewc_loss": 0.0012622569920495152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.311285005722311e-07, + "grad_norm": 6.516287326812744, + "learning_rate": 7.672742687579482e-08, + "loss": 0.7421, + "mean_token_accuracy": 0.7853771448135376, + "num_tokens": 6986019.0, + "step": 182 + }, + { + "epoch": 0.02327948098206335, + "ewc_loss": 0.001258087228052318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.290435976552544e-07, + "grad_norm": 6.517285346984863, + "learning_rate": 7.715133531157269e-08, + "loss": 0.7166, + "mean_token_accuracy": 0.7857057452201843, + "num_tokens": 
7020722.0, + "step": 183 + }, + { + "epoch": 0.02340669126065386, + "ewc_loss": 0.0012584245996549726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.292123089224333e-07, + "grad_norm": 6.414140701293945, + "learning_rate": 7.757524374735056e-08, + "loss": 0.7876, + "mean_token_accuracy": 0.7726555466651917, + "num_tokens": 7062691.0, + "step": 184 + }, + { + "epoch": 0.02353390153924437, + "ewc_loss": 0.0012515820562839508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.25791017228039e-07, + "grad_norm": 6.452288627624512, + "learning_rate": 7.799915218312844e-08, + "loss": 0.7356, + "mean_token_accuracy": 0.7871745228767395, + "num_tokens": 7101090.0, + "step": 185 + }, + { + "epoch": 0.02366111181783488, + "ewc_loss": 0.0012511698296293616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.255849029912497e-07, + "grad_norm": 6.424145698547363, + "learning_rate": 7.842306061890631e-08, + "loss": 0.7091, + "mean_token_accuracy": 0.7884447574615479, + "num_tokens": 7139891.0, + "step": 186 + }, + { + "epoch": 0.023788322096425393, + "ewc_loss": 0.0012490443186834455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.24522158432228e-07, + "grad_norm": 6.487399101257324, + "learning_rate": 7.88469690546842e-08, + "loss": 0.7577, + "mean_token_accuracy": 0.7791844010353088, + "num_tokens": 7179501.0, + "step": 187 + }, + { + "epoch": 0.0239155323750159, + "ewc_loss": 0.0012511597014963627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.255798439269711e-07, + "grad_norm": 6.460659503936768, + "learning_rate": 7.927087749046207e-08, + "loss": 0.7273, + "mean_token_accuracy": 0.7867100238800049, + "num_tokens": 7213809.0, + "step": 188 + }, + { + "epoch": 0.02404274265360641, + "ewc_loss": 0.0012559077003970742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.279538524722739e-07, + "grad_norm": 6.498436450958252, + "learning_rate": 7.969478592623994e-08, + "loss": 0.8028, + "mean_token_accuracy": 0.7645776271820068, + "num_tokens": 7254493.0, + "step": 189 + }, + { 
+ "epoch": 0.024169952932196922, + "ewc_loss": 0.0012614475563168526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.307237754299422e-07, + "grad_norm": 6.488617420196533, + "learning_rate": 8.011869436201781e-08, + "loss": 0.6606, + "mean_token_accuracy": 0.8062105178833008, + "num_tokens": 7292643.0, + "step": 190 + }, + { + "epoch": 0.024297163210787433, + "ewc_loss": 0.0012662799563258886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.331399617920397e-07, + "grad_norm": 6.6192708015441895, + "learning_rate": 8.054260279779568e-08, + "loss": 0.7853, + "mean_token_accuracy": 0.7736090421676636, + "num_tokens": 7329810.0, + "step": 191 + }, + { + "epoch": 0.02442437348937794, + "ewc_loss": 0.0012764232233166695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.382115884662198e-07, + "grad_norm": 6.493789196014404, + "learning_rate": 8.096651123357356e-08, + "loss": 0.7707, + "mean_token_accuracy": 0.7756034135818481, + "num_tokens": 7367630.0, + "step": 192 + }, + { + "epoch": 0.02455158376796845, + "ewc_loss": 0.0012813189532607794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.406594934560417e-07, + "grad_norm": 6.436516761779785, + "learning_rate": 8.139041966935143e-08, + "loss": 0.7513, + "mean_token_accuracy": 0.7792439460754395, + "num_tokens": 7411580.0, + "step": 193 + }, + { + "epoch": 0.024678794046558962, + "ewc_loss": 0.0012889646459370852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.444823270612687e-07, + "grad_norm": 6.470629692077637, + "learning_rate": 8.181432810512929e-08, + "loss": 0.7075, + "mean_token_accuracy": 0.7933007478713989, + "num_tokens": 7451631.0, + "step": 194 + }, + { + "epoch": 0.024806004325149473, + "ewc_loss": 0.001299114665016532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.495573074971617e-07, + "grad_norm": 6.455161094665527, + "learning_rate": 8.223823654090716e-08, + "loss": 0.7163, + "mean_token_accuracy": 0.7892184257507324, + "num_tokens": 7493645.0, + "step": 195 + }, + { + "epoch": 
0.024933214603739984, + "ewc_loss": 0.001311626867391169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.558134373335633e-07, + "grad_norm": 6.510015964508057, + "learning_rate": 8.266214497668503e-08, + "loss": 0.6772, + "mean_token_accuracy": 0.8028495907783508, + "num_tokens": 7538042.0, + "step": 196 + }, + { + "epoch": 0.02506042488233049, + "ewc_loss": 0.0013292009243741632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.646004635513236e-07, + "grad_norm": 6.549056529998779, + "learning_rate": 8.30860534124629e-08, + "loss": 0.7415, + "mean_token_accuracy": 0.782960832118988, + "num_tokens": 7575374.0, + "step": 197 + }, + { + "epoch": 0.025187635160921002, + "ewc_loss": 0.0013423921773210168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.71196062285162e-07, + "grad_norm": 6.729876518249512, + "learning_rate": 8.350996184824078e-08, + "loss": 0.7365, + "mean_token_accuracy": 0.7830025553703308, + "num_tokens": 7606880.0, + "step": 198 + }, + { + "epoch": 0.025314845439511513, + "ewc_loss": 0.00136487593408674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.82437985233264e-07, + "grad_norm": 6.554811000823975, + "learning_rate": 8.393387028401865e-08, + "loss": 0.7128, + "mean_token_accuracy": 0.7920687794685364, + "num_tokens": 7644840.0, + "step": 199 + }, + { + "epoch": 0.025442055718102024, + "ewc_loss": 0.0013814361300319433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.907180818416236e-07, + "grad_norm": 6.56201171875, + "learning_rate": 8.435777871979652e-08, + "loss": 0.7524, + "mean_token_accuracy": 0.778841495513916, + "num_tokens": 7683856.0, + "step": 200 + }, + { + "epoch": 0.02556926599669253, + "ewc_loss": 0.0014010327868163586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.005164093243366e-07, + "grad_norm": 6.775506019592285, + "learning_rate": 8.47816871555744e-08, + "loss": 0.714, + "mean_token_accuracy": 0.7903810143470764, + "num_tokens": 7715306.0, + "step": 201 + }, + { + "epoch": 0.025696476275283042, + "ewc_loss": 
0.001433694502338767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.168472393459524e-07, + "grad_norm": 6.735413074493408, + "learning_rate": 8.520559559135227e-08, + "loss": 0.6725, + "mean_token_accuracy": 0.8016611337661743, + "num_tokens": 7752442.0, + "step": 202 + }, + { + "epoch": 0.025823686553873553, + "ewc_loss": 0.0014597837580367923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.298918944798061e-07, + "grad_norm": 6.618638515472412, + "learning_rate": 8.562950402713014e-08, + "loss": 0.7559, + "mean_token_accuracy": 0.7813581228256226, + "num_tokens": 7799891.0, + "step": 203 + }, + { + "epoch": 0.025950896832464064, + "ewc_loss": 0.0014809652930125594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.404826192214387e-07, + "grad_norm": 6.774810314178467, + "learning_rate": 8.605341246290801e-08, + "loss": 0.749, + "mean_token_accuracy": 0.7788377404212952, + "num_tokens": 7830696.0, + "step": 204 + }, + { + "epoch": 0.026078107111054575, + "ewc_loss": 0.0015140442410483956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.570221214336925e-07, + "grad_norm": 6.724698066711426, + "learning_rate": 8.647732089868589e-08, + "loss": 0.7174, + "mean_token_accuracy": 0.7909634113311768, + "num_tokens": 7870968.0, + "step": 205 + }, + { + "epoch": 0.026205317389645082, + "ewc_loss": 0.00154264981392771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.713248919571924e-07, + "grad_norm": 6.7418622970581055, + "learning_rate": 8.690122933446376e-08, + "loss": 0.7401, + "mean_token_accuracy": 0.7845864295959473, + "num_tokens": 7914418.0, + "step": 206 + }, + { + "epoch": 0.026332527668235593, + "ewc_loss": 0.0015741256065666676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.870627882766712e-07, + "grad_norm": 6.640389919281006, + "learning_rate": 8.732513777024163e-08, + "loss": 0.6786, + "mean_token_accuracy": 0.8052254915237427, + "num_tokens": 7954544.0, + "step": 207 + }, + { + "epoch": 0.026459737946826104, + "ewc_loss": 0.0016015685396268964, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.007842779989005e-07, + "grad_norm": 6.591437339782715, + "learning_rate": 8.77490462060195e-08, + "loss": 0.7277, + "mean_token_accuracy": 0.7902453541755676, + "num_tokens": 7997373.0, + "step": 208 + }, + { + "epoch": 0.026586948225416615, + "ewc_loss": 0.0016309167258441448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.154583497343992e-07, + "grad_norm": 7.211565971374512, + "learning_rate": 8.817295464179738e-08, + "loss": 0.6937, + "mean_token_accuracy": 0.7925572395324707, + "num_tokens": 8033314.0, + "step": 209 + }, + { + "epoch": 0.026714158504007122, + "ewc_loss": 0.0016780226724222302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.390113634959562e-07, + "grad_norm": 6.736510276794434, + "learning_rate": 8.859686307757525e-08, + "loss": 0.7111, + "mean_token_accuracy": 0.7925777435302734, + "num_tokens": 8069362.0, + "step": 210 + }, + { + "epoch": 0.026841368782597633, + "ewc_loss": 0.001706827781163156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.534138942195568e-07, + "grad_norm": 6.806504249572754, + "learning_rate": 8.902077151335312e-08, + "loss": 0.7447, + "mean_token_accuracy": 0.7796697616577148, + "num_tokens": 8109126.0, + "step": 211 + }, + { + "epoch": 0.026968579061188144, + "ewc_loss": 0.0017374585149809718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.687292734066432e-07, + "grad_norm": 7.065062046051025, + "learning_rate": 8.944467994913098e-08, + "loss": 0.8003, + "mean_token_accuracy": 0.7704470157623291, + "num_tokens": 8143083.0, + "step": 212 + }, + { + "epoch": 0.027095789339778655, + "ewc_loss": 0.0017855394398793578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.927697194849316e-07, + "grad_norm": 7.217250823974609, + "learning_rate": 8.986858838490885e-08, + "loss": 0.6485, + "mean_token_accuracy": 0.8037785887718201, + "num_tokens": 8176636.0, + "step": 213 + }, + { + "epoch": 0.027222999618369163, + "ewc_loss": 0.0018343136180192232, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 9.171567967314331e-07, + "grad_norm": 6.847113132476807, + "learning_rate": 9.029249682068673e-08, + "loss": 0.7061, + "mean_token_accuracy": 0.7908580899238586, + "num_tokens": 8220262.0, + "step": 214 + }, + { + "epoch": 0.027350209896959674, + "ewc_loss": 0.0018558583687990904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.279291930397449e-07, + "grad_norm": 7.027007579803467, + "learning_rate": 9.07164052564646e-08, + "loss": 0.7277, + "mean_token_accuracy": 0.787722647190094, + "num_tokens": 8256303.0, + "step": 215 + }, + { + "epoch": 0.027477420175550184, + "ewc_loss": 0.0018935241969302297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.467620998293569e-07, + "grad_norm": 6.828755855560303, + "learning_rate": 9.114031369224247e-08, + "loss": 0.6903, + "mean_token_accuracy": 0.7980394959449768, + "num_tokens": 8293888.0, + "step": 216 + }, + { + "epoch": 0.027604630454140695, + "ewc_loss": 0.0019202802795916796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.601401416148292e-07, + "grad_norm": 7.0778398513793945, + "learning_rate": 9.156422212802034e-08, + "loss": 0.6522, + "mean_token_accuracy": 0.8070340156555176, + "num_tokens": 8335368.0, + "step": 217 + }, + { + "epoch": 0.027731840732731206, + "ewc_loss": 0.001957795349881053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.788976740310318e-07, + "grad_norm": 7.065819263458252, + "learning_rate": 9.198813056379822e-08, + "loss": 0.709, + "mean_token_accuracy": 0.7935703992843628, + "num_tokens": 8370338.0, + "step": 218 + }, + { + "epoch": 0.027859051011321714, + "ewc_loss": 0.001995170721784234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.975854027288733e-07, + "grad_norm": 7.259815692901611, + "learning_rate": 9.241203899957609e-08, + "loss": 0.6929, + "mean_token_accuracy": 0.795347273349762, + "num_tokens": 8409904.0, + "step": 219 + }, + { + "epoch": 0.027986261289912225, + "ewc_loss": 0.0020358392503112555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.017919657897437e-06, + "grad_norm": 7.517327308654785, + "learning_rate": 9.283594743535396e-08, + "loss": 0.8208, + "mean_token_accuracy": 0.7622523307800293, + "num_tokens": 8446579.0, + "step": 220 + }, + { + "epoch": 0.028113471568502735, + "ewc_loss": 0.0020765350200235844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0382675554865273e-06, + "grad_norm": 7.162256717681885, + "learning_rate": 9.325985587113183e-08, + "loss": 0.6655, + "mean_token_accuracy": 0.8049097657203674, + "num_tokens": 8483663.0, + "step": 221 + }, + { + "epoch": 0.028240681847093246, + "ewc_loss": 0.0020962858106940985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0481428489583777e-06, + "grad_norm": 7.348745346069336, + "learning_rate": 9.368376430690971e-08, + "loss": 0.7002, + "mean_token_accuracy": 0.7925146222114563, + "num_tokens": 8516346.0, + "step": 222 + }, + { + "epoch": 0.028367892125683754, + "ewc_loss": 0.0021241153590381145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0620576631481526e-06, + "grad_norm": 7.116180419921875, + "learning_rate": 9.410767274268758e-08, + "loss": 0.6935, + "mean_token_accuracy": 0.7973346710205078, + "num_tokens": 8556914.0, + "step": 223 + }, + { + "epoch": 0.028495102404274265, + "ewc_loss": 0.0021424926817417145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0712462881201645e-06, + "grad_norm": 7.933953762054443, + "learning_rate": 9.453158117846545e-08, + "loss": 0.6808, + "mean_token_accuracy": 0.7954399585723877, + "num_tokens": 8596548.0, + "step": 224 + }, + { + "epoch": 0.028622312682864776, + "ewc_loss": 0.002197474241256714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.098737129723304e-06, + "grad_norm": 7.355027675628662, + "learning_rate": 9.495548961424333e-08, + "loss": 0.7123, + "mean_token_accuracy": 0.7940149903297424, + "num_tokens": 8634366.0, + "step": 225 + }, + { + "epoch": 0.028749522961455286, + "ewc_loss": 0.0022120855282992125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.1060427596021327e-06, + "grad_norm": 7.644140243530273, + "learning_rate": 9.53793980500212e-08, + "loss": 0.6525, + "mean_token_accuracy": 0.805831789970398, + "num_tokens": 8666480.0, + "step": 226 + }, + { + "epoch": 0.028876733240045797, + "ewc_loss": 0.002239528577774763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1197643061677809e-06, + "grad_norm": 8.988035202026367, + "learning_rate": 9.580330648579907e-08, + "loss": 0.8019, + "mean_token_accuracy": 0.7696486711502075, + "num_tokens": 8705880.0, + "step": 227 + }, + { + "epoch": 0.029003943518636305, + "ewc_loss": 0.002316064666956663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1580323189264163e-06, + "grad_norm": 7.513918399810791, + "learning_rate": 9.622721492157694e-08, + "loss": 0.6641, + "mean_token_accuracy": 0.8033554553985596, + "num_tokens": 8743772.0, + "step": 228 + }, + { + "epoch": 0.029131153797226816, + "ewc_loss": 0.002300409832969308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1502048664624454e-06, + "grad_norm": 8.186564445495605, + "learning_rate": 9.665112335735482e-08, + "loss": 0.6899, + "mean_token_accuracy": 0.7920101881027222, + "num_tokens": 8775919.0, + "step": 229 + }, + { + "epoch": 0.029258364075817327, + "ewc_loss": 0.002310890005901456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1554450338735478e-06, + "grad_norm": 7.662014007568359, + "learning_rate": 9.707503179313267e-08, + "loss": 0.6359, + "mean_token_accuracy": 0.8094770908355713, + "num_tokens": 8809336.0, + "step": 230 + }, + { + "epoch": 0.029385574354407838, + "ewc_loss": 0.002308906987309456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1544534572749399e-06, + "grad_norm": 7.526935577392578, + "learning_rate": 9.749894022891055e-08, + "loss": 0.6876, + "mean_token_accuracy": 0.7961510419845581, + "num_tokens": 8845952.0, + "step": 231 + }, + { + "epoch": 0.029512784632998345, + "ewc_loss": 0.00230609648860991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1530482879607007e-06, + 
"grad_norm": 7.369327068328857, + "learning_rate": 9.792284866468842e-08, + "loss": 0.708, + "mean_token_accuracy": 0.7866420745849609, + "num_tokens": 8889801.0, + "step": 232 + }, + { + "epoch": 0.029639994911588856, + "ewc_loss": 0.0023056231439113617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1528115919645643e-06, + "grad_norm": 8.045612335205078, + "learning_rate": 9.834675710046629e-08, + "loss": 0.6677, + "mean_token_accuracy": 0.7978543043136597, + "num_tokens": 8925429.0, + "step": 233 + }, + { + "epoch": 0.029767205190179367, + "ewc_loss": 0.0023437354248017073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1718676660166238e-06, + "grad_norm": 7.4615044593811035, + "learning_rate": 9.877066553624416e-08, + "loss": 0.6988, + "mean_token_accuracy": 0.7909080982208252, + "num_tokens": 8963360.0, + "step": 234 + }, + { + "epoch": 0.029894415468769878, + "ewc_loss": 0.0023409994319081306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1704996722983196e-06, + "grad_norm": 7.706396102905273, + "learning_rate": 9.919457397202204e-08, + "loss": 0.7117, + "mean_token_accuracy": 0.7913352251052856, + "num_tokens": 8998314.0, + "step": 235 + }, + { + "epoch": 0.030021625747360385, + "ewc_loss": 0.002352363197132945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1761816267608083e-06, + "grad_norm": 8.339431762695312, + "learning_rate": 9.961848240779991e-08, + "loss": 0.7825, + "mean_token_accuracy": 0.7721515893936157, + "num_tokens": 9034804.0, + "step": 236 + }, + { + "epoch": 0.030148836025950896, + "ewc_loss": 0.0023913541808724403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1956771004406619e-06, + "grad_norm": 8.288474082946777, + "learning_rate": 1.0004239084357778e-07, + "loss": 0.705, + "mean_token_accuracy": 0.7921186685562134, + "num_tokens": 9070840.0, + "step": 237 + }, + { + "epoch": 0.030276046304541407, + "ewc_loss": 0.002414130140095949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2070651109752362e-06, + "grad_norm": 
7.571855545043945, + "learning_rate": 1.0046629927935566e-07, + "loss": 0.6844, + "mean_token_accuracy": 0.7988932132720947, + "num_tokens": 9109370.0, + "step": 238 + }, + { + "epoch": 0.030403256583131918, + "ewc_loss": 0.0023864698596298695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1932348797927261e-06, + "grad_norm": 8.368889808654785, + "learning_rate": 1.0089020771513353e-07, + "loss": 0.6266, + "mean_token_accuracy": 0.8130732774734497, + "num_tokens": 9148191.0, + "step": 239 + }, + { + "epoch": 0.03053046686172243, + "ewc_loss": 0.0023993682116270065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.199684106722998e-06, + "grad_norm": 7.593578338623047, + "learning_rate": 1.013141161509114e-07, + "loss": 0.6942, + "mean_token_accuracy": 0.7945796251296997, + "num_tokens": 9182923.0, + "step": 240 + }, + { + "epoch": 0.030657677140312936, + "ewc_loss": 0.002380114747211337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.190057332678407e-06, + "grad_norm": 7.699078559875488, + "learning_rate": 1.0173802458668927e-07, + "loss": 0.6946, + "mean_token_accuracy": 0.7913695573806763, + "num_tokens": 9215882.0, + "step": 241 + }, + { + "epoch": 0.030784887418903447, + "ewc_loss": 0.002378421602770686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1892108204847318e-06, + "grad_norm": 8.2575101852417, + "learning_rate": 1.0216193302246715e-07, + "loss": 0.7268, + "mean_token_accuracy": 0.7806740403175354, + "num_tokens": 9249718.0, + "step": 242 + }, + { + "epoch": 0.030912097697493958, + "ewc_loss": 0.0024136602878570557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2068301202816656e-06, + "grad_norm": 7.813071250915527, + "learning_rate": 1.0258584145824502e-07, + "loss": 0.6455, + "mean_token_accuracy": 0.8077678680419922, + "num_tokens": 9291146.0, + "step": 243 + }, + { + "epoch": 0.03103930797608447, + "ewc_loss": 0.0024040790740400553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2020395843137521e-06, + "grad_norm": 8.281041145324707, + 
"learning_rate": 1.0300974989402289e-07, + "loss": 0.6406, + "mean_token_accuracy": 0.8075567483901978, + "num_tokens": 9326403.0, + "step": 244 + }, + { + "epoch": 0.031166518254674976, + "ewc_loss": 0.0024223271757364273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2111636351619381e-06, + "grad_norm": 7.510015964508057, + "learning_rate": 1.0343365832980076e-07, + "loss": 0.6919, + "mean_token_accuracy": 0.7941529750823975, + "num_tokens": 9368491.0, + "step": 245 + }, + { + "epoch": 0.03129372853326549, + "ewc_loss": 0.0023888321593403816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1944160860366537e-06, + "grad_norm": 8.405774116516113, + "learning_rate": 1.0385756676557864e-07, + "loss": 0.6888, + "mean_token_accuracy": 0.7973818778991699, + "num_tokens": 9409041.0, + "step": 246 + }, + { + "epoch": 0.031420938811855995, + "ewc_loss": 0.0024151059333235025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2075529411958996e-06, + "grad_norm": 9.154777526855469, + "learning_rate": 1.0428147520135651e-07, + "loss": 0.6611, + "mean_token_accuracy": 0.8030679821968079, + "num_tokens": 9448591.0, + "step": 247 + }, + { + "epoch": 0.03154814909044651, + "ewc_loss": 0.002464160555973649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2320803080001497e-06, + "grad_norm": 8.358177185058594, + "learning_rate": 1.0470538363713437e-07, + "loss": 0.6914, + "mean_token_accuracy": 0.793225109577179, + "num_tokens": 9486604.0, + "step": 248 + }, + { + "epoch": 0.031675359369037016, + "ewc_loss": 0.002442310331389308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.22115511658194e-06, + "grad_norm": 7.943212985992432, + "learning_rate": 1.0512929207291224e-07, + "loss": 0.6974, + "mean_token_accuracy": 0.788100004196167, + "num_tokens": 9525285.0, + "step": 249 + }, + { + "epoch": 0.03180256964762753, + "ewc_loss": 0.002402100246399641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.201050167765061e-06, + "grad_norm": 9.148421287536621, + "learning_rate": 
1.0555320050869011e-07, + "loss": 0.677, + "mean_token_accuracy": 0.7941411733627319, + "num_tokens": 9557898.0, + "step": 250 + }, + { + "epoch": 0.03192977992621804, + "ewc_loss": 0.002435552654787898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2177763437648537e-06, + "grad_norm": 8.06045913696289, + "learning_rate": 1.0597710894446799e-07, + "loss": 0.7385, + "mean_token_accuracy": 0.7781370282173157, + "num_tokens": 9590438.0, + "step": 251 + }, + { + "epoch": 0.032056990204808546, + "ewc_loss": 0.0024038951378315687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2019476116620353e-06, + "grad_norm": 7.6036505699157715, + "learning_rate": 1.0640101738024586e-07, + "loss": 0.6449, + "mean_token_accuracy": 0.8075905442237854, + "num_tokens": 9631402.0, + "step": 252 + }, + { + "epoch": 0.03218420048339906, + "ewc_loss": 0.002358964178711176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1794820693467045e-06, + "grad_norm": 9.097552299499512, + "learning_rate": 1.0682492581602373e-07, + "loss": 0.6287, + "mean_token_accuracy": 0.8118212223052979, + "num_tokens": 9663702.0, + "step": 253 + }, + { + "epoch": 0.03231141076198957, + "ewc_loss": 0.0024146721698343754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2073360267095268e-06, + "grad_norm": 8.078313827514648, + "learning_rate": 1.072488342518016e-07, + "loss": 0.6159, + "mean_token_accuracy": 0.812401533126831, + "num_tokens": 9698969.0, + "step": 254 + }, + { + "epoch": 0.03243862104058008, + "ewc_loss": 0.0023957048542797565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1978523843936273e-06, + "grad_norm": 7.671243667602539, + "learning_rate": 1.0767274268757948e-07, + "loss": 0.685, + "mean_token_accuracy": 0.7935555577278137, + "num_tokens": 9738075.0, + "step": 255 + }, + { + "epoch": 0.03256583131917059, + "ewc_loss": 0.002351085189729929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.175542593045975e-06, + "grad_norm": 8.025350570678711, + "learning_rate": 1.0809665112335735e-07, + 
"loss": 0.689, + "mean_token_accuracy": 0.7967517375946045, + "num_tokens": 9782675.0, + "step": 256 + }, + { + "epoch": 0.0326930415977611, + "ewc_loss": 0.0023555313237011433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1777656254707836e-06, + "grad_norm": 8.0563325881958, + "learning_rate": 1.0852055955913522e-07, + "loss": 0.5889, + "mean_token_accuracy": 0.8219985365867615, + "num_tokens": 9820585.0, + "step": 257 + }, + { + "epoch": 0.03282025187635161, + "ewc_loss": 0.0023670715745538473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1835358009193442e-06, + "grad_norm": 7.847565650939941, + "learning_rate": 1.089444679949131e-07, + "loss": 0.7144, + "mean_token_accuracy": 0.788586437702179, + "num_tokens": 9860693.0, + "step": 258 + }, + { + "epoch": 0.03294746215494212, + "ewc_loss": 0.0023519753012806177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1759876770156552e-06, + "grad_norm": 8.505414009094238, + "learning_rate": 1.0936837643069097e-07, + "loss": 0.6471, + "mean_token_accuracy": 0.8062025308609009, + "num_tokens": 9902629.0, + "step": 259 + }, + { + "epoch": 0.03307467243353263, + "ewc_loss": 0.0023794088046997786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.189704448734119e-06, + "grad_norm": 8.119163513183594, + "learning_rate": 1.0979228486646884e-07, + "loss": 0.6195, + "mean_token_accuracy": 0.8108576536178589, + "num_tokens": 9937304.0, + "step": 260 + }, + { + "epoch": 0.03320188271212314, + "ewc_loss": 0.0023638831917196512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.181941570393974e-06, + "grad_norm": 7.851895809173584, + "learning_rate": 1.1021619330224671e-07, + "loss": 0.7524, + "mean_token_accuracy": 0.7770236730575562, + "num_tokens": 9976911.0, + "step": 261 + }, + { + "epoch": 0.03332909299071365, + "ewc_loss": 0.002344760112464428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1723800525942352e-06, + "grad_norm": 7.719058513641357, + "learning_rate": 1.1064010173802458e-07, + "loss": 0.6862, + 
"mean_token_accuracy": 0.7960692644119263, + "num_tokens": 10015740.0, + "step": 262 + }, + { + "epoch": 0.03345630326930416, + "ewc_loss": 0.0023255045525729656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1627522553681047e-06, + "grad_norm": 10.158771514892578, + "learning_rate": 1.1106401017380246e-07, + "loss": 0.625, + "mean_token_accuracy": 0.8136388659477234, + "num_tokens": 10053054.0, + "step": 263 + }, + { + "epoch": 0.03358351354789467, + "ewc_loss": 0.0024480437859892845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2240218438819284e-06, + "grad_norm": 10.066934585571289, + "learning_rate": 1.1148791860958033e-07, + "loss": 0.7248, + "mean_token_accuracy": 0.7809624671936035, + "num_tokens": 10082088.0, + "step": 264 + }, + { + "epoch": 0.03371072382648518, + "ewc_loss": 0.0024861707352101803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2430854212652775e-06, + "grad_norm": 8.038081169128418, + "learning_rate": 1.119118270453582e-07, + "loss": 0.6944, + "mean_token_accuracy": 0.7923434376716614, + "num_tokens": 10121122.0, + "step": 265 + }, + { + "epoch": 0.03383793410507569, + "ewc_loss": 0.0023552589118480682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.177629428639193e-06, + "grad_norm": 7.945167541503906, + "learning_rate": 1.1233573548113607e-07, + "loss": 0.6873, + "mean_token_accuracy": 0.7952240705490112, + "num_tokens": 10161017.0, + "step": 266 + }, + { + "epoch": 0.0339651443836662, + "ewc_loss": 0.0023047439754009247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1523719649630948e-06, + "grad_norm": 7.50272798538208, + "learning_rate": 1.1275964391691393e-07, + "loss": 0.6516, + "mean_token_accuracy": 0.8063594102859497, + "num_tokens": 10204913.0, + "step": 267 + }, + { + "epoch": 0.03409235466225671, + "ewc_loss": 0.002284769667312503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1423848036429263e-06, + "grad_norm": 7.546519756317139, + "learning_rate": 1.131835523526918e-07, + "loss": 0.7009, + 
"mean_token_accuracy": 0.7878906726837158, + "num_tokens": 10245154.0, + "step": 268 + }, + { + "epoch": 0.03421956494084722, + "ewc_loss": 0.0023043800611048937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1521900660227402e-06, + "grad_norm": 8.597042083740234, + "learning_rate": 1.1360746078846968e-07, + "loss": 0.7085, + "mean_token_accuracy": 0.7876276969909668, + "num_tokens": 10283006.0, + "step": 269 + }, + { + "epoch": 0.03434677521943773, + "ewc_loss": 0.0023704045452177525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1852022225866676e-06, + "grad_norm": 8.665881156921387, + "learning_rate": 1.1403136922424755e-07, + "loss": 0.6694, + "mean_token_accuracy": 0.7983725666999817, + "num_tokens": 10316689.0, + "step": 270 + }, + { + "epoch": 0.03447398549802824, + "ewc_loss": 0.002384067280218005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1920336646653595e-06, + "grad_norm": 8.198529243469238, + "learning_rate": 1.1445527766002542e-07, + "loss": 0.6618, + "mean_token_accuracy": 0.7996371388435364, + "num_tokens": 10358145.0, + "step": 271 + }, + { + "epoch": 0.03460119577661875, + "ewc_loss": 0.0023349830880761147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1674915185722057e-06, + "grad_norm": 9.448094367980957, + "learning_rate": 1.148791860958033e-07, + "loss": 0.7639, + "mean_token_accuracy": 0.7730717658996582, + "num_tokens": 10393424.0, + "step": 272 + }, + { + "epoch": 0.034728406055209264, + "ewc_loss": 0.0023790961131453514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1895480156454141e-06, + "grad_norm": 7.840945243835449, + "learning_rate": 1.1530309453158117e-07, + "loss": 0.6767, + "mean_token_accuracy": 0.7966768145561218, + "num_tokens": 10433560.0, + "step": 273 + }, + { + "epoch": 0.03485561633379977, + "ewc_loss": 0.0023025183472782373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1512591981954756e-06, + "grad_norm": 8.315547943115234, + "learning_rate": 1.1572700296735904e-07, + "loss": 0.6747, + 
"mean_token_accuracy": 0.7973127365112305, + "num_tokens": 10472024.0, + "step": 274 + }, + { + "epoch": 0.03498282661239028, + "ewc_loss": 0.0023064834531396627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1532417829585029e-06, + "grad_norm": 8.14175796508789, + "learning_rate": 1.1615091140313691e-07, + "loss": 0.7139, + "mean_token_accuracy": 0.7866561412811279, + "num_tokens": 10510874.0, + "step": 275 + }, + { + "epoch": 0.03511003689098079, + "ewc_loss": 0.0023109677713364363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1554839147720486e-06, + "grad_norm": 8.018966674804688, + "learning_rate": 1.1657481983891479e-07, + "loss": 0.65, + "mean_token_accuracy": 0.8055415153503418, + "num_tokens": 10552411.0, + "step": 276 + }, + { + "epoch": 0.0352372471695713, + "ewc_loss": 0.0023082217667251825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1541109188328846e-06, + "grad_norm": 10.036711692810059, + "learning_rate": 1.1699872827469266e-07, + "loss": 0.6679, + "mean_token_accuracy": 0.8010569214820862, + "num_tokens": 10591537.0, + "step": 277 + }, + { + "epoch": 0.03536445744816181, + "ewc_loss": 0.002412056550383568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.206028287015215e-06, + "grad_norm": 8.386226654052734, + "learning_rate": 1.1742263671047053e-07, + "loss": 0.6755, + "mean_token_accuracy": 0.798138439655304, + "num_tokens": 10632228.0, + "step": 278 + }, + { + "epoch": 0.03549166772675232, + "ewc_loss": 0.0023448364809155464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1724182513717096e-06, + "grad_norm": 9.995002746582031, + "learning_rate": 1.178465451462484e-07, + "loss": 0.731, + "mean_token_accuracy": 0.7797740697860718, + "num_tokens": 10672829.0, + "step": 279 + }, + { + "epoch": 0.03561887800534283, + "ewc_loss": 0.0023843448143452406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.19217236260738e-06, + "grad_norm": 9.904221534729004, + "learning_rate": 1.1827045358202628e-07, + "loss": 0.7059, + "mean_token_accuracy": 
0.7942780256271362, + "num_tokens": 10706146.0, + "step": 280 + }, + { + "epoch": 0.035746088283933344, + "ewc_loss": 0.0023862377274781466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.19311891921825e-06, + "grad_norm": 8.304967880249023, + "learning_rate": 1.1869436201780415e-07, + "loss": 0.6208, + "mean_token_accuracy": 0.8145540356636047, + "num_tokens": 10744112.0, + "step": 281 + }, + { + "epoch": 0.03587329856252385, + "ewc_loss": 0.002294966485351324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1474832035673899e-06, + "grad_norm": 8.148405075073242, + "learning_rate": 1.1911827045358202e-07, + "loss": 0.6524, + "mean_token_accuracy": 0.8093932867050171, + "num_tokens": 10783726.0, + "step": 282 + }, + { + "epoch": 0.03600050884111436, + "ewc_loss": 0.0022718010004609823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1359004474797985e-06, + "grad_norm": 8.019560813903809, + "learning_rate": 1.195421788893599e-07, + "loss": 0.7163, + "mean_token_accuracy": 0.7885172367095947, + "num_tokens": 10824476.0, + "step": 283 + }, + { + "epoch": 0.036127719119704874, + "ewc_loss": 0.002278207801282406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1391039151931182e-06, + "grad_norm": 8.092009544372559, + "learning_rate": 1.1996608732513778e-07, + "loss": 0.6773, + "mean_token_accuracy": 0.7984544634819031, + "num_tokens": 10859997.0, + "step": 284 + }, + { + "epoch": 0.03625492939829538, + "ewc_loss": 0.0022990622092038393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.149531158262107e-06, + "grad_norm": 8.396607398986816, + "learning_rate": 1.2038999576091563e-07, + "loss": 0.6475, + "mean_token_accuracy": 0.8067190051078796, + "num_tokens": 10898800.0, + "step": 285 + }, + { + "epoch": 0.036382139676885895, + "ewc_loss": 0.002316411817446351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1582059187276172e-06, + "grad_norm": 8.11975383758545, + "learning_rate": 1.208139041966935e-07, + "loss": 0.6125, + "mean_token_accuracy": 0.8164637088775635, 
+ "num_tokens": 10937462.0, + "step": 286 + }, + { + "epoch": 0.0365093499554764, + "ewc_loss": 0.002308706985786557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1543535265445826e-06, + "grad_norm": 8.515785217285156, + "learning_rate": 1.2123781263247137e-07, + "loss": 0.6755, + "mean_token_accuracy": 0.7953523397445679, + "num_tokens": 10970739.0, + "step": 287 + }, + { + "epoch": 0.03663656023406691, + "ewc_loss": 0.00232721958309412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1636097951850388e-06, + "grad_norm": 8.733708381652832, + "learning_rate": 1.2166172106824924e-07, + "loss": 0.7274, + "mean_token_accuracy": 0.78873610496521, + "num_tokens": 11003000.0, + "step": 288 + }, + { + "epoch": 0.036763770512657425, + "ewc_loss": 0.0023506474681198597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.175323745883361e-06, + "grad_norm": 8.438678741455078, + "learning_rate": 1.2208562950402712e-07, + "loss": 0.6312, + "mean_token_accuracy": 0.8074706792831421, + "num_tokens": 11039665.0, + "step": 289 + }, + { + "epoch": 0.03689098079124793, + "ewc_loss": 0.0023366552777588367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1683276852636482e-06, + "grad_norm": 8.512065887451172, + "learning_rate": 1.22509537939805e-07, + "loss": 0.6588, + "mean_token_accuracy": 0.7973082065582275, + "num_tokens": 11078368.0, + "step": 290 + }, + { + "epoch": 0.03701819106983844, + "ewc_loss": 0.002337784506380558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1688922540997737e-06, + "grad_norm": 8.128265380859375, + "learning_rate": 1.2293344637558286e-07, + "loss": 0.6837, + "mean_token_accuracy": 0.7964304089546204, + "num_tokens": 11122654.0, + "step": 291 + }, + { + "epoch": 0.037145401348428954, + "ewc_loss": 0.0023187731858342886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1593865565373562e-06, + "grad_norm": 8.2014741897583, + "learning_rate": 1.2335735481136073e-07, + "loss": 0.6393, + "mean_token_accuracy": 0.8018869757652283, + "num_tokens": 
11162622.0, + "step": 292 + }, + { + "epoch": 0.03727261162701946, + "ewc_loss": 0.0023365579545497894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1682790272971033e-06, + "grad_norm": 8.556510925292969, + "learning_rate": 1.237812632471386e-07, + "loss": 0.6506, + "mean_token_accuracy": 0.8049689531326294, + "num_tokens": 11202564.0, + "step": 293 + }, + { + "epoch": 0.037399821905609976, + "ewc_loss": 0.002370112109929323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1850560213133576e-06, + "grad_norm": 8.849364280700684, + "learning_rate": 1.2420517168291648e-07, + "loss": 0.6693, + "mean_token_accuracy": 0.794671893119812, + "num_tokens": 11243422.0, + "step": 294 + }, + { + "epoch": 0.03752703218420048, + "ewc_loss": 0.002394672716036439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1973363598372089e-06, + "grad_norm": 8.79472827911377, + "learning_rate": 1.2462908011869435e-07, + "loss": 0.6506, + "mean_token_accuracy": 0.8012773990631104, + "num_tokens": 11280867.0, + "step": 295 + }, + { + "epoch": 0.03765424246279099, + "ewc_loss": 0.0023821070790290833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1910535704373615e-06, + "grad_norm": 10.695225715637207, + "learning_rate": 1.2505298855447223e-07, + "loss": 0.5985, + "mean_token_accuracy": 0.8178490400314331, + "num_tokens": 11318454.0, + "step": 296 + }, + { + "epoch": 0.037781452741381505, + "ewc_loss": 0.0024800521787256002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2400261084621889e-06, + "grad_norm": 9.619751930236816, + "learning_rate": 1.254768969902501e-07, + "loss": 0.7409, + "mean_token_accuracy": 0.7806042432785034, + "num_tokens": 11354721.0, + "step": 297 + }, + { + "epoch": 0.03790866301997201, + "ewc_loss": 0.002429245039820671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2146225571996183e-06, + "grad_norm": 9.353029251098633, + "learning_rate": 1.2590080542602797e-07, + "loss": 0.7, + "mean_token_accuracy": 0.7907037734985352, + "num_tokens": 11385938.0, + 
"step": 298 + }, + { + "epoch": 0.03803587329856253, + "ewc_loss": 0.002383837243542075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1919186135855853e-06, + "grad_norm": 8.28943157196045, + "learning_rate": 1.2632471386180584e-07, + "loss": 0.6787, + "mean_token_accuracy": 0.7971154451370239, + "num_tokens": 11424911.0, + "step": 299 + }, + { + "epoch": 0.038163083577153034, + "ewc_loss": 0.002336133737117052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1680668876579148e-06, + "grad_norm": 8.325156211853027, + "learning_rate": 1.2674862229758372e-07, + "loss": 0.6446, + "mean_token_accuracy": 0.8070300817489624, + "num_tokens": 11460580.0, + "step": 300 + }, + { + "epoch": 0.03829029385574354, + "ewc_loss": 0.00235503981821239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1775199482144671e-06, + "grad_norm": 8.969012260437012, + "learning_rate": 1.271725307333616e-07, + "loss": 0.6355, + "mean_token_accuracy": 0.8096516132354736, + "num_tokens": 11501031.0, + "step": 301 + }, + { + "epoch": 0.038417504134334056, + "ewc_loss": 0.002406900515779853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.203450210596202e-06, + "grad_norm": 8.296321868896484, + "learning_rate": 1.2759643916913946e-07, + "loss": 0.6443, + "mean_token_accuracy": 0.8067268133163452, + "num_tokens": 11544053.0, + "step": 302 + }, + { + "epoch": 0.03854471441292456, + "ewc_loss": 0.002371726091951132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1858630841743434e-06, + "grad_norm": 8.667457580566406, + "learning_rate": 1.2802034760491733e-07, + "loss": 0.662, + "mean_token_accuracy": 0.7985119819641113, + "num_tokens": 11576944.0, + "step": 303 + }, + { + "epoch": 0.03867192469151508, + "ewc_loss": 0.002396652242168784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1983261174464133e-06, + "grad_norm": 8.297847747802734, + "learning_rate": 1.284442560406952e-07, + "loss": 0.7111, + "mean_token_accuracy": 0.7857096791267395, + "num_tokens": 11619735.0, + "step": 304 + }, + { + 
"epoch": 0.038799134970105585, + "ewc_loss": 0.002381704980507493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.190852458421432e-06, + "grad_norm": 8.910730361938477, + "learning_rate": 1.2886816447647308e-07, + "loss": 0.7502, + "mean_token_accuracy": 0.7727857232093811, + "num_tokens": 11655025.0, + "step": 305 + }, + { + "epoch": 0.03892634524869609, + "ewc_loss": 0.002430891152471304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.215445536217885e-06, + "grad_norm": 8.689637184143066, + "learning_rate": 1.2929207291225095e-07, + "loss": 0.6455, + "mean_token_accuracy": 0.805659294128418, + "num_tokens": 11691970.0, + "step": 306 + }, + { + "epoch": 0.03905355552728661, + "ewc_loss": 0.002419054275378585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2095271131329355e-06, + "grad_norm": 8.931110382080078, + "learning_rate": 1.2971598134802882e-07, + "loss": 0.6618, + "mean_token_accuracy": 0.7999780178070068, + "num_tokens": 11729561.0, + "step": 307 + }, + { + "epoch": 0.039180765805877114, + "ewc_loss": 0.0024343894328922033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2171947219030699e-06, + "grad_norm": 9.223662376403809, + "learning_rate": 1.301398897838067e-07, + "loss": 0.6822, + "mean_token_accuracy": 0.7923961877822876, + "num_tokens": 11765707.0, + "step": 308 + }, + { + "epoch": 0.03930797608446762, + "ewc_loss": 0.00245507899671793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2275395420147106e-06, + "grad_norm": 8.647607803344727, + "learning_rate": 1.3056379821958457e-07, + "loss": 0.6909, + "mean_token_accuracy": 0.79698646068573, + "num_tokens": 11806223.0, + "step": 309 + }, + { + "epoch": 0.039435186363058136, + "ewc_loss": 0.002429518848657608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2147594361522351e-06, + "grad_norm": 8.187288284301758, + "learning_rate": 1.3098770665536244e-07, + "loss": 0.6444, + "mean_token_accuracy": 0.8061856031417847, + "num_tokens": 11845477.0, + "step": 310 + }, + { + "epoch": 
0.039562396641648644, + "ewc_loss": 0.0024109473451972008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.205473722620809e-06, + "grad_norm": 9.528751373291016, + "learning_rate": 1.3141161509114031e-07, + "loss": 0.6304, + "mean_token_accuracy": 0.8050544857978821, + "num_tokens": 11877973.0, + "step": 311 + }, + { + "epoch": 0.03968960692023916, + "ewc_loss": 0.0024999608285725117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2499804142862558e-06, + "grad_norm": 9.024177551269531, + "learning_rate": 1.3183552352691819e-07, + "loss": 0.6391, + "mean_token_accuracy": 0.8079618215560913, + "num_tokens": 11912093.0, + "step": 312 + }, + { + "epoch": 0.039816817198829665, + "ewc_loss": 0.002487730933353305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2438654266588856e-06, + "grad_norm": 8.49204158782959, + "learning_rate": 1.3225943196269603e-07, + "loss": 0.6627, + "mean_token_accuracy": 0.8038612604141235, + "num_tokens": 11952541.0, + "step": 313 + }, + { + "epoch": 0.03994402747742017, + "ewc_loss": 0.002441025571897626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.220512785948813e-06, + "grad_norm": 9.0615234375, + "learning_rate": 1.3268334039847393e-07, + "loss": 0.6406, + "mean_token_accuracy": 0.8052499294281006, + "num_tokens": 11990414.0, + "step": 314 + }, + { + "epoch": 0.04007123775601069, + "ewc_loss": 0.0024865649174898863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2432824405550491e-06, + "grad_norm": 9.014471054077148, + "learning_rate": 1.3310724883425178e-07, + "loss": 0.7102, + "mean_token_accuracy": 0.7844064831733704, + "num_tokens": 12025925.0, + "step": 315 + }, + { + "epoch": 0.040198448034601195, + "ewc_loss": 0.002498985268175602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2494925840655924e-06, + "grad_norm": 8.821235656738281, + "learning_rate": 1.3353115727002968e-07, + "loss": 0.6744, + "mean_token_accuracy": 0.7979735732078552, + "num_tokens": 12068879.0, + "step": 316 + }, + { + "epoch": 0.04032565831319171, 
+ "ewc_loss": 0.0024840605910867453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2420302937243832e-06, + "grad_norm": 8.959529876708984, + "learning_rate": 1.3395506570580752e-07, + "loss": 0.6404, + "mean_token_accuracy": 0.8055540323257446, + "num_tokens": 12111931.0, + "step": 317 + }, + { + "epoch": 0.040452868591782216, + "ewc_loss": 0.002505514770746231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2527574426712818e-06, + "grad_norm": 10.091325759887695, + "learning_rate": 1.3437897414158542e-07, + "loss": 0.7123, + "mean_token_accuracy": 0.7828385233879089, + "num_tokens": 12140431.0, + "step": 318 + }, + { + "epoch": 0.040580078870372724, + "ewc_loss": 0.0025834154803305864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2917076901430846e-06, + "grad_norm": 9.48925495147705, + "learning_rate": 1.3480288257736327e-07, + "loss": 0.6378, + "mean_token_accuracy": 0.8060691356658936, + "num_tokens": 12173932.0, + "step": 319 + }, + { + "epoch": 0.04070728914896324, + "ewc_loss": 0.0025461167097091675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.273058387596393e-06, + "grad_norm": 8.60384464263916, + "learning_rate": 1.3522679101314117e-07, + "loss": 0.6176, + "mean_token_accuracy": 0.8090829253196716, + "num_tokens": 12210040.0, + "step": 320 + }, + { + "epoch": 0.040834499427553746, + "ewc_loss": 0.002493363805115223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2466819043766009e-06, + "grad_norm": 9.136018753051758, + "learning_rate": 1.35650699448919e-07, + "loss": 0.6491, + "mean_token_accuracy": 0.8049640655517578, + "num_tokens": 12248556.0, + "step": 321 + }, + { + "epoch": 0.04096170970614425, + "ewc_loss": 0.0025439830496907234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2719915503112134e-06, + "grad_norm": 9.873863220214844, + "learning_rate": 1.360746078846969e-07, + "loss": 0.6382, + "mean_token_accuracy": 0.805972158908844, + "num_tokens": 12285664.0, + "step": 322 + }, + { + "epoch": 0.04108891998473477, + "ewc_loss": 
0.0025991948787122965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.299597442994127e-06, + "grad_norm": 8.871613502502441, + "learning_rate": 1.3649851632047476e-07, + "loss": 0.6912, + "mean_token_accuracy": 0.7918824553489685, + "num_tokens": 12326964.0, + "step": 323 + }, + { + "epoch": 0.041216130263325275, + "ewc_loss": 0.0025404631160199642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.270231564376445e-06, + "grad_norm": 9.164097785949707, + "learning_rate": 1.3692242475625266e-07, + "loss": 0.5986, + "mean_token_accuracy": 0.8204982280731201, + "num_tokens": 12366541.0, + "step": 324 + }, + { + "epoch": 0.04134334054191579, + "ewc_loss": 0.0025697804521769285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2848902315454325e-06, + "grad_norm": 8.974222183227539, + "learning_rate": 1.373463331920305e-07, + "loss": 0.6604, + "mean_token_accuracy": 0.8064484000205994, + "num_tokens": 12405664.0, + "step": 325 + }, + { + "epoch": 0.0414705508205063, + "ewc_loss": 0.0025863468181341887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2931734545418294e-06, + "grad_norm": 10.767931938171387, + "learning_rate": 1.377702416278084e-07, + "loss": 0.7263, + "mean_token_accuracy": 0.7819913625717163, + "num_tokens": 12445039.0, + "step": 326 + }, + { + "epoch": 0.041597761099096804, + "ewc_loss": 0.0026791987475007772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3395994074016926e-06, + "grad_norm": 9.13834285736084, + "learning_rate": 1.3819415006358625e-07, + "loss": 0.5957, + "mean_token_accuracy": 0.8191677331924438, + "num_tokens": 12485481.0, + "step": 327 + }, + { + "epoch": 0.04172497137768732, + "ewc_loss": 0.0025818266440182924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2909133602079237e-06, + "grad_norm": 9.226635932922363, + "learning_rate": 1.3861805849936415e-07, + "loss": 0.6218, + "mean_token_accuracy": 0.8150193691253662, + "num_tokens": 12530272.0, + "step": 328 + }, + { + "epoch": 0.041852181656277826, + "ewc_loss": 
0.002584038767963648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2920194194521173e-06, + "grad_norm": 8.917266845703125, + "learning_rate": 1.39041966935142e-07, + "loss": 0.6684, + "mean_token_accuracy": 0.7976665496826172, + "num_tokens": 12565854.0, + "step": 329 + }, + { + "epoch": 0.04197939193486834, + "ewc_loss": 0.002595430938526988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2977154710824834e-06, + "grad_norm": 8.85628604888916, + "learning_rate": 1.394658753709199e-07, + "loss": 0.6534, + "mean_token_accuracy": 0.8040499687194824, + "num_tokens": 12607313.0, + "step": 330 + }, + { + "epoch": 0.04210660221345885, + "ewc_loss": 0.0026012579910457134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.300629037359613e-06, + "grad_norm": 9.447221755981445, + "learning_rate": 1.3988978380669774e-07, + "loss": 0.6913, + "mean_token_accuracy": 0.7919289469718933, + "num_tokens": 12642057.0, + "step": 331 + }, + { + "epoch": 0.042233812492049355, + "ewc_loss": 0.0026515647768974304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3257823638923583e-06, + "grad_norm": 8.980212211608887, + "learning_rate": 1.403136922424756e-07, + "loss": 0.6812, + "mean_token_accuracy": 0.7948542833328247, + "num_tokens": 12684118.0, + "step": 332 + }, + { + "epoch": 0.04236102277063987, + "ewc_loss": 0.00263052131049335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3152606470612227e-06, + "grad_norm": 9.287285804748535, + "learning_rate": 1.4073760067825348e-07, + "loss": 0.6466, + "mean_token_accuracy": 0.8033022880554199, + "num_tokens": 12721945.0, + "step": 333 + }, + { + "epoch": 0.04248823304923038, + "ewc_loss": 0.002656680066138506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3283400903674192e-06, + "grad_norm": 9.815367698669434, + "learning_rate": 1.4116150911403136e-07, + "loss": 0.66, + "mean_token_accuracy": 0.8010707497596741, + "num_tokens": 12764175.0, + "step": 334 + }, + { + "epoch": 0.04261544332782089, + "ewc_loss": 0.0027033232618123293, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.351661580883956e-06, + "grad_norm": 10.188202857971191, + "learning_rate": 1.4158541754980923e-07, + "loss": 0.6118, + "mean_token_accuracy": 0.8154395818710327, + "num_tokens": 12801600.0, + "step": 335 + }, + { + "epoch": 0.0427426536064114, + "ewc_loss": 0.0027175957802683115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3587979310614173e-06, + "grad_norm": 10.295228004455566, + "learning_rate": 1.420093259855871e-07, + "loss": 0.6427, + "mean_token_accuracy": 0.8108149170875549, + "num_tokens": 12843584.0, + "step": 336 + }, + { + "epoch": 0.042869863885001906, + "ewc_loss": 0.002717691008001566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.358845452159585e-06, + "grad_norm": 9.550485610961914, + "learning_rate": 1.4243323442136497e-07, + "loss": 0.658, + "mean_token_accuracy": 0.8007786273956299, + "num_tokens": 12878105.0, + "step": 337 + }, + { + "epoch": 0.04299707416359242, + "ewc_loss": 0.002666139742359519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3330699175639893e-06, + "grad_norm": 9.023027420043945, + "learning_rate": 1.4285714285714285e-07, + "loss": 0.6661, + "mean_token_accuracy": 0.799410879611969, + "num_tokens": 12911712.0, + "step": 338 + }, + { + "epoch": 0.04312428444218293, + "ewc_loss": 0.0026483999099582434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.324199956798111e-06, + "grad_norm": 9.228604316711426, + "learning_rate": 1.4328105129292072e-07, + "loss": 0.6456, + "mean_token_accuracy": 0.8067402839660645, + "num_tokens": 12951303.0, + "step": 339 + }, + { + "epoch": 0.043251494720773435, + "ewc_loss": 0.0026832325384020805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3416163255897118e-06, + "grad_norm": 9.47970962524414, + "learning_rate": 1.437049597286986e-07, + "loss": 0.6329, + "mean_token_accuracy": 0.8089169263839722, + "num_tokens": 12990544.0, + "step": 340 + }, + { + "epoch": 0.04337870499936395, + "ewc_loss": 0.0027199035976082087, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 1.3599518524642917e-06, + "grad_norm": 9.916214942932129, + "learning_rate": 1.4412886816447646e-07, + "loss": 0.6923, + "mean_token_accuracy": 0.7905144095420837, + "num_tokens": 13023766.0, + "step": 341 + }, + { + "epoch": 0.04350591527795446, + "ewc_loss": 0.0027479648124426603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3739823998548673e-06, + "grad_norm": 10.889143943786621, + "learning_rate": 1.4455277660025434e-07, + "loss": 0.6072, + "mean_token_accuracy": 0.8161842823028564, + "num_tokens": 13056337.0, + "step": 342 + }, + { + "epoch": 0.04363312555654497, + "ewc_loss": 0.0028051307890564203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4025654309079982e-06, + "grad_norm": 9.32048225402832, + "learning_rate": 1.449766850360322e-07, + "loss": 0.663, + "mean_token_accuracy": 0.8006805181503296, + "num_tokens": 13097345.0, + "step": 343 + }, + { + "epoch": 0.04376033583513548, + "ewc_loss": 0.0027092124801129103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3546061836677836e-06, + "grad_norm": 8.853976249694824, + "learning_rate": 1.4540059347181008e-07, + "loss": 0.6501, + "mean_token_accuracy": 0.8026846647262573, + "num_tokens": 13127332.0, + "step": 344 + }, + { + "epoch": 0.043887546113725986, + "ewc_loss": 0.002682736376300454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3413681472229655e-06, + "grad_norm": 9.050396919250488, + "learning_rate": 1.4582450190758795e-07, + "loss": 0.6783, + "mean_token_accuracy": 0.7941771149635315, + "num_tokens": 13170760.0, + "step": 345 + }, + { + "epoch": 0.0440147563923165, + "ewc_loss": 0.0027231157291680574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.361557906420785e-06, + "grad_norm": 10.083486557006836, + "learning_rate": 1.4624841034336583e-07, + "loss": 0.6485, + "mean_token_accuracy": 0.8057905435562134, + "num_tokens": 13207061.0, + "step": 346 + }, + { + "epoch": 0.04414196667090701, + "ewc_loss": 0.0028084388468414545, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.40421946071001e-06, + "grad_norm": 9.17567253112793, + "learning_rate": 1.466723187791437e-07, + "loss": 0.6147, + "mean_token_accuracy": 0.813872218132019, + "num_tokens": 13247903.0, + "step": 347 + }, + { + "epoch": 0.04426917694949752, + "ewc_loss": 0.002755178138613701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3775891147815855e-06, + "grad_norm": 9.490979194641113, + "learning_rate": 1.4709622721492157e-07, + "loss": 0.625, + "mean_token_accuracy": 0.8064330816268921, + "num_tokens": 13287193.0, + "step": 348 + }, + { + "epoch": 0.04439638722808803, + "ewc_loss": 0.0027737407945096493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3868703945263405e-06, + "grad_norm": 9.174638748168945, + "learning_rate": 1.4752013565069942e-07, + "loss": 0.6302, + "mean_token_accuracy": 0.8085142970085144, + "num_tokens": 13323112.0, + "step": 349 + }, + { + "epoch": 0.04452359750667854, + "ewc_loss": 0.0027652059216052294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3826029316987842e-06, + "grad_norm": 9.849648475646973, + "learning_rate": 1.4794404408647732e-07, + "loss": 0.6258, + "mean_token_accuracy": 0.8116105794906616, + "num_tokens": 13359653.0, + "step": 350 + }, + { + "epoch": 0.04465080778526905, + "ewc_loss": 0.0028215686324983835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4107843071542447e-06, + "grad_norm": 9.594340324401855, + "learning_rate": 1.4836795252225516e-07, + "loss": 0.5976, + "mean_token_accuracy": 0.8164453506469727, + "num_tokens": 13400571.0, + "step": 351 + }, + { + "epoch": 0.04477801806385956, + "ewc_loss": 0.002806719159707427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4033595334694837e-06, + "grad_norm": 9.453241348266602, + "learning_rate": 1.4879186095803306e-07, + "loss": 0.575, + "mean_token_accuracy": 0.8193520307540894, + "num_tokens": 13437656.0, + "step": 352 + }, + { + "epoch": 0.04490522834245007, + "ewc_loss": 0.0027988171204924583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3994085747981444e-06, + "grad_norm": 8.831299781799316, + "learning_rate": 1.492157693938109e-07, + "loss": 0.6343, + "mean_token_accuracy": 0.8067421913146973, + "num_tokens": 13474688.0, + "step": 353 + }, + { + "epoch": 0.04503243862104058, + "ewc_loss": 0.0027729887515306473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3864943184671574e-06, + "grad_norm": 8.553106307983398, + "learning_rate": 1.496396778295888e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8250693678855896, + "num_tokens": 13518256.0, + "step": 354 + }, + { + "epoch": 0.04515964889963109, + "ewc_loss": 0.0027747887652367353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3873943771613995e-06, + "grad_norm": 9.330770492553711, + "learning_rate": 1.5006358626536665e-07, + "loss": 0.6584, + "mean_token_accuracy": 0.8024502992630005, + "num_tokens": 13554315.0, + "step": 355 + }, + { + "epoch": 0.0452868591782216, + "ewc_loss": 0.00286119244992733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4305961713034776e-06, + "grad_norm": 9.340907096862793, + "learning_rate": 1.5048749470114455e-07, + "loss": 0.6809, + "mean_token_accuracy": 0.7951305508613586, + "num_tokens": 13594388.0, + "step": 356 + }, + { + "epoch": 0.04541406945681211, + "ewc_loss": 0.0028642136603593826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4321068420031224e-06, + "grad_norm": 9.013487815856934, + "learning_rate": 1.509114031369224e-07, + "loss": 0.6054, + "mean_token_accuracy": 0.8143614530563354, + "num_tokens": 13633704.0, + "step": 357 + }, + { + "epoch": 0.04554127973540262, + "ewc_loss": 0.002841515466570854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.420757712367049e-06, + "grad_norm": 9.344697952270508, + "learning_rate": 1.513353115727003e-07, + "loss": 0.6569, + "mean_token_accuracy": 0.7969862222671509, + "num_tokens": 13671183.0, + "step": 358 + }, + { + "epoch": 0.04566849001399313, + "ewc_loss": 0.0028777550905942917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.438877575310471e-06, 
+ "grad_norm": 9.531999588012695, + "learning_rate": 1.5175922000847814e-07, + "loss": 0.6102, + "mean_token_accuracy": 0.8133977651596069, + "num_tokens": 13709640.0, + "step": 359 + }, + { + "epoch": 0.04579570029258364, + "ewc_loss": 0.0028961971402168274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4480986010312336e-06, + "grad_norm": 8.99255084991455, + "learning_rate": 1.5218312844425604e-07, + "loss": 0.6318, + "mean_token_accuracy": 0.800492525100708, + "num_tokens": 13751114.0, + "step": 360 + }, + { + "epoch": 0.045922910571174154, + "ewc_loss": 0.0028650963213294744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4325481743071578e-06, + "grad_norm": 9.304298400878906, + "learning_rate": 1.526070368800339e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8221079111099243, + "num_tokens": 13786448.0, + "step": 361 + }, + { + "epoch": 0.04605012084976466, + "ewc_loss": 0.002900183666497469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4500918723570067e-06, + "grad_norm": 9.341620445251465, + "learning_rate": 1.530309453158118e-07, + "loss": 0.5861, + "mean_token_accuracy": 0.8246387839317322, + "num_tokens": 13827663.0, + "step": 362 + }, + { + "epoch": 0.04617733112835517, + "ewc_loss": 0.0029112566262483597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4556283076672116e-06, + "grad_norm": 8.88522720336914, + "learning_rate": 1.5345485375158963e-07, + "loss": 0.6646, + "mean_token_accuracy": 0.7986174821853638, + "num_tokens": 13866078.0, + "step": 363 + }, + { + "epoch": 0.04630454140694568, + "ewc_loss": 0.002888405229896307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4442026667893515e-06, + "grad_norm": 9.163836479187012, + "learning_rate": 1.5387876218736753e-07, + "loss": 0.5423, + "mean_token_accuracy": 0.8318270444869995, + "num_tokens": 13909767.0, + "step": 364 + }, + { + "epoch": 0.04643175168553619, + "ewc_loss": 0.0029218897689133883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.460944872633263e-06, + "grad_norm": 
9.553585052490234, + "learning_rate": 1.5430267062314538e-07, + "loss": 0.6512, + "mean_token_accuracy": 0.8050602674484253, + "num_tokens": 13948937.0, + "step": 365 + }, + { + "epoch": 0.0465589619641267, + "ewc_loss": 0.002951470436528325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.475735189160332e-06, + "grad_norm": 10.037973403930664, + "learning_rate": 1.5472657905892328e-07, + "loss": 0.662, + "mean_token_accuracy": 0.7977120876312256, + "num_tokens": 13984039.0, + "step": 366 + }, + { + "epoch": 0.04668617224271721, + "ewc_loss": 0.0029852844309061766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.49264224091894e-06, + "grad_norm": 11.271716117858887, + "learning_rate": 1.5515048749470113e-07, + "loss": 0.646, + "mean_token_accuracy": 0.8020370006561279, + "num_tokens": 14018162.0, + "step": 367 + }, + { + "epoch": 0.04681338252130772, + "ewc_loss": 0.0030574286356568336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5287142787201446e-06, + "grad_norm": 9.836433410644531, + "learning_rate": 1.55574395930479e-07, + "loss": 0.6451, + "mean_token_accuracy": 0.8039584159851074, + "num_tokens": 14056493.0, + "step": 368 + }, + { + "epoch": 0.046940592799898234, + "ewc_loss": 0.002950577298179269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4752886272617616e-06, + "grad_norm": 9.995136260986328, + "learning_rate": 1.5599830436625687e-07, + "loss": 0.6388, + "mean_token_accuracy": 0.8074550628662109, + "num_tokens": 14097530.0, + "step": 369 + }, + { + "epoch": 0.04706780307848874, + "ewc_loss": 0.002952344948425889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4761724287382094e-06, + "grad_norm": 9.256299018859863, + "learning_rate": 1.5642221280203474e-07, + "loss": 0.6569, + "mean_token_accuracy": 0.8054272532463074, + "num_tokens": 14136240.0, + "step": 370 + }, + { + "epoch": 0.04719501335707925, + "ewc_loss": 0.002921282546594739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4606413287765463e-06, + "grad_norm": 9.477063179016113, + 
"learning_rate": 1.5684612123781262e-07, + "loss": 0.5669, + "mean_token_accuracy": 0.8260200023651123, + "num_tokens": 14171010.0, + "step": 371 + }, + { + "epoch": 0.04732222363566976, + "ewc_loss": 0.0029603091534227133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4801545376030845e-06, + "grad_norm": 9.126946449279785, + "learning_rate": 1.572700296735905e-07, + "loss": 0.6369, + "mean_token_accuracy": 0.8114617466926575, + "num_tokens": 14206985.0, + "step": 372 + }, + { + "epoch": 0.04744943391426027, + "ewc_loss": 0.002950946567580104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4754732546862215e-06, + "grad_norm": 9.506839752197266, + "learning_rate": 1.576939381093684e-07, + "loss": 0.6373, + "mean_token_accuracy": 0.8024393916130066, + "num_tokens": 14236728.0, + "step": 373 + }, + { + "epoch": 0.047576644192850785, + "ewc_loss": 0.002987535437569022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.493767740612384e-06, + "grad_norm": 9.205388069152832, + "learning_rate": 1.5811784654514623e-07, + "loss": 0.5627, + "mean_token_accuracy": 0.8275644779205322, + "num_tokens": 14270758.0, + "step": 374 + }, + { + "epoch": 0.04770385447144129, + "ewc_loss": 0.002972671529278159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4863358046568464e-06, + "grad_norm": 9.186524391174316, + "learning_rate": 1.5854175498092413e-07, + "loss": 0.6811, + "mean_token_accuracy": 0.7906891703605652, + "num_tokens": 14315002.0, + "step": 375 + }, + { + "epoch": 0.0478310647500318, + "ewc_loss": 0.0029787584207952023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4893792013026541e-06, + "grad_norm": 9.471956253051758, + "learning_rate": 1.5896566341670198e-07, + "loss": 0.652, + "mean_token_accuracy": 0.799332857131958, + "num_tokens": 14353385.0, + "step": 376 + }, + { + "epoch": 0.047958275028622314, + "ewc_loss": 0.0030193706043064594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5096852621354628e-06, + "grad_norm": 9.78052806854248, + "learning_rate": 
1.5938957185247988e-07, + "loss": 0.684, + "mean_token_accuracy": 0.7887850403785706, + "num_tokens": 14391702.0, + "step": 377 + }, + { + "epoch": 0.04808548530721282, + "ewc_loss": 0.0030476737301796675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5238368860082119e-06, + "grad_norm": 9.44019889831543, + "learning_rate": 1.5981348028825772e-07, + "loss": 0.6521, + "mean_token_accuracy": 0.8021460771560669, + "num_tokens": 14427876.0, + "step": 378 + }, + { + "epoch": 0.048212695585803336, + "ewc_loss": 0.003017701907083392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.508850914433424e-06, + "grad_norm": 9.80068588256836, + "learning_rate": 1.6023738872403562e-07, + "loss": 0.7405, + "mean_token_accuracy": 0.7741665244102478, + "num_tokens": 14460349.0, + "step": 379 + }, + { + "epoch": 0.048339905864393844, + "ewc_loss": 0.003061995841562748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5309979062294587e-06, + "grad_norm": 10.227896690368652, + "learning_rate": 1.6066129715981347e-07, + "loss": 0.5892, + "mean_token_accuracy": 0.822334885597229, + "num_tokens": 14497763.0, + "step": 380 + }, + { + "epoch": 0.04846711614298435, + "ewc_loss": 0.0030942759476602077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5471380265807966e-06, + "grad_norm": 9.450421333312988, + "learning_rate": 1.6108520559559137e-07, + "loss": 0.5818, + "mean_token_accuracy": 0.8195147514343262, + "num_tokens": 14537172.0, + "step": 381 + }, + { + "epoch": 0.048594326421574865, + "ewc_loss": 0.0030492444057017565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5246222346831928e-06, + "grad_norm": 9.790388107299805, + "learning_rate": 1.6150911403136921e-07, + "loss": 0.6707, + "mean_token_accuracy": 0.8009375333786011, + "num_tokens": 14565515.0, + "step": 382 + }, + { + "epoch": 0.04872153670016537, + "ewc_loss": 0.0030850619077682495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5425309811689658e-06, + "grad_norm": 9.490379333496094, + "learning_rate": 
1.619330224671471e-07, + "loss": 0.6601, + "mean_token_accuracy": 0.8003735542297363, + "num_tokens": 14608104.0, + "step": 383 + }, + { + "epoch": 0.04884874697875588, + "ewc_loss": 0.003079793183133006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5398966297652805e-06, + "grad_norm": 9.629522323608398, + "learning_rate": 1.6235693090292496e-07, + "loss": 0.6662, + "mean_token_accuracy": 0.8019912838935852, + "num_tokens": 14645328.0, + "step": 384 + }, + { + "epoch": 0.048975957257346395, + "ewc_loss": 0.0030898035038262606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5449018064828124e-06, + "grad_norm": 9.84558391571045, + "learning_rate": 1.6278083933870286e-07, + "loss": 0.704, + "mean_token_accuracy": 0.7830262184143066, + "num_tokens": 14678792.0, + "step": 385 + }, + { + "epoch": 0.0491031675359369, + "ewc_loss": 0.0031185545958578587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5592772797390353e-06, + "grad_norm": 9.458402633666992, + "learning_rate": 1.632047477744807e-07, + "loss": 0.6256, + "mean_token_accuracy": 0.8097678422927856, + "num_tokens": 14715095.0, + "step": 386 + }, + { + "epoch": 0.049230377814527417, + "ewc_loss": 0.003103458322584629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5517291558353463e-06, + "grad_norm": 9.645994186401367, + "learning_rate": 1.6362865621025858e-07, + "loss": 0.6448, + "mean_token_accuracy": 0.8049489259719849, + "num_tokens": 14753641.0, + "step": 387 + }, + { + "epoch": 0.049357588093117924, + "ewc_loss": 0.0031293747015297413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5646874089725316e-06, + "grad_norm": 9.517361640930176, + "learning_rate": 1.6405256464603645e-07, + "loss": 0.6158, + "mean_token_accuracy": 0.8101399540901184, + "num_tokens": 14788816.0, + "step": 388 + }, + { + "epoch": 0.04948479837170843, + "ewc_loss": 0.003132960991933942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5664804777770769e-06, + "grad_norm": 10.030665397644043, + "learning_rate": 
1.6447647308181432e-07, + "loss": 0.6608, + "mean_token_accuracy": 0.7980300784111023, + "num_tokens": 14821607.0, + "step": 389 + }, + { + "epoch": 0.049612008650298946, + "ewc_loss": 0.0031850652303546667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5925326124488492e-06, + "grad_norm": 9.425734519958496, + "learning_rate": 1.649003815175922e-07, + "loss": 0.6774, + "mean_token_accuracy": 0.7971729040145874, + "num_tokens": 14861298.0, + "step": 390 + }, + { + "epoch": 0.04973921892888945, + "ewc_loss": 0.003144597401842475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5722987427579938e-06, + "grad_norm": 9.535057067871094, + "learning_rate": 1.6532428995337007e-07, + "loss": 0.5934, + "mean_token_accuracy": 0.8176689147949219, + "num_tokens": 14899951.0, + "step": 391 + }, + { + "epoch": 0.04986642920747997, + "ewc_loss": 0.0031620978843420744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.581048991283751e-06, + "grad_norm": 9.572754859924316, + "learning_rate": 1.6574819838914794e-07, + "loss": 0.5731, + "mean_token_accuracy": 0.8261079788208008, + "num_tokens": 14937169.0, + "step": 392 + }, + { + "epoch": 0.049993639486070475, + "ewc_loss": 0.0031880009919404984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5940005368975108e-06, + "grad_norm": 9.429615020751953, + "learning_rate": 1.661721068249258e-07, + "loss": 0.6278, + "mean_token_accuracy": 0.8085108995437622, + "num_tokens": 14975549.0, + "step": 393 + }, + { + "epoch": 0.05012084976466098, + "ewc_loss": 0.003186800517141819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5934002703943406e-06, + "grad_norm": 10.087952613830566, + "learning_rate": 1.6659601526070368e-07, + "loss": 0.6545, + "mean_token_accuracy": 0.803094744682312, + "num_tokens": 15016630.0, + "step": 394 + }, + { + "epoch": 0.0502480600432515, + "ewc_loss": 0.0032452354207634926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6226176740019582e-06, + "grad_norm": 9.509163856506348, + "learning_rate": 
1.6701992369648156e-07, + "loss": 0.6351, + "mean_token_accuracy": 0.8053081631660461, + "num_tokens": 15057822.0, + "step": 395 + }, + { + "epoch": 0.050375270321842004, + "ewc_loss": 0.0031910045072436333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5955022263369756e-06, + "grad_norm": 10.389147758483887, + "learning_rate": 1.6744383213225943e-07, + "loss": 0.6096, + "mean_token_accuracy": 0.8080530166625977, + "num_tokens": 15094385.0, + "step": 396 + }, + { + "epoch": 0.05050248060043251, + "ewc_loss": 0.003271596971899271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6357985259674024e-06, + "grad_norm": 9.655713081359863, + "learning_rate": 1.678677405680373e-07, + "loss": 0.5688, + "mean_token_accuracy": 0.822006344795227, + "num_tokens": 15130969.0, + "step": 397 + }, + { + "epoch": 0.050629690879023026, + "ewc_loss": 0.0032203546725213528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6101773780974327e-06, + "grad_norm": 10.170416831970215, + "learning_rate": 1.6829164900381518e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8149724006652832, + "num_tokens": 15166586.0, + "step": 398 + }, + { + "epoch": 0.05075690115761353, + "ewc_loss": 0.0032577328383922577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6288664710373268e-06, + "grad_norm": 9.848089218139648, + "learning_rate": 1.6871555743959305e-07, + "loss": 0.5795, + "mean_token_accuracy": 0.8182505369186401, + "num_tokens": 15209603.0, + "step": 399 + }, + { + "epoch": 0.05088411143620405, + "ewc_loss": 0.0032434798777103424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6217398979279096e-06, + "grad_norm": 9.709707260131836, + "learning_rate": 1.6913946587537092e-07, + "loss": 0.597, + "mean_token_accuracy": 0.8154892921447754, + "num_tokens": 15251542.0, + "step": 400 + }, + { + "epoch": 0.051011321714794555, + "ewc_loss": 0.003245072439312935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6225362742261495e-06, + "grad_norm": 9.692500114440918, + "learning_rate": 
1.695633743111488e-07, + "loss": 0.6117, + "mean_token_accuracy": 0.8138018846511841, + "num_tokens": 15288940.0, + "step": 401 + }, + { + "epoch": 0.05113853199338506, + "ewc_loss": 0.003255405928939581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6277029999400838e-06, + "grad_norm": 10.17699909210205, + "learning_rate": 1.6998728274692667e-07, + "loss": 0.5786, + "mean_token_accuracy": 0.8253940343856812, + "num_tokens": 15321965.0, + "step": 402 + }, + { + "epoch": 0.05126574227197558, + "ewc_loss": 0.0032989613246172667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6494806232003612e-06, + "grad_norm": 9.944330215454102, + "learning_rate": 1.7041119118270454e-07, + "loss": 0.5857, + "mean_token_accuracy": 0.8209705948829651, + "num_tokens": 15362895.0, + "step": 403 + }, + { + "epoch": 0.051392952550566084, + "ewc_loss": 0.0032715292181819677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6357646472897613e-06, + "grad_norm": 9.414359092712402, + "learning_rate": 1.7083509961848238e-07, + "loss": 0.6478, + "mean_token_accuracy": 0.8008999824523926, + "num_tokens": 15396675.0, + "step": 404 + }, + { + "epoch": 0.0515201628291566, + "ewc_loss": 0.0032304194755852222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61520972596918e-06, + "grad_norm": 9.358126640319824, + "learning_rate": 1.7125900805426028e-07, + "loss": 0.6669, + "mean_token_accuracy": 0.7969921827316284, + "num_tokens": 15443070.0, + "step": 405 + }, + { + "epoch": 0.051647373107747106, + "ewc_loss": 0.0032562820706516504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6281410353258252e-06, + "grad_norm": 9.570756912231445, + "learning_rate": 1.7168291649003813e-07, + "loss": 0.595, + "mean_token_accuracy": 0.8156142234802246, + "num_tokens": 15476747.0, + "step": 406 + }, + { + "epoch": 0.051774583386337614, + "ewc_loss": 0.003297070972621441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6485355445183814e-06, + "grad_norm": 9.57222843170166, + "learning_rate": 
1.7210682492581603e-07, + "loss": 0.6191, + "mean_token_accuracy": 0.8109716176986694, + "num_tokens": 15514866.0, + "step": 407 + }, + { + "epoch": 0.05190179366492813, + "ewc_loss": 0.0033038200344890356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6519099972356344e-06, + "grad_norm": 9.259293556213379, + "learning_rate": 1.7253073336159387e-07, + "loss": 0.5954, + "mean_token_accuracy": 0.8151727914810181, + "num_tokens": 15554245.0, + "step": 408 + }, + { + "epoch": 0.052029003943518635, + "ewc_loss": 0.0032937678042799234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6468839021399617e-06, + "grad_norm": 10.180353164672852, + "learning_rate": 1.7295464179737177e-07, + "loss": 0.6237, + "mean_token_accuracy": 0.810813307762146, + "num_tokens": 15593958.0, + "step": 409 + }, + { + "epoch": 0.05215621422210915, + "ewc_loss": 0.0033773004543036222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.688650172582129e-06, + "grad_norm": 9.775959014892578, + "learning_rate": 1.7337855023314962e-07, + "loss": 0.5695, + "mean_token_accuracy": 0.8248869180679321, + "num_tokens": 15632567.0, + "step": 410 + }, + { + "epoch": 0.05228342450069966, + "ewc_loss": 0.003334518289193511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.667259198256943e-06, + "grad_norm": 9.436399459838867, + "learning_rate": 1.7380245866892752e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8212052583694458, + "num_tokens": 15668255.0, + "step": 411 + }, + { + "epoch": 0.052410634779290165, + "ewc_loss": 0.0033148203510791063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6574101664446061e-06, + "grad_norm": 10.954446792602539, + "learning_rate": 1.7422636710470536e-07, + "loss": 0.6337, + "mean_token_accuracy": 0.8050768375396729, + "num_tokens": 15705435.0, + "step": 412 + }, + { + "epoch": 0.05253784505788068, + "ewc_loss": 0.003455674508586526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.727837229736906e-06, + "grad_norm": 10.164175987243652, + "learning_rate": 
1.7465027554048326e-07, + "loss": 0.6537, + "mean_token_accuracy": 0.8005737066268921, + "num_tokens": 15744657.0, + "step": 413 + }, + { + "epoch": 0.05266505533647119, + "ewc_loss": 0.003365025157108903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6825125612740521e-06, + "grad_norm": 10.171884536743164, + "learning_rate": 1.750741839762611e-07, + "loss": 0.595, + "mean_token_accuracy": 0.8185167908668518, + "num_tokens": 15777374.0, + "step": 414 + }, + { + "epoch": 0.052792265615061694, + "ewc_loss": 0.0033603694755584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6801847095848643e-06, + "grad_norm": 9.589461326599121, + "learning_rate": 1.75498092412039e-07, + "loss": 0.5674, + "mean_token_accuracy": 0.8258081674575806, + "num_tokens": 15814047.0, + "step": 415 + }, + { + "epoch": 0.05291947589365221, + "ewc_loss": 0.0033284013625234365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6642006812617183e-06, + "grad_norm": 9.442021369934082, + "learning_rate": 1.7592200084781686e-07, + "loss": 0.651, + "mean_token_accuracy": 0.7994555234909058, + "num_tokens": 15849110.0, + "step": 416 + }, + { + "epoch": 0.053046686172242716, + "ewc_loss": 0.00334188062697649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6709402643755311e-06, + "grad_norm": 9.771879196166992, + "learning_rate": 1.7634590928359475e-07, + "loss": 0.62, + "mean_token_accuracy": 0.8128324151039124, + "num_tokens": 15888910.0, + "step": 417 + }, + { + "epoch": 0.05317389645083323, + "ewc_loss": 0.0033780636731535196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6890318192963605e-06, + "grad_norm": 9.696864128112793, + "learning_rate": 1.767698177193726e-07, + "loss": 0.6069, + "mean_token_accuracy": 0.8132838010787964, + "num_tokens": 15925604.0, + "step": 418 + }, + { + "epoch": 0.05330110672942374, + "ewc_loss": 0.00338410260155797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6920513417062466e-06, + "grad_norm": 9.400003433227539, + "learning_rate": 1.771937261551505e-07, + 
"loss": 0.58, + "mean_token_accuracy": 0.8270120024681091, + "num_tokens": 15961401.0, + "step": 419 + }, + { + "epoch": 0.053428317008014245, + "ewc_loss": 0.003360397880896926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6801989204395795e-06, + "grad_norm": 9.782746315002441, + "learning_rate": 1.7761763459092835e-07, + "loss": 0.6106, + "mean_token_accuracy": 0.8128219842910767, + "num_tokens": 16001029.0, + "step": 420 + }, + { + "epoch": 0.05355552728660476, + "ewc_loss": 0.0034145559184253216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7072779883164912e-06, + "grad_norm": 9.668516159057617, + "learning_rate": 1.7804154302670624e-07, + "loss": 0.5923, + "mean_token_accuracy": 0.818305492401123, + "num_tokens": 16036014.0, + "step": 421 + }, + { + "epoch": 0.05368273756519527, + "ewc_loss": 0.0034048680681735277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7024340195348486e-06, + "grad_norm": 9.740029335021973, + "learning_rate": 1.784654514624841e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8208136558532715, + "num_tokens": 16075373.0, + "step": 422 + }, + { + "epoch": 0.05380994784378578, + "ewc_loss": 0.0034098697360605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7049349025910487e-06, + "grad_norm": 9.594090461730957, + "learning_rate": 1.7888935989826196e-07, + "loss": 0.6156, + "mean_token_accuracy": 0.8070545196533203, + "num_tokens": 16111352.0, + "step": 423 + }, + { + "epoch": 0.05393715812237629, + "ewc_loss": 0.0034061670303344727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7030835124387522e-06, + "grad_norm": 9.745980262756348, + "learning_rate": 1.7931326833403984e-07, + "loss": 0.6324, + "mean_token_accuracy": 0.8045213222503662, + "num_tokens": 16148826.0, + "step": 424 + }, + { + "epoch": 0.054064368400966796, + "ewc_loss": 0.003439516294747591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.719758188301057e-06, + "grad_norm": 9.388895988464355, + "learning_rate": 1.797371767698177e-07, + "loss": 0.5363, + 
"mean_token_accuracy": 0.8353404402732849, + "num_tokens": 16193491.0, + "step": 425 + }, + { + "epoch": 0.05419157867955731, + "ewc_loss": 0.0034101149067282677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7050574570021126e-06, + "grad_norm": 10.050385475158691, + "learning_rate": 1.8016108520559558e-07, + "loss": 0.5897, + "mean_token_accuracy": 0.8192829489707947, + "num_tokens": 16235595.0, + "step": 426 + }, + { + "epoch": 0.05431878895814782, + "ewc_loss": 0.003487576497718692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7437882888771128e-06, + "grad_norm": 10.220744132995605, + "learning_rate": 1.8058499364137345e-07, + "loss": 0.6444, + "mean_token_accuracy": 0.8084006309509277, + "num_tokens": 16274105.0, + "step": 427 + }, + { + "epoch": 0.054445999236738325, + "ewc_loss": 0.0034951078705489635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7475539380029659e-06, + "grad_norm": 10.333312034606934, + "learning_rate": 1.8100890207715133e-07, + "loss": 0.6877, + "mean_token_accuracy": 0.7862816452980042, + "num_tokens": 16315886.0, + "step": 428 + }, + { + "epoch": 0.05457320951532884, + "ewc_loss": 0.00350983371026814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7549168660480063e-06, + "grad_norm": 9.88739013671875, + "learning_rate": 1.814328105129292e-07, + "loss": 0.6285, + "mean_token_accuracy": 0.8068214058876038, + "num_tokens": 16353021.0, + "step": 429 + }, + { + "epoch": 0.05470041979391935, + "ewc_loss": 0.003471106756478548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7355533827867475e-06, + "grad_norm": 10.360724449157715, + "learning_rate": 1.8185671894870707e-07, + "loss": 0.6033, + "mean_token_accuracy": 0.8116248846054077, + "num_tokens": 16380887.0, + "step": 430 + }, + { + "epoch": 0.05482763007250986, + "ewc_loss": 0.0035230328794568777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7615163869777462e-06, + "grad_norm": 9.705154418945312, + "learning_rate": 1.8228062738448494e-07, + "loss": 0.6596, + 
"mean_token_accuracy": 0.799491822719574, + "num_tokens": 16421066.0, + "step": 431 + }, + { + "epoch": 0.05495484035110037, + "ewc_loss": 0.003478393889963627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7391969322488876e-06, + "grad_norm": 10.448175430297852, + "learning_rate": 1.8270453582026282e-07, + "loss": 0.6289, + "mean_token_accuracy": 0.8068453669548035, + "num_tokens": 16457197.0, + "step": 432 + }, + { + "epoch": 0.055082050629690876, + "ewc_loss": 0.0035628885962069035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7814443253882928e-06, + "grad_norm": 9.80079460144043, + "learning_rate": 1.831284442560407e-07, + "loss": 0.5814, + "mean_token_accuracy": 0.8200396299362183, + "num_tokens": 16497385.0, + "step": 433 + }, + { + "epoch": 0.05520926090828139, + "ewc_loss": 0.003499742364510894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7498712168162456e-06, + "grad_norm": 9.79856014251709, + "learning_rate": 1.8355235269181856e-07, + "loss": 0.5969, + "mean_token_accuracy": 0.8172187209129333, + "num_tokens": 16540099.0, + "step": 434 + }, + { + "epoch": 0.0553364711868719, + "ewc_loss": 0.003512837691232562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7564188965479843e-06, + "grad_norm": 9.956624031066895, + "learning_rate": 1.8397626112759643e-07, + "loss": 0.5958, + "mean_token_accuracy": 0.8170946836471558, + "num_tokens": 16572760.0, + "step": 435 + }, + { + "epoch": 0.05546368146546241, + "ewc_loss": 0.00354243372566998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7712168300931808e-06, + "grad_norm": 9.905064582824707, + "learning_rate": 1.844001695633743e-07, + "loss": 0.5545, + "mean_token_accuracy": 0.8275076150894165, + "num_tokens": 16608549.0, + "step": 436 + }, + { + "epoch": 0.05559089174405292, + "ewc_loss": 0.003539174562320113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7695872429612791e-06, + "grad_norm": 9.850946426391602, + "learning_rate": 1.8482407799915218e-07, + "loss": 0.5935, + "mean_token_accuracy": 
0.8181353211402893, + "num_tokens": 16643800.0, + "step": 437 + }, + { + "epoch": 0.05571810202264343, + "ewc_loss": 0.0035463536623865366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7731767911755014e-06, + "grad_norm": 10.262828826904297, + "learning_rate": 1.8524798643493005e-07, + "loss": 0.6654, + "mean_token_accuracy": 0.7942959070205688, + "num_tokens": 16677948.0, + "step": 438 + }, + { + "epoch": 0.05584531230123394, + "ewc_loss": 0.003599170595407486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799585334083531e-06, + "grad_norm": 9.630355834960938, + "learning_rate": 1.8567189487070792e-07, + "loss": 0.6363, + "mean_token_accuracy": 0.8059969544410706, + "num_tokens": 16716905.0, + "step": 439 + }, + { + "epoch": 0.05597252257982445, + "ewc_loss": 0.003558526514098048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7792632434066036e-06, + "grad_norm": 10.085576057434082, + "learning_rate": 1.8609580330648577e-07, + "loss": 0.5482, + "mean_token_accuracy": 0.8304485082626343, + "num_tokens": 16756158.0, + "step": 440 + }, + { + "epoch": 0.05609973285841496, + "ewc_loss": 0.003605233272537589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8026166799245402e-06, + "grad_norm": 10.26605224609375, + "learning_rate": 1.8651971174226367e-07, + "loss": 0.64, + "mean_token_accuracy": 0.8063358068466187, + "num_tokens": 16794261.0, + "step": 441 + }, + { + "epoch": 0.05622694313700547, + "ewc_loss": 0.003622298128902912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.811149104469223e-06, + "grad_norm": 9.66153335571289, + "learning_rate": 1.8694362017804152e-07, + "loss": 0.5966, + "mean_token_accuracy": 0.8151970505714417, + "num_tokens": 16834953.0, + "step": 442 + }, + { + "epoch": 0.05635415341559598, + "ewc_loss": 0.003572865854948759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7864329038275173e-06, + "grad_norm": 9.934182167053223, + "learning_rate": 1.8736752861381941e-07, + "loss": 0.5549, + "mean_token_accuracy": 0.829565167427063, + 
"num_tokens": 16873954.0, + "step": 443 + }, + { + "epoch": 0.05648136369418649, + "ewc_loss": 0.0036276851315051317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8138425730285235e-06, + "grad_norm": 11.153541564941406, + "learning_rate": 1.8779143704959726e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8183349370956421, + "num_tokens": 16906018.0, + "step": 444 + }, + { + "epoch": 0.056608573972777, + "ewc_loss": 0.0037431323435157537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8715661553869722e-06, + "grad_norm": 9.896824836730957, + "learning_rate": 1.8821534548537516e-07, + "loss": 0.6344, + "mean_token_accuracy": 0.8063291311264038, + "num_tokens": 16938945.0, + "step": 445 + }, + { + "epoch": 0.05673578425136751, + "ewc_loss": 0.003588925814256072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7944629462363082e-06, + "grad_norm": 9.578784942626953, + "learning_rate": 1.88639253921153e-07, + "loss": 0.5903, + "mean_token_accuracy": 0.8208186626434326, + "num_tokens": 16982214.0, + "step": 446 + }, + { + "epoch": 0.05686299452995802, + "ewc_loss": 0.0035930185113102198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7965093093152973e-06, + "grad_norm": 9.913663864135742, + "learning_rate": 1.890631623569309e-07, + "loss": 0.5979, + "mean_token_accuracy": 0.8143082857131958, + "num_tokens": 17024367.0, + "step": 447 + }, + { + "epoch": 0.05699020480854853, + "ewc_loss": 0.0036693348083645105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.834667386901856e-06, + "grad_norm": 10.634071350097656, + "learning_rate": 1.8948707079270875e-07, + "loss": 0.6228, + "mean_token_accuracy": 0.8040542006492615, + "num_tokens": 17057451.0, + "step": 448 + }, + { + "epoch": 0.057117415087139044, + "ewc_loss": 0.003736465936526656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8682329709918122e-06, + "grad_norm": 10.36082649230957, + "learning_rate": 1.8991097922848665e-07, + "loss": 0.5732, + "mean_token_accuracy": 0.8218260407447815, + "num_tokens": 
17094633.0, + "step": 449 + }, + { + "epoch": 0.05724462536572955, + "ewc_loss": 0.0036806061398237944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8403030708213919e-06, + "grad_norm": 10.85236644744873, + "learning_rate": 1.903348876642645e-07, + "loss": 0.6274, + "mean_token_accuracy": 0.8061672449111938, + "num_tokens": 17135892.0, + "step": 450 + }, + { + "epoch": 0.05737183564432006, + "ewc_loss": 0.0037121789064258337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.85608939773374e-06, + "grad_norm": 10.655503273010254, + "learning_rate": 1.907587961000424e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.8236808776855469, + "num_tokens": 17173881.0, + "step": 451 + }, + { + "epoch": 0.05749904592291057, + "ewc_loss": 0.003692411817610264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8462059188095736e-06, + "grad_norm": 9.94164752960205, + "learning_rate": 1.9118270453582024e-07, + "loss": 0.6196, + "mean_token_accuracy": 0.8104858994483948, + "num_tokens": 17215668.0, + "step": 452 + }, + { + "epoch": 0.05762625620150108, + "ewc_loss": 0.003633935237303376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8169675968238153e-06, + "grad_norm": 9.709959983825684, + "learning_rate": 1.9160661297159814e-07, + "loss": 0.6449, + "mean_token_accuracy": 0.8031895160675049, + "num_tokens": 17254339.0, + "step": 453 + }, + { + "epoch": 0.057753466480091595, + "ewc_loss": 0.0036643394269049168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.832169687077112e-06, + "grad_norm": 10.107816696166992, + "learning_rate": 1.9203052140737599e-07, + "loss": 0.6273, + "mean_token_accuracy": 0.8042759299278259, + "num_tokens": 17288858.0, + "step": 454 + }, + { + "epoch": 0.0578806767586821, + "ewc_loss": 0.0037252488546073437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8626244582264917e-06, + "grad_norm": 9.876338958740234, + "learning_rate": 1.9245442984315389e-07, + "loss": 0.6033, + "mean_token_accuracy": 0.8158303499221802, + "num_tokens": 17330601.0, + 
"step": 455 + }, + { + "epoch": 0.05800788703727261, + "ewc_loss": 0.003705499693751335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8527498468756676e-06, + "grad_norm": 10.289810180664062, + "learning_rate": 1.9287833827893173e-07, + "loss": 0.6225, + "mean_token_accuracy": 0.8077617883682251, + "num_tokens": 17367797.0, + "step": 456 + }, + { + "epoch": 0.058135097315863124, + "ewc_loss": 0.003760234685614705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.880117338259879e-06, + "grad_norm": 10.071045875549316, + "learning_rate": 1.9330224671470963e-07, + "loss": 0.6097, + "mean_token_accuracy": 0.8119773268699646, + "num_tokens": 17403206.0, + "step": 457 + }, + { + "epoch": 0.05826230759445363, + "ewc_loss": 0.0037398356944322586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8699178099268465e-06, + "grad_norm": 9.996992111206055, + "learning_rate": 1.9372615515048748e-07, + "loss": 0.6559, + "mean_token_accuracy": 0.8027868270874023, + "num_tokens": 17437207.0, + "step": 458 + }, + { + "epoch": 0.05838951787304414, + "ewc_loss": 0.003736992599442601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8684962697079754e-06, + "grad_norm": 10.002583503723145, + "learning_rate": 1.9415006358626535e-07, + "loss": 0.5918, + "mean_token_accuracy": 0.8200641870498657, + "num_tokens": 17473335.0, + "step": 459 + }, + { + "epoch": 0.05851672815163465, + "ewc_loss": 0.003763178363442421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.881589128061023e-06, + "grad_norm": 10.328213691711426, + "learning_rate": 1.9457397202204322e-07, + "loss": 0.5809, + "mean_token_accuracy": 0.8168801069259644, + "num_tokens": 17505104.0, + "step": 460 + }, + { + "epoch": 0.05864393843022516, + "ewc_loss": 0.003800540929660201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9002704902959522e-06, + "grad_norm": 10.192492485046387, + "learning_rate": 1.949978804578211e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.826923131942749, + "num_tokens": 17539052.0, + "step": 461 + }, 
+ { + "epoch": 0.058771148708815675, + "ewc_loss": 0.003783134277909994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8915670807473361e-06, + "grad_norm": 9.966232299804688, + "learning_rate": 1.9542178889359897e-07, + "loss": 0.6381, + "mean_token_accuracy": 0.8028356432914734, + "num_tokens": 17578942.0, + "step": 462 + }, + { + "epoch": 0.05889835898740618, + "ewc_loss": 0.003771434538066387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8857173245123704e-06, + "grad_norm": 9.902146339416504, + "learning_rate": 1.9584569732937684e-07, + "loss": 0.6174, + "mean_token_accuracy": 0.8124775290489197, + "num_tokens": 17611931.0, + "step": 463 + }, + { + "epoch": 0.05902556926599669, + "ewc_loss": 0.0037943467032164335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8971733197759022e-06, + "grad_norm": 10.0015287399292, + "learning_rate": 1.962696057651547e-07, + "loss": 0.5213, + "mean_token_accuracy": 0.8385884761810303, + "num_tokens": 17648764.0, + "step": 464 + }, + { + "epoch": 0.059152779544587204, + "ewc_loss": 0.0038158628158271313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907931391542661e-06, + "grad_norm": 10.027978897094727, + "learning_rate": 1.9669351420093258e-07, + "loss": 0.6329, + "mean_token_accuracy": 0.8060111403465271, + "num_tokens": 17683099.0, + "step": 465 + }, + { + "epoch": 0.05927998982317771, + "ewc_loss": 0.003822182770818472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911091430883971e-06, + "grad_norm": 10.350601196289062, + "learning_rate": 1.9711742263671046e-07, + "loss": 0.6173, + "mean_token_accuracy": 0.8140720129013062, + "num_tokens": 17727120.0, + "step": 466 + }, + { + "epoch": 0.059407200101768226, + "ewc_loss": 0.0038497692439705133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924884600157384e-06, + "grad_norm": 10.730484008789062, + "learning_rate": 1.9754133107248833e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8152846693992615, + "num_tokens": 17761676.0, + "step": 467 + }, + { + "epoch": 
0.059534410380358734, + "ewc_loss": 0.0038872358854860067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9436179172771517e-06, + "grad_norm": 9.909440994262695, + "learning_rate": 1.979652395082662e-07, + "loss": 0.639, + "mean_token_accuracy": 0.8056577444076538, + "num_tokens": 17802311.0, + "step": 468 + }, + { + "epoch": 0.05966162065894924, + "ewc_loss": 0.003793510375544429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.896755179586762e-06, + "grad_norm": 9.71249008178711, + "learning_rate": 1.9838914794404408e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.8324045538902283, + "num_tokens": 17839048.0, + "step": 469 + }, + { + "epoch": 0.059788830937539755, + "ewc_loss": 0.0038127978332340717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9063988929701736e-06, + "grad_norm": 10.51980972290039, + "learning_rate": 1.9881305637982195e-07, + "loss": 0.6029, + "mean_token_accuracy": 0.8147479295730591, + "num_tokens": 17881510.0, + "step": 470 + }, + { + "epoch": 0.05991604121613026, + "ewc_loss": 0.0039029449690133333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.951472540895338e-06, + "grad_norm": 10.411799430847168, + "learning_rate": 1.9923696481559982e-07, + "loss": 0.6299, + "mean_token_accuracy": 0.8083089590072632, + "num_tokens": 17915080.0, + "step": 471 + }, + { + "epoch": 0.06004325149472077, + "ewc_loss": 0.0038593984209001064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9296992377348943e-06, + "grad_norm": 10.511574745178223, + "learning_rate": 1.996608732513777e-07, + "loss": 0.6616, + "mean_token_accuracy": 0.798630952835083, + "num_tokens": 17957972.0, + "step": 472 + }, + { + "epoch": 0.060170461773311285, + "ewc_loss": 0.0038692287635058165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9346143744769506e-06, + "grad_norm": 10.26167106628418, + "learning_rate": 2.0008478168715557e-07, + "loss": 0.5306, + "mean_token_accuracy": 0.832260012626648, + "num_tokens": 17998008.0, + "step": 473 + }, + { + "epoch": 
0.06029767205190179, + "ewc_loss": 0.0038641132414340973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9320566480018897e-06, + "grad_norm": 10.004024505615234, + "learning_rate": 2.0050869012293344e-07, + "loss": 0.5398, + "mean_token_accuracy": 0.8297863602638245, + "num_tokens": 18032427.0, + "step": 474 + }, + { + "epoch": 0.060424882330492306, + "ewc_loss": 0.0038420811761170626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9210406208003405e-06, + "grad_norm": 9.861370086669922, + "learning_rate": 2.009325985587113e-07, + "loss": 0.57, + "mean_token_accuracy": 0.8220171928405762, + "num_tokens": 18069848.0, + "step": 475 + }, + { + "epoch": 0.060552092609082814, + "ewc_loss": 0.0038636261597275734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.93181313079549e-06, + "grad_norm": 10.4065580368042, + "learning_rate": 2.0135650699448918e-07, + "loss": 0.6639, + "mean_token_accuracy": 0.7963698506355286, + "num_tokens": 18109240.0, + "step": 476 + }, + { + "epoch": 0.06067930288767332, + "ewc_loss": 0.003922402858734131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.961201405720203e-06, + "grad_norm": 10.151372909545898, + "learning_rate": 2.0178041543026706e-07, + "loss": 0.6347, + "mean_token_accuracy": 0.8070968389511108, + "num_tokens": 18146849.0, + "step": 477 + }, + { + "epoch": 0.060806513166263836, + "ewc_loss": 0.003892370034009218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946184966072906e-06, + "grad_norm": 10.608009338378906, + "learning_rate": 2.022043238660449e-07, + "loss": 0.6652, + "mean_token_accuracy": 0.7950747013092041, + "num_tokens": 18186872.0, + "step": 478 + }, + { + "epoch": 0.06093372344485434, + "ewc_loss": 0.003939339891076088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9696699382620864e-06, + "grad_norm": 10.192344665527344, + "learning_rate": 2.026282323018228e-07, + "loss": 0.6403, + "mean_token_accuracy": 0.8050229549407959, + "num_tokens": 18222299.0, + "step": 479 + }, + { + "epoch": 
0.06106093372344486, + "ewc_loss": 0.0038937614299356937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946880729519762e-06, + "grad_norm": 9.898176193237305, + "learning_rate": 2.0305214073760065e-07, + "loss": 0.6442, + "mean_token_accuracy": 0.8053147196769714, + "num_tokens": 18261645.0, + "step": 480 + }, + { + "epoch": 0.061188144002035365, + "ewc_loss": 0.003884752979502082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9423764570092317e-06, + "grad_norm": 10.397735595703125, + "learning_rate": 2.0347604917337855e-07, + "loss": 0.6152, + "mean_token_accuracy": 0.8108950853347778, + "num_tokens": 18292970.0, + "step": 481 + }, + { + "epoch": 0.06131535428062587, + "ewc_loss": 0.003964256960898638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9821284240606474e-06, + "grad_norm": 9.923152923583984, + "learning_rate": 2.038999576091564e-07, + "loss": 0.5606, + "mean_token_accuracy": 0.8281404376029968, + "num_tokens": 18331133.0, + "step": 482 + }, + { + "epoch": 0.06144256455921639, + "ewc_loss": 0.003916861955076456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.958431084858603e-06, + "grad_norm": 10.13102912902832, + "learning_rate": 2.043238660449343e-07, + "loss": 0.6479, + "mean_token_accuracy": 0.8014274835586548, + "num_tokens": 18368905.0, + "step": 483 + }, + { + "epoch": 0.061569774837806894, + "ewc_loss": 0.0039619519375264645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9809758668998256e-06, + "grad_norm": 10.47880744934082, + "learning_rate": 2.0474777448071214e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.823602557182312, + "num_tokens": 18406362.0, + "step": 484 + }, + { + "epoch": 0.0616969851163974, + "ewc_loss": 0.0039950343780219555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9975173017883208e-06, + "grad_norm": 10.147377014160156, + "learning_rate": 2.0517168291649004e-07, + "loss": 0.6113, + "mean_token_accuracy": 0.8125574588775635, + "num_tokens": 18439246.0, + "step": 485 + }, + { + "epoch": 
0.061824195394987916, + "ewc_loss": 0.003952144645154476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.976072326215217e-06, + "grad_norm": 10.348376274108887, + "learning_rate": 2.0559559135226788e-07, + "loss": 0.613, + "mean_token_accuracy": 0.8094357252120972, + "num_tokens": 18473408.0, + "step": 486 + }, + { + "epoch": 0.06195140567357842, + "ewc_loss": 0.004005567170679569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0027835034852615e-06, + "grad_norm": 10.278876304626465, + "learning_rate": 2.0601949978804578e-07, + "loss": 0.567, + "mean_token_accuracy": 0.8225557804107666, + "num_tokens": 18513246.0, + "step": 487 + }, + { + "epoch": 0.06207861595216894, + "ewc_loss": 0.003995594568550587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9977971987827914e-06, + "grad_norm": 10.270240783691406, + "learning_rate": 2.0644340822382363e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8316394090652466, + "num_tokens": 18549125.0, + "step": 488 + }, + { + "epoch": 0.062205826230759445, + "ewc_loss": 0.004015563987195492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0077820863662055e-06, + "grad_norm": 10.128934860229492, + "learning_rate": 2.0686731665960153e-07, + "loss": 0.5886, + "mean_token_accuracy": 0.8230257034301758, + "num_tokens": 18585306.0, + "step": 489 + }, + { + "epoch": 0.06233303650934995, + "ewc_loss": 0.00401571299880743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0078564375580754e-06, + "grad_norm": 10.15388298034668, + "learning_rate": 2.0729122509537937e-07, + "loss": 0.5492, + "mean_token_accuracy": 0.8319924473762512, + "num_tokens": 18624378.0, + "step": 490 + }, + { + "epoch": 0.06246024678794047, + "ewc_loss": 0.004035842139273882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.01792113330157e-06, + "grad_norm": 10.473198890686035, + "learning_rate": 2.0771513353115727e-07, + "loss": 0.5933, + "mean_token_accuracy": 0.8157156705856323, + "num_tokens": 18660814.0, + "step": 491 + }, + { + "epoch": 
0.06258745706653097, + "ewc_loss": 0.00409402372315526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.047011776085128e-06, + "grad_norm": 10.416622161865234, + "learning_rate": 2.0813904196693512e-07, + "loss": 0.5595, + "mean_token_accuracy": 0.8276522159576416, + "num_tokens": 18702129.0, + "step": 492 + }, + { + "epoch": 0.06271466734512149, + "ewc_loss": 0.004066607914865017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0333038719400065e-06, + "grad_norm": 11.393237113952637, + "learning_rate": 2.0856295040271302e-07, + "loss": 0.6346, + "mean_token_accuracy": 0.8023982048034668, + "num_tokens": 18737496.0, + "step": 493 + }, + { + "epoch": 0.06284187762371199, + "ewc_loss": 0.0041603888384997845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.080194462905638e-06, + "grad_norm": 10.091046333312988, + "learning_rate": 2.0898685883849086e-07, + "loss": 0.588, + "mean_token_accuracy": 0.8208664059638977, + "num_tokens": 18779332.0, + "step": 494 + }, + { + "epoch": 0.0629690879023025, + "ewc_loss": 0.004022523760795593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0112618130951887e-06, + "grad_norm": 9.946779251098633, + "learning_rate": 2.0941076727426874e-07, + "loss": 0.5738, + "mean_token_accuracy": 0.8230801820755005, + "num_tokens": 18817976.0, + "step": 495 + }, + { + "epoch": 0.06309629818089302, + "ewc_loss": 0.004060742445290089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0303712062741397e-06, + "grad_norm": 10.071612358093262, + "learning_rate": 2.098346757100466e-07, + "loss": 0.5649, + "mean_token_accuracy": 0.8234211206436157, + "num_tokens": 18862860.0, + "step": 496 + }, + { + "epoch": 0.06322350845948353, + "ewc_loss": 0.004108810797333717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.054405513263191e-06, + "grad_norm": 10.951448440551758, + "learning_rate": 2.1025858414582448e-07, + "loss": 0.608, + "mean_token_accuracy": 0.8116636872291565, + "num_tokens": 18899025.0, + "step": 497 + }, + { + "epoch": 0.06335071873807403, 
+ "ewc_loss": 0.004181507043540478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090753469019546e-06, + "grad_norm": 10.559764862060547, + "learning_rate": 2.1068249258160238e-07, + "loss": 0.5739, + "mean_token_accuracy": 0.8158023357391357, + "num_tokens": 18933118.0, + "step": 498 + }, + { + "epoch": 0.06347792901666455, + "ewc_loss": 0.004097167402505875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.048583610303467e-06, + "grad_norm": 10.140705108642578, + "learning_rate": 2.1110640101738023e-07, + "loss": 0.5745, + "mean_token_accuracy": 0.8246784210205078, + "num_tokens": 18969165.0, + "step": 499 + }, + { + "epoch": 0.06360513929525506, + "ewc_loss": 0.004084211308509111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0421057342900895e-06, + "grad_norm": 10.252320289611816, + "learning_rate": 2.1153030945315813e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8189486265182495, + "num_tokens": 19003882.0, + "step": 500 + }, + { + "epoch": 0.06373234957384556, + "ewc_loss": 0.004129204899072647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0646025404857937e-06, + "grad_norm": 10.351419448852539, + "learning_rate": 2.1195421788893597e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8213167190551758, + "num_tokens": 19037540.0, + "step": 501 + }, + { + "epoch": 0.06385955985243608, + "ewc_loss": 0.004139534197747707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0697671061498113e-06, + "grad_norm": 10.051734924316406, + "learning_rate": 2.1237812632471387e-07, + "loss": 0.5926, + "mean_token_accuracy": 0.8122530579566956, + "num_tokens": 19077270.0, + "step": 502 + }, + { + "epoch": 0.06398677013102659, + "ewc_loss": 0.004117329604923725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.058664904325269e-06, + "grad_norm": 10.477646827697754, + "learning_rate": 2.1280203476049172e-07, + "loss": 0.6813, + "mean_token_accuracy": 0.7872685194015503, + "num_tokens": 19117579.0, + "step": 503 + }, + { + "epoch": 0.06411398040961709, + "ewc_loss": 
0.004169701598584652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08485084840504e-06, + "grad_norm": 10.145454406738281, + "learning_rate": 2.1322594319626962e-07, + "loss": 0.6212, + "mean_token_accuracy": 0.8074600696563721, + "num_tokens": 19156599.0, + "step": 504 + }, + { + "epoch": 0.0642411906882076, + "ewc_loss": 0.0041289362125098705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0644681626436068e-06, + "grad_norm": 10.332088470458984, + "learning_rate": 2.1364985163204746e-07, + "loss": 0.6132, + "mean_token_accuracy": 0.8111627101898193, + "num_tokens": 19187367.0, + "step": 505 + }, + { + "epoch": 0.06436840096679812, + "ewc_loss": 0.004162203054875135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.081101456496981e-06, + "grad_norm": 10.532933235168457, + "learning_rate": 2.1407376006782536e-07, + "loss": 0.546, + "mean_token_accuracy": 0.8285707235336304, + "num_tokens": 19223916.0, + "step": 506 + }, + { + "epoch": 0.06449561124538863, + "ewc_loss": 0.004192674066871405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0963370843674056e-06, + "grad_norm": 10.1287202835083, + "learning_rate": 2.144976685036032e-07, + "loss": 0.5619, + "mean_token_accuracy": 0.825756847858429, + "num_tokens": 19260336.0, + "step": 507 + }, + { + "epoch": 0.06462282152397913, + "ewc_loss": 0.004139329306781292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0696645606221864e-06, + "grad_norm": 10.215612411499023, + "learning_rate": 2.149215769393811e-07, + "loss": 0.6264, + "mean_token_accuracy": 0.8099923729896545, + "num_tokens": 19299895.0, + "step": 508 + }, + { + "epoch": 0.06475003180256965, + "ewc_loss": 0.004172578454017639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0862892142758938e-06, + "grad_norm": 10.343904495239258, + "learning_rate": 2.1534548537515895e-07, + "loss": 0.5656, + "mean_token_accuracy": 0.8242948651313782, + "num_tokens": 19336614.0, + "step": 509 + }, + { + "epoch": 0.06487724208116016, + "ewc_loss": 0.004195056390017271, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0975282950530527e-06, + "grad_norm": 10.336669921875, + "learning_rate": 2.1576939381093685e-07, + "loss": 0.6166, + "mean_token_accuracy": 0.8121258020401001, + "num_tokens": 19378683.0, + "step": 510 + }, + { + "epoch": 0.06500445235975066, + "ewc_loss": 0.0041891285218298435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0945642518199747e-06, + "grad_norm": 10.237191200256348, + "learning_rate": 2.161933022467147e-07, + "loss": 0.6001, + "mean_token_accuracy": 0.8122482299804688, + "num_tokens": 19422598.0, + "step": 511 + }, + { + "epoch": 0.06513166263834118, + "ewc_loss": 0.004189612343907356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0948061774106463e-06, + "grad_norm": 10.031943321228027, + "learning_rate": 2.166172106824926e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8210081458091736, + "num_tokens": 19466972.0, + "step": 512 + }, + { + "epoch": 0.06525887291693169, + "ewc_loss": 0.004175686743110418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0878433133475482e-06, + "grad_norm": 10.314964294433594, + "learning_rate": 2.1704111911827044e-07, + "loss": 0.5817, + "mean_token_accuracy": 0.8160924911499023, + "num_tokens": 19503809.0, + "step": 513 + }, + { + "epoch": 0.0653860831955222, + "ewc_loss": 0.004234139807522297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1170699255890213e-06, + "grad_norm": 10.340471267700195, + "learning_rate": 2.1746502755404831e-07, + "loss": 0.6363, + "mean_token_accuracy": 0.8096756935119629, + "num_tokens": 19545713.0, + "step": 514 + }, + { + "epoch": 0.06551329347411271, + "ewc_loss": 0.004219220485538244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1096102500450797e-06, + "grad_norm": 10.462902069091797, + "learning_rate": 2.178889359898262e-07, + "loss": 0.6239, + "mean_token_accuracy": 0.8109036684036255, + "num_tokens": 19582272.0, + "step": 515 + }, + { + "epoch": 0.06564050375270322, + "ewc_loss": 0.00424147630110383, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 2.120738145094947e-06, + "grad_norm": 10.108637809753418, + "learning_rate": 2.1831284442560406e-07, + "loss": 0.6388, + "mean_token_accuracy": 0.8054421544075012, + "num_tokens": 19625574.0, + "step": 516 + }, + { + "epoch": 0.06576771403129372, + "ewc_loss": 0.004207715392112732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.103857696056366e-06, + "grad_norm": 10.260446548461914, + "learning_rate": 2.1873675286138193e-07, + "loss": 0.6262, + "mean_token_accuracy": 0.8059495687484741, + "num_tokens": 19667791.0, + "step": 517 + }, + { + "epoch": 0.06589492430988424, + "ewc_loss": 0.004251196514815092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.125598257407546e-06, + "grad_norm": 10.453365325927734, + "learning_rate": 2.191606612971598e-07, + "loss": 0.6561, + "mean_token_accuracy": 0.8002948760986328, + "num_tokens": 19708289.0, + "step": 518 + }, + { + "epoch": 0.06602213458847475, + "ewc_loss": 0.004280942492187023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1404712242656387e-06, + "grad_norm": 10.322527885437012, + "learning_rate": 2.1958456973293768e-07, + "loss": 0.6344, + "mean_token_accuracy": 0.8012425303459167, + "num_tokens": 19747444.0, + "step": 519 + }, + { + "epoch": 0.06614934486706527, + "ewc_loss": 0.004246102645993233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.123051217495231e-06, + "grad_norm": 10.371337890625, + "learning_rate": 2.2000847816871555e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8337909579277039, + "num_tokens": 19787061.0, + "step": 520 + }, + { + "epoch": 0.06627655514565577, + "ewc_loss": 0.004279070999473333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.13953558159119e-06, + "grad_norm": 10.403681755065918, + "learning_rate": 2.2043238660449342e-07, + "loss": 0.564, + "mean_token_accuracy": 0.825935959815979, + "num_tokens": 19826893.0, + "step": 521 + }, + { + "epoch": 0.06640376542424628, + "ewc_loss": 0.004281824454665184, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.1409123291959986e-06, + "grad_norm": 10.399909019470215, + "learning_rate": 2.208562950402713e-07, + "loss": 0.5557, + "mean_token_accuracy": 0.8231741189956665, + "num_tokens": 19864285.0, + "step": 522 + }, + { + "epoch": 0.0665309757028368, + "ewc_loss": 0.004284678027033806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.142339099009405e-06, + "grad_norm": 10.302947044372559, + "learning_rate": 2.2128020347604917e-07, + "loss": 0.592, + "mean_token_accuracy": 0.8170541524887085, + "num_tokens": 19903806.0, + "step": 523 + }, + { + "epoch": 0.0666581859814273, + "ewc_loss": 0.004284006077796221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1420030407171e-06, + "grad_norm": 10.35649299621582, + "learning_rate": 2.2170411191182704e-07, + "loss": 0.5689, + "mean_token_accuracy": 0.8222190737724304, + "num_tokens": 19950382.0, + "step": 524 + }, + { + "epoch": 0.06678539626001781, + "ewc_loss": 0.004302093759179115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1510468286578543e-06, + "grad_norm": 10.712791442871094, + "learning_rate": 2.221280203476049e-07, + "loss": 0.5937, + "mean_token_accuracy": 0.8137952089309692, + "num_tokens": 19984765.0, + "step": 525 + }, + { + "epoch": 0.06691260653860832, + "ewc_loss": 0.004339142236858606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.16957118936989e-06, + "grad_norm": 10.599835395812988, + "learning_rate": 2.2255192878338279e-07, + "loss": 0.6342, + "mean_token_accuracy": 0.8077343702316284, + "num_tokens": 20021081.0, + "step": 526 + }, + { + "epoch": 0.06703981681719882, + "ewc_loss": 0.004305555485188961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1527778244490037e-06, + "grad_norm": 10.624954223632812, + "learning_rate": 2.2297583721916066e-07, + "loss": 0.6087, + "mean_token_accuracy": 0.8119937181472778, + "num_tokens": 20055860.0, + "step": 527 + }, + { + "epoch": 0.06716702709578934, + "ewc_loss": 0.004307713825255632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1538569399126573e-06, + "grad_norm": 10.375632286071777, + "learning_rate": 2.2339974565493853e-07, + "loss": 0.5614, + "mean_token_accuracy": 0.8274613618850708, + "num_tokens": 20095023.0, + "step": 528 + }, + { + "epoch": 0.06729423737437985, + "ewc_loss": 0.004308320581912994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1541602563956985e-06, + "grad_norm": 10.285615921020508, + "learning_rate": 2.238236540907164e-07, + "loss": 0.6158, + "mean_token_accuracy": 0.8107136487960815, + "num_tokens": 20134620.0, + "step": 529 + }, + { + "epoch": 0.06742144765297035, + "ewc_loss": 0.0043092104606330395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154605226678541e-06, + "grad_norm": 10.834420204162598, + "learning_rate": 2.2424756252649428e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.8147160410881042, + "num_tokens": 20169690.0, + "step": 530 + }, + { + "epoch": 0.06754865793156087, + "ewc_loss": 0.00439307140186429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1965356609143782e-06, + "grad_norm": 10.932284355163574, + "learning_rate": 2.2467147096227215e-07, + "loss": 0.5814, + "mean_token_accuracy": 0.8193848133087158, + "num_tokens": 20203609.0, + "step": 531 + }, + { + "epoch": 0.06767586821015138, + "ewc_loss": 0.00438329391181469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1916468995186733e-06, + "grad_norm": 10.602781295776367, + "learning_rate": 2.2509537939805002e-07, + "loss": 0.5881, + "mean_token_accuracy": 0.8164353370666504, + "num_tokens": 20234616.0, + "step": 532 + }, + { + "epoch": 0.0678030784887419, + "ewc_loss": 0.004356055520474911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1780276711069746e-06, + "grad_norm": 10.26027774810791, + "learning_rate": 2.2551928783382787e-07, + "loss": 0.5752, + "mean_token_accuracy": 0.8196139931678772, + "num_tokens": 20271634.0, + "step": 533 + }, + { + "epoch": 0.0679302887673324, + "ewc_loss": 0.004358463454991579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.179231842092122e-06, + "grad_norm": 10.62670612335205, + "learning_rate": 2.2594319626960577e-07, + "loss": 0.6039, + "mean_token_accuracy": 0.8166910409927368, + "num_tokens": 20309683.0, + "step": 534 + }, + { + "epoch": 0.06805749904592291, + "ewc_loss": 0.004427294712513685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.213647348980885e-06, + "grad_norm": 10.467289924621582, + "learning_rate": 2.263671047053836e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8334674835205078, + "num_tokens": 20346013.0, + "step": 535 + }, + { + "epoch": 0.06818470932451343, + "ewc_loss": 0.004396843258291483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19842172555218e-06, + "grad_norm": 10.427384376525879, + "learning_rate": 2.267910131411615e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8240480422973633, + "num_tokens": 20387613.0, + "step": 536 + }, + { + "epoch": 0.06831191960310393, + "ewc_loss": 0.004404827021062374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2024134977982612e-06, + "grad_norm": 10.687508583068848, + "learning_rate": 2.2721492157693936e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8211618065834045, + "num_tokens": 20417579.0, + "step": 537 + }, + { + "epoch": 0.06843912988169444, + "ewc_loss": 0.004460523370653391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2302617708191974e-06, + "grad_norm": 10.554478645324707, + "learning_rate": 2.2763883001271726e-07, + "loss": 0.5481, + "mean_token_accuracy": 0.8278931379318237, + "num_tokens": 20455100.0, + "step": 538 + }, + { + "epoch": 0.06856634016028496, + "ewc_loss": 0.0044260513968765736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.213025709352223e-06, + "grad_norm": 10.648459434509277, + "learning_rate": 2.280627384484951e-07, + "loss": 0.6101, + "mean_token_accuracy": 0.8082170486450195, + "num_tokens": 20496458.0, + "step": 539 + }, + { + "epoch": 0.06869355043887546, + "ewc_loss": 0.004452937748283148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.226468950539129e-06, + "grad_norm": 10.569279670715332, + "learning_rate": 2.28486646884273e-07, + "loss": 0.5828, + "mean_token_accuracy": 0.8185060024261475, + "num_tokens": 20533758.0, + "step": 540 + }, + { + "epoch": 0.06882076071746597, + "ewc_loss": 0.004459114745259285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2295573671726743e-06, + "grad_norm": 10.628546714782715, + "learning_rate": 2.2891055532005085e-07, + "loss": 0.6283, + "mean_token_accuracy": 0.8097989559173584, + "num_tokens": 20567259.0, + "step": 541 + }, + { + "epoch": 0.06894797099605648, + "ewc_loss": 0.004477782174944878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.238891056549619e-06, + "grad_norm": 10.616950035095215, + "learning_rate": 2.2933446375582875e-07, + "loss": 0.6097, + "mean_token_accuracy": 0.815200686454773, + "num_tokens": 20599333.0, + "step": 542 + }, + { + "epoch": 0.06907518127464699, + "ewc_loss": 0.004483161028474569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2415804323827615e-06, + "grad_norm": 10.421112060546875, + "learning_rate": 2.297583721916066e-07, + "loss": 0.5447, + "mean_token_accuracy": 0.8314547538757324, + "num_tokens": 20640195.0, + "step": 543 + }, + { + "epoch": 0.0692023915532375, + "ewc_loss": 0.004467046819627285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2335234461934306e-06, + "grad_norm": 10.594366073608398, + "learning_rate": 2.301822806273845e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.8238531947135925, + "num_tokens": 20682617.0, + "step": 544 + }, + { + "epoch": 0.06932960183182801, + "ewc_loss": 0.004515219479799271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2576098217541585e-06, + "grad_norm": 10.892857551574707, + "learning_rate": 2.3060618906316234e-07, + "loss": 0.6193, + "mean_token_accuracy": 0.8106193542480469, + "num_tokens": 20718032.0, + "step": 545 + }, + { + "epoch": 0.06945681211041853, + "ewc_loss": 0.004533965606242418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.266982846776955e-06, + "grad_norm": 10.68157958984375, + "learning_rate": 2.3103009749894024e-07, + "loss": 0.5822, + "mean_token_accuracy": 0.819700300693512, + "num_tokens": 20755190.0, + "step": 546 + }, + { + "epoch": 0.06958402238900903, + "ewc_loss": 0.00450154347345233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2507717858388787e-06, + "grad_norm": 10.256088256835938, + "learning_rate": 2.3145400593471808e-07, + "loss": 0.609, + "mean_token_accuracy": 0.8119945526123047, + "num_tokens": 20799088.0, + "step": 547 + }, + { + "epoch": 0.06971123266759954, + "ewc_loss": 0.004473468754440546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23673441723804e-06, + "grad_norm": 10.769835472106934, + "learning_rate": 2.3187791437049598e-07, + "loss": 0.6411, + "mean_token_accuracy": 0.800082266330719, + "num_tokens": 20831819.0, + "step": 548 + }, + { + "epoch": 0.06983844294619006, + "ewc_loss": 0.004584332462400198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2921663003216963e-06, + "grad_norm": 10.78705883026123, + "learning_rate": 2.3230182280627383e-07, + "loss": 0.5708, + "mean_token_accuracy": 0.8238857984542847, + "num_tokens": 20861291.0, + "step": 549 + }, + { + "epoch": 0.06996565322478056, + "ewc_loss": 0.004537312779575586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.268656317028217e-06, + "grad_norm": 10.6048583984375, + "learning_rate": 2.327257312420517e-07, + "loss": 0.5456, + "mean_token_accuracy": 0.8326621651649475, + "num_tokens": 20897446.0, + "step": 550 + }, + { + "epoch": 0.07009286350337107, + "ewc_loss": 0.004531018901616335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2655094653600827e-06, + "grad_norm": 10.698387145996094, + "learning_rate": 2.3314963967782957e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8195825219154358, + "num_tokens": 20934423.0, + "step": 551 + }, + { + "epoch": 0.07022007378196159, + "ewc_loss": 0.004569170065224171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2845849798613926e-06, + 
"grad_norm": 10.455025672912598, + "learning_rate": 2.3357354811360745e-07, + "loss": 0.5852, + "mean_token_accuracy": 0.8211269378662109, + "num_tokens": 20978394.0, + "step": 552 + }, + { + "epoch": 0.07034728406055209, + "ewc_loss": 0.004532996565103531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2664983134745853e-06, + "grad_norm": 10.769999504089355, + "learning_rate": 2.3399745654938532e-07, + "loss": 0.6131, + "mean_token_accuracy": 0.8115862607955933, + "num_tokens": 21010669.0, + "step": 553 + }, + { + "epoch": 0.0704744943391426, + "ewc_loss": 0.004596343729645014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2981719212111784e-06, + "grad_norm": 10.515746116638184, + "learning_rate": 2.344213649851632e-07, + "loss": 0.5811, + "mean_token_accuracy": 0.8217263221740723, + "num_tokens": 21047696.0, + "step": 554 + }, + { + "epoch": 0.07060170461773312, + "ewc_loss": 0.004559983499348164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279991804243764e-06, + "grad_norm": 10.468986511230469, + "learning_rate": 2.3484527342094106e-07, + "loss": 0.6632, + "mean_token_accuracy": 0.798829197883606, + "num_tokens": 21091437.0, + "step": 555 + }, + { + "epoch": 0.07072891489632362, + "ewc_loss": 0.004579126834869385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.289563326485222e-06, + "grad_norm": 10.513490676879883, + "learning_rate": 2.3526918185671894e-07, + "loss": 0.6233, + "mean_token_accuracy": 0.8080824613571167, + "num_tokens": 21128880.0, + "step": 556 + }, + { + "epoch": 0.07085612517491413, + "ewc_loss": 0.004610252100974321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3051261450746097e-06, + "grad_norm": 10.603527069091797, + "learning_rate": 2.356930902924968e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8229290246963501, + "num_tokens": 21168523.0, + "step": 557 + }, + { + "epoch": 0.07098333545350465, + "ewc_loss": 0.0046094306744635105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3047152808430837e-06, + "grad_norm": 
10.658354759216309, + "learning_rate": 2.3611699872827468e-07, + "loss": 0.6638, + "mean_token_accuracy": 0.7957459092140198, + "num_tokens": 21211376.0, + "step": 558 + }, + { + "epoch": 0.07111054573209516, + "ewc_loss": 0.004624868277460337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3124341623770306e-06, + "grad_norm": 10.567538261413574, + "learning_rate": 2.3654090716405255e-07, + "loss": 0.5933, + "mean_token_accuracy": 0.8165873289108276, + "num_tokens": 21249162.0, + "step": 559 + }, + { + "epoch": 0.07123775601068566, + "ewc_loss": 0.004614541772753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3072709609550657e-06, + "grad_norm": 10.646791458129883, + "learning_rate": 2.3696481559983043e-07, + "loss": 0.6224, + "mean_token_accuracy": 0.8059802651405334, + "num_tokens": 21282351.0, + "step": 560 + }, + { + "epoch": 0.07136496628927617, + "ewc_loss": 0.00464317761361599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3215889086714014e-06, + "grad_norm": 10.768575668334961, + "learning_rate": 2.373887240356083e-07, + "loss": 0.6115, + "mean_token_accuracy": 0.8108310699462891, + "num_tokens": 21319508.0, + "step": 561 + }, + { + "epoch": 0.07149217656786669, + "ewc_loss": 0.004657892044633627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328946038687718e-06, + "grad_norm": 10.695755004882812, + "learning_rate": 2.3781263247138617e-07, + "loss": 0.576, + "mean_token_accuracy": 0.8223993182182312, + "num_tokens": 21357111.0, + "step": 562 + }, + { + "epoch": 0.07161938684645719, + "ewc_loss": 0.004643472842872143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.321736474186764e-06, + "grad_norm": 10.5610990524292, + "learning_rate": 2.3823654090716404e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.8186577558517456, + "num_tokens": 21393222.0, + "step": 563 + }, + { + "epoch": 0.0717465971250477, + "ewc_loss": 0.0046551767736673355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3275883904716466e-06, + "grad_norm": 10.59795093536377, + 
"learning_rate": 2.386604493429419e-07, + "loss": 0.5329, + "mean_token_accuracy": 0.8317437171936035, + "num_tokens": 21429595.0, + "step": 564 + }, + { + "epoch": 0.07187380740363822, + "ewc_loss": 0.004672688897699118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3363445507129654e-06, + "grad_norm": 10.738378524780273, + "learning_rate": 2.390843577787198e-07, + "loss": 0.6295, + "mean_token_accuracy": 0.8022019863128662, + "num_tokens": 21467341.0, + "step": 565 + }, + { + "epoch": 0.07200101768222872, + "ewc_loss": 0.004685654770582914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.342827428947203e-06, + "grad_norm": 10.564425468444824, + "learning_rate": 2.3950826621449766e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8441417217254639, + "num_tokens": 21510927.0, + "step": 566 + }, + { + "epoch": 0.07212822796081923, + "ewc_loss": 0.004665993619710207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3329967007157393e-06, + "grad_norm": 10.657283782958984, + "learning_rate": 2.3993217465027556e-07, + "loss": 0.5809, + "mean_token_accuracy": 0.8214021921157837, + "num_tokens": 21551604.0, + "step": 567 + }, + { + "epoch": 0.07225543823940975, + "ewc_loss": 0.004709032829850912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354516482228064e-06, + "grad_norm": 10.7982816696167, + "learning_rate": 2.403560830860534e-07, + "loss": 0.6323, + "mean_token_accuracy": 0.8094579577445984, + "num_tokens": 21591871.0, + "step": 568 + }, + { + "epoch": 0.07238264851800025, + "ewc_loss": 0.004740268923342228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370134552620584e-06, + "grad_norm": 10.913885116577148, + "learning_rate": 2.4077999152183125e-07, + "loss": 0.6291, + "mean_token_accuracy": 0.8049288988113403, + "num_tokens": 21629810.0, + "step": 569 + }, + { + "epoch": 0.07250985879659076, + "ewc_loss": 0.00473351264372468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3667562345508486e-06, + "grad_norm": 10.720842361450195, + "learning_rate": 
2.4120389995760915e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8392337560653687, + "num_tokens": 21668706.0, + "step": 570 + }, + { + "epoch": 0.07263706907518128, + "ewc_loss": 0.004722755867987871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3613779376319144e-06, + "grad_norm": 10.744722366333008, + "learning_rate": 2.41627808393387e-07, + "loss": 0.5736, + "mean_token_accuracy": 0.8203421831130981, + "num_tokens": 21712725.0, + "step": 571 + }, + { + "epoch": 0.07276427935377179, + "ewc_loss": 0.004746496677398682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.373248435105779e-06, + "grad_norm": 10.714940071105957, + "learning_rate": 2.420517168291649e-07, + "loss": 0.5819, + "mean_token_accuracy": 0.8212541341781616, + "num_tokens": 21746086.0, + "step": 572 + }, + { + "epoch": 0.07289148963236229, + "ewc_loss": 0.00476341275498271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3817062810849166e-06, + "grad_norm": 10.650681495666504, + "learning_rate": 2.4247562526494274e-07, + "loss": 0.5424, + "mean_token_accuracy": 0.8286718130111694, + "num_tokens": 21779573.0, + "step": 573 + }, + { + "epoch": 0.0730186999109528, + "ewc_loss": 0.0047643049620091915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382152388236136e-06, + "grad_norm": 10.751470565795898, + "learning_rate": 2.4289953370072064e-07, + "loss": 0.6151, + "mean_token_accuracy": 0.8094434142112732, + "num_tokens": 21818766.0, + "step": 574 + }, + { + "epoch": 0.07314591018954332, + "ewc_loss": 0.004800966940820217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.400483481324045e-06, + "grad_norm": 10.778848648071289, + "learning_rate": 2.433234421364985e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8270567655563354, + "num_tokens": 21855539.0, + "step": 575 + }, + { + "epoch": 0.07327312046813382, + "ewc_loss": 0.0048020221292972565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4010109882510733e-06, + "grad_norm": 10.802706718444824, + "learning_rate": 
2.437473505722764e-07, + "loss": 0.5339, + "mean_token_accuracy": 0.8331822156906128, + "num_tokens": 21892567.0, + "step": 576 + }, + { + "epoch": 0.07340033074672433, + "ewc_loss": 0.004798820707947016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399410277575953e-06, + "grad_norm": 10.799840927124023, + "learning_rate": 2.4417125900805423e-07, + "loss": 0.5543, + "mean_token_accuracy": 0.8212069272994995, + "num_tokens": 21928607.0, + "step": 577 + }, + { + "epoch": 0.07352754102531485, + "ewc_loss": 0.00480248685926199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4012433641473763e-06, + "grad_norm": 10.820357322692871, + "learning_rate": 2.4459516744383213e-07, + "loss": 0.6021, + "mean_token_accuracy": 0.8145451545715332, + "num_tokens": 21963781.0, + "step": 578 + }, + { + "epoch": 0.07365475130390535, + "ewc_loss": 0.004820043221116066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.410021579635213e-06, + "grad_norm": 10.826082229614258, + "learning_rate": 2.4501907587961e-07, + "loss": 0.5619, + "mean_token_accuracy": 0.8257240056991577, + "num_tokens": 21999140.0, + "step": 579 + }, + { + "epoch": 0.07378196158249586, + "ewc_loss": 0.0048214164562523365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4107082481350517e-06, + "grad_norm": 10.70689868927002, + "learning_rate": 2.454429843153879e-07, + "loss": 0.6627, + "mean_token_accuracy": 0.7958672046661377, + "num_tokens": 22038703.0, + "step": 580 + }, + { + "epoch": 0.07390917186108638, + "ewc_loss": 0.004821757785975933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.410878778391634e-06, + "grad_norm": 10.925837516784668, + "learning_rate": 2.458668927511657e-07, + "loss": 0.5751, + "mean_token_accuracy": 0.8139115571975708, + "num_tokens": 22071052.0, + "step": 581 + }, + { + "epoch": 0.07403638213967688, + "ewc_loss": 0.004852560814470053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4262803890451323e-06, + "grad_norm": 10.71292495727539, + "learning_rate": 2.462908011869436e-07, + 
"loss": 0.5399, + "mean_token_accuracy": 0.8324525952339172, + "num_tokens": 22109629.0, + "step": 582 + }, + { + "epoch": 0.0741635924182674, + "ewc_loss": 0.004823107272386551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4115536234603496e-06, + "grad_norm": 10.742959022521973, + "learning_rate": 2.4671470962272147e-07, + "loss": 0.553, + "mean_token_accuracy": 0.8293811082839966, + "num_tokens": 22151544.0, + "step": 583 + }, + { + "epoch": 0.07429080269685791, + "ewc_loss": 0.00485589075833559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427945446470403e-06, + "grad_norm": 11.023469924926758, + "learning_rate": 2.4713861805849937e-07, + "loss": 0.5311, + "mean_token_accuracy": 0.8317494988441467, + "num_tokens": 22189241.0, + "step": 584 + }, + { + "epoch": 0.07441801297544842, + "ewc_loss": 0.004884210415184498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4421051421086304e-06, + "grad_norm": 10.76088809967041, + "learning_rate": 2.475625264942772e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8283203840255737, + "num_tokens": 22229138.0, + "step": 585 + }, + { + "epoch": 0.07454522325403892, + "ewc_loss": 0.004828667733818293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4143339487636695e-06, + "grad_norm": 10.698046684265137, + "learning_rate": 2.479864349300551e-07, + "loss": 0.5853, + "mean_token_accuracy": 0.8194568157196045, + "num_tokens": 22264930.0, + "step": 586 + }, + { + "epoch": 0.07467243353262944, + "ewc_loss": 0.004858405329287052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4292025955219287e-06, + "grad_norm": 10.837104797363281, + "learning_rate": 2.4841034336583296e-07, + "loss": 0.5253, + "mean_token_accuracy": 0.8298612833023071, + "num_tokens": 22297162.0, + "step": 587 + }, + { + "epoch": 0.07479964381121995, + "ewc_loss": 0.004884045105427504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4420226054644445e-06, + "grad_norm": 10.929935455322266, + "learning_rate": 2.488342518016108e-07, + "loss": 0.6118, + 
"mean_token_accuracy": 0.8093026876449585, + "num_tokens": 22334779.0, + "step": 588 + }, + { + "epoch": 0.07492685408981045, + "ewc_loss": 0.004884547553956509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4422738533758093e-06, + "grad_norm": 10.86451530456543, + "learning_rate": 2.492581602373887e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.825463056564331, + "num_tokens": 22372820.0, + "step": 589 + }, + { + "epoch": 0.07505406436840097, + "ewc_loss": 0.004875858314335346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.43792919718544e-06, + "grad_norm": 10.739870071411133, + "learning_rate": 2.4968206867316655e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.8174005150794983, + "num_tokens": 22417841.0, + "step": 590 + }, + { + "epoch": 0.07518127464699148, + "ewc_loss": 0.004889765754342079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4448829663015204e-06, + "grad_norm": 10.734461784362793, + "learning_rate": 2.5010597710894445e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8432545065879822, + "num_tokens": 22449133.0, + "step": 591 + }, + { + "epoch": 0.07530848492558198, + "ewc_loss": 0.004900130443274975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4500652671122225e-06, + "grad_norm": 10.803242683410645, + "learning_rate": 2.505298855447223e-07, + "loss": 0.5666, + "mean_token_accuracy": 0.8234936594963074, + "num_tokens": 22491145.0, + "step": 592 + }, + { + "epoch": 0.0754356952041725, + "ewc_loss": 0.004908760078251362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4543801373511087e-06, + "grad_norm": 10.888254165649414, + "learning_rate": 2.509537939805002e-07, + "loss": 0.5801, + "mean_token_accuracy": 0.8230830430984497, + "num_tokens": 22531738.0, + "step": 593 + }, + { + "epoch": 0.07556290548276301, + "ewc_loss": 0.004923049360513687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461524672980886e-06, + "grad_norm": 11.449809074401855, + "learning_rate": 2.513777024162781e-07, + "loss": 0.5599, + "mean_token_accuracy": 
0.8277294039726257, + "num_tokens": 22563005.0, + "step": 594 + }, + { + "epoch": 0.07569011576135352, + "ewc_loss": 0.004979276563972235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4896382910810644e-06, + "grad_norm": 10.789501190185547, + "learning_rate": 2.5180161085205594e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8326207399368286, + "num_tokens": 22601333.0, + "step": 595 + }, + { + "epoch": 0.07581732603994402, + "ewc_loss": 0.004865641705691814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432820792819257e-06, + "grad_norm": 10.612643241882324, + "learning_rate": 2.522255192878338e-07, + "loss": 0.5953, + "mean_token_accuracy": 0.8164010047912598, + "num_tokens": 22639528.0, + "step": 596 + }, + { + "epoch": 0.07594453631853454, + "ewc_loss": 0.004921081010252237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4605405997135676e-06, + "grad_norm": 10.899389266967773, + "learning_rate": 2.526494277236117e-07, + "loss": 0.578, + "mean_token_accuracy": 0.8195886015892029, + "num_tokens": 22675290.0, + "step": 597 + }, + { + "epoch": 0.07607174659712505, + "ewc_loss": 0.0049713123589754105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4856560685293516e-06, + "grad_norm": 11.011672973632812, + "learning_rate": 2.530733361593896e-07, + "loss": 0.5523, + "mean_token_accuracy": 0.8255341649055481, + "num_tokens": 22712823.0, + "step": 598 + }, + { + "epoch": 0.07619895687571555, + "ewc_loss": 0.004962768405675888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481384171915124e-06, + "grad_norm": 10.942748069763184, + "learning_rate": 2.5349724459516743e-07, + "loss": 0.5722, + "mean_token_accuracy": 0.8227962851524353, + "num_tokens": 22750916.0, + "step": 599 + }, + { + "epoch": 0.07632616715430607, + "ewc_loss": 0.004972119815647602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486059884176939e-06, + "grad_norm": 10.892522811889648, + "learning_rate": 2.539211530309453e-07, + "loss": 0.5248, + "mean_token_accuracy": 0.8332213759422302, 
+ "num_tokens": 22782537.0, + "step": 600 + }, + { + "epoch": 0.07645337743289658, + "ewc_loss": 0.004962083417922258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4810417471599067e-06, + "grad_norm": 11.1585693359375, + "learning_rate": 2.543450614667232e-07, + "loss": 0.6181, + "mean_token_accuracy": 0.8105642199516296, + "num_tokens": 22814805.0, + "step": 601 + }, + { + "epoch": 0.07658058771148708, + "ewc_loss": 0.005007460247725248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5037302293640096e-06, + "grad_norm": 10.83034896850586, + "learning_rate": 2.547689699025011e-07, + "loss": 0.6151, + "mean_token_accuracy": 0.8091245293617249, + "num_tokens": 22852654.0, + "step": 602 + }, + { + "epoch": 0.0767077979900776, + "ewc_loss": 0.004963436163961887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4817181838443503e-06, + "grad_norm": 10.851088523864746, + "learning_rate": 2.551928783382789e-07, + "loss": 0.571, + "mean_token_accuracy": 0.8219847679138184, + "num_tokens": 22894205.0, + "step": 603 + }, + { + "epoch": 0.07683500826866811, + "ewc_loss": 0.005009571090340614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5047854705917416e-06, + "grad_norm": 10.90843391418457, + "learning_rate": 2.5561678677405677e-07, + "loss": 0.5372, + "mean_token_accuracy": 0.8294892311096191, + "num_tokens": 22924095.0, + "step": 604 + }, + { + "epoch": 0.07696221854725861, + "ewc_loss": 0.0050235106609761715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5117553832387784e-06, + "grad_norm": 10.911865234375, + "learning_rate": 2.5604069520983467e-07, + "loss": 0.6195, + "mean_token_accuracy": 0.8055154085159302, + "num_tokens": 22960857.0, + "step": 605 + }, + { + "epoch": 0.07708942882584913, + "ewc_loss": 0.005022336263209581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5111680770351086e-06, + "grad_norm": 10.764416694641113, + "learning_rate": 2.564646036456125e-07, + "loss": 0.5694, + "mean_token_accuracy": 0.82106614112854, + "num_tokens": 23004670.0, 
+ "step": 606 + }, + { + "epoch": 0.07721663910443964, + "ewc_loss": 0.005012056790292263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5060282951017143e-06, + "grad_norm": 10.981328964233398, + "learning_rate": 2.568885120813904e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8190969228744507, + "num_tokens": 23041999.0, + "step": 607 + }, + { + "epoch": 0.07734384938303016, + "ewc_loss": 0.005075050052255392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5375250061188126e-06, + "grad_norm": 10.881105422973633, + "learning_rate": 2.5731242051716826e-07, + "loss": 0.5599, + "mean_token_accuracy": 0.8271880149841309, + "num_tokens": 23080704.0, + "step": 608 + }, + { + "epoch": 0.07747105966162066, + "ewc_loss": 0.00505314813926816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5265740077884402e-06, + "grad_norm": 11.001496315002441, + "learning_rate": 2.5773632895294616e-07, + "loss": 0.5947, + "mean_token_accuracy": 0.8162992000579834, + "num_tokens": 23118662.0, + "step": 609 + }, + { + "epoch": 0.07759826994021117, + "ewc_loss": 0.005061489064246416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53074449574342e-06, + "grad_norm": 10.917318344116211, + "learning_rate": 2.58160237388724e-07, + "loss": 0.5177, + "mean_token_accuracy": 0.8329532742500305, + "num_tokens": 23150367.0, + "step": 610 + }, + { + "epoch": 0.07772548021880168, + "ewc_loss": 0.005078280810266733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5391404960828368e-06, + "grad_norm": 11.042336463928223, + "learning_rate": 2.585841458245019e-07, + "loss": 0.576, + "mean_token_accuracy": 0.819678544998169, + "num_tokens": 23185836.0, + "step": 611 + }, + { + "epoch": 0.07785269049739219, + "ewc_loss": 0.005105641670525074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5528208880132297e-06, + "grad_norm": 11.022784233093262, + "learning_rate": 2.5900805426027975e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8377578258514404, + "num_tokens": 23219346.0, + "step": 612 + }, + { 
+ "epoch": 0.0779799007759827, + "ewc_loss": 0.005104757379740477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5523786462144926e-06, + "grad_norm": 11.07007884979248, + "learning_rate": 2.5943196269605765e-07, + "loss": 0.5884, + "mean_token_accuracy": 0.8220160603523254, + "num_tokens": 23257569.0, + "step": 613 + }, + { + "epoch": 0.07810711105457321, + "ewc_loss": 0.005123487673699856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.561743940532324e-06, + "grad_norm": 10.810112953186035, + "learning_rate": 2.598558711318355e-07, + "loss": 0.577, + "mean_token_accuracy": 0.8226938247680664, + "num_tokens": 23302944.0, + "step": 614 + }, + { + "epoch": 0.07823432133316371, + "ewc_loss": 0.005103282164782286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5516410460113548e-06, + "grad_norm": 11.205708503723145, + "learning_rate": 2.602797795676134e-07, + "loss": 0.5828, + "mean_token_accuracy": 0.8198152780532837, + "num_tokens": 23341660.0, + "step": 615 + }, + { + "epoch": 0.07836153161175423, + "ewc_loss": 0.005182583350688219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591291604403523e-06, + "grad_norm": 11.061023712158203, + "learning_rate": 2.6070368800339124e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.8132292032241821, + "num_tokens": 23375956.0, + "step": 616 + }, + { + "epoch": 0.07848874189034474, + "ewc_loss": 0.005130337085574865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56516864283185e-06, + "grad_norm": 11.022500038146973, + "learning_rate": 2.6112759643916914e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.8313932418823242, + "num_tokens": 23407568.0, + "step": 617 + }, + { + "epoch": 0.07861595216893524, + "ewc_loss": 0.005147271323949099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.573635583758005e-06, + "grad_norm": 11.08305835723877, + "learning_rate": 2.61551504874947e-07, + "loss": 0.5683, + "mean_token_accuracy": 0.8234911561012268, + "num_tokens": 23440261.0, + "step": 618 + }, + { + "epoch": 
0.07874316244752576, + "ewc_loss": 0.0051708524115383625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585426273071789e-06, + "grad_norm": 10.987273216247559, + "learning_rate": 2.619754133107249e-07, + "loss": 0.5484, + "mean_token_accuracy": 0.8296104669570923, + "num_tokens": 23481706.0, + "step": 619 + }, + { + "epoch": 0.07887037272611627, + "ewc_loss": 0.005157013889402151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5785068373807007e-06, + "grad_norm": 11.096132278442383, + "learning_rate": 2.623993217465028e-07, + "loss": 0.6092, + "mean_token_accuracy": 0.8074328899383545, + "num_tokens": 23515410.0, + "step": 620 + }, + { + "epoch": 0.07899758300470679, + "ewc_loss": 0.005189746618270874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59487342191278e-06, + "grad_norm": 10.988784790039062, + "learning_rate": 2.6282323018228063e-07, + "loss": 0.5949, + "mean_token_accuracy": 0.8190141916275024, + "num_tokens": 23554627.0, + "step": 621 + }, + { + "epoch": 0.07912479328329729, + "ewc_loss": 0.00517628900706768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.588144525361713e-06, + "grad_norm": 11.047778129577637, + "learning_rate": 2.632471386180585e-07, + "loss": 0.6163, + "mean_token_accuracy": 0.8081402778625488, + "num_tokens": 23593400.0, + "step": 622 + }, + { + "epoch": 0.0792520035618878, + "ewc_loss": 0.005187703296542168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593851604615338e-06, + "grad_norm": 11.107598304748535, + "learning_rate": 2.6367104705383637e-07, + "loss": 0.6142, + "mean_token_accuracy": 0.8078535795211792, + "num_tokens": 23628242.0, + "step": 623 + }, + { + "epoch": 0.07937921384047832, + "ewc_loss": 0.005213325377553701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6066627469845116e-06, + "grad_norm": 11.175069808959961, + "learning_rate": 2.6409495548961427e-07, + "loss": 0.6, + "mean_token_accuracy": 0.8188228607177734, + "num_tokens": 23671041.0, + "step": 624 + }, + { + "epoch": 0.07950642411906882, + 
"ewc_loss": 0.0052151987329125404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6075992991536623e-06, + "grad_norm": 11.255139350891113, + "learning_rate": 2.6451886392539206e-07, + "loss": 0.5791, + "mean_token_accuracy": 0.8232980966567993, + "num_tokens": 23701904.0, + "step": 625 + }, + { + "epoch": 0.07963363439765933, + "ewc_loss": 0.0052428157068789005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6214079298370052e-06, + "grad_norm": 10.907281875610352, + "learning_rate": 2.6494277236116996e-07, + "loss": 0.5408, + "mean_token_accuracy": 0.8285288214683533, + "num_tokens": 23742374.0, + "step": 626 + }, + { + "epoch": 0.07976084467624985, + "ewc_loss": 0.00522194616496563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.610973069749889e-06, + "grad_norm": 11.324654579162598, + "learning_rate": 2.6536668079694786e-07, + "loss": 0.648, + "mean_token_accuracy": 0.8000391125679016, + "num_tokens": 23780216.0, + "step": 627 + }, + { + "epoch": 0.07988805495484035, + "ewc_loss": 0.005292476154863834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.646238044690108e-06, + "grad_norm": 11.232362747192383, + "learning_rate": 2.6579058923272576e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8444346785545349, + "num_tokens": 23813641.0, + "step": 628 + }, + { + "epoch": 0.08001526523343086, + "ewc_loss": 0.005270289722830057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6351449378125835e-06, + "grad_norm": 11.087641716003418, + "learning_rate": 2.6621449766850356e-07, + "loss": 0.5864, + "mean_token_accuracy": 0.8183732032775879, + "num_tokens": 23850189.0, + "step": 629 + }, + { + "epoch": 0.08014247551202137, + "ewc_loss": 0.005266229156404734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6331144908908755e-06, + "grad_norm": 11.03096866607666, + "learning_rate": 2.6663840610428145e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8315025568008423, + "num_tokens": 23884410.0, + "step": 630 + }, + { + "epoch": 0.08026968579061187, + "ewc_loss": 
0.005275013390928507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6375066681794124e-06, + "grad_norm": 11.139605522155762, + "learning_rate": 2.6706231454005935e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8279359936714172, + "num_tokens": 23930139.0, + "step": 631 + }, + { + "epoch": 0.08039689606920239, + "ewc_loss": 0.005308527499437332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6542636533122277e-06, + "grad_norm": 10.953032493591309, + "learning_rate": 2.6748622297583725e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8296985626220703, + "num_tokens": 23973124.0, + "step": 632 + }, + { + "epoch": 0.0805241063477929, + "ewc_loss": 0.0052762506529688835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6381253519502934e-06, + "grad_norm": 11.086478233337402, + "learning_rate": 2.6791013141161505e-07, + "loss": 0.557, + "mean_token_accuracy": 0.8275315761566162, + "num_tokens": 24008676.0, + "step": 633 + }, + { + "epoch": 0.08065131662638342, + "ewc_loss": 0.005319769959896803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6598850126902107e-06, + "grad_norm": 11.354926109313965, + "learning_rate": 2.6833403984739294e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.8253399133682251, + "num_tokens": 24044880.0, + "step": 634 + }, + { + "epoch": 0.08077852690497392, + "ewc_loss": 0.00534453708678484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6722684651758755e-06, + "grad_norm": 11.049055099487305, + "learning_rate": 2.6875794828317084e-07, + "loss": 0.5449, + "mean_token_accuracy": 0.8336973786354065, + "num_tokens": 24090140.0, + "step": 635 + }, + { + "epoch": 0.08090573718356443, + "ewc_loss": 0.005282439291477203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6412196802994004e-06, + "grad_norm": 11.202109336853027, + "learning_rate": 2.6918185671894874e-07, + "loss": 0.6317, + "mean_token_accuracy": 0.808383584022522, + "num_tokens": 24123311.0, + "step": 636 + }, + { + "epoch": 0.08103294746215495, + "ewc_loss": 
0.005348897073417902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.674448523976025e-06, + "grad_norm": 11.160530090332031, + "learning_rate": 2.6960576515472654e-07, + "loss": 0.6643, + "mean_token_accuracy": 0.7952765822410583, + "num_tokens": 24164019.0, + "step": 637 + }, + { + "epoch": 0.08116015774074545, + "ewc_loss": 0.005324211902916431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.66210599875194e-06, + "grad_norm": 11.17244815826416, + "learning_rate": 2.7002967359050443e-07, + "loss": 0.5317, + "mean_token_accuracy": 0.8346177339553833, + "num_tokens": 24205986.0, + "step": 638 + }, + { + "epoch": 0.08128736801933596, + "ewc_loss": 0.005342260468751192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6711302325566066e-06, + "grad_norm": 11.171653747558594, + "learning_rate": 2.7045358202628233e-07, + "loss": 0.589, + "mean_token_accuracy": 0.8142539262771606, + "num_tokens": 24251956.0, + "step": 639 + }, + { + "epoch": 0.08141457829792648, + "ewc_loss": 0.005352229345589876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.676114718269673e-06, + "grad_norm": 11.199296951293945, + "learning_rate": 2.7087749046206023e-07, + "loss": 0.5341, + "mean_token_accuracy": 0.8302950263023376, + "num_tokens": 24284990.0, + "step": 640 + }, + { + "epoch": 0.08154178857651698, + "ewc_loss": 0.005346131045371294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6730656372819794e-06, + "grad_norm": 11.169703483581543, + "learning_rate": 2.71301398897838e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8161996603012085, + "num_tokens": 24323305.0, + "step": 641 + }, + { + "epoch": 0.08166899885510749, + "ewc_loss": 0.005364412907510996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6822065137821482e-06, + "grad_norm": 10.96671199798584, + "learning_rate": 2.717253073336159e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8419296741485596, + "num_tokens": 24365847.0, + "step": 642 + }, + { + "epoch": 0.081796209133698, + "ewc_loss": 0.005357738584280014, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.678869350347668e-06, + "grad_norm": 11.281115531921387, + "learning_rate": 2.721492157693938e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8395399451255798, + "num_tokens": 24407269.0, + "step": 643 + }, + { + "epoch": 0.0819234194122885, + "ewc_loss": 0.0054127867333590984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.706393388507422e-06, + "grad_norm": 11.253256797790527, + "learning_rate": 2.7257312420517167e-07, + "loss": 0.5662, + "mean_token_accuracy": 0.8217809200286865, + "num_tokens": 24444990.0, + "step": 644 + }, + { + "epoch": 0.08205062969087902, + "ewc_loss": 0.005379142239689827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6895711471297545e-06, + "grad_norm": 11.155735969543457, + "learning_rate": 2.729970326409495e-07, + "loss": 0.5967, + "mean_token_accuracy": 0.8177270889282227, + "num_tokens": 24487293.0, + "step": 645 + }, + { + "epoch": 0.08217783996946953, + "ewc_loss": 0.005376901477575302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6884506496571703e-06, + "grad_norm": 11.268689155578613, + "learning_rate": 2.734209410767274e-07, + "loss": 0.5581, + "mean_token_accuracy": 0.8270895481109619, + "num_tokens": 24521403.0, + "step": 646 + }, + { + "epoch": 0.08230505024806005, + "ewc_loss": 0.005403962451964617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.701981202335446e-06, + "grad_norm": 11.119348526000977, + "learning_rate": 2.738448495125053e-07, + "loss": 0.5657, + "mean_token_accuracy": 0.8268368244171143, + "num_tokens": 24562740.0, + "step": 647 + }, + { + "epoch": 0.08243226052665055, + "ewc_loss": 0.005393650848418474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.696825504244771e-06, + "grad_norm": 11.185890197753906, + "learning_rate": 2.7426875794828316e-07, + "loss": 0.5735, + "mean_token_accuracy": 0.8166208267211914, + "num_tokens": 24597507.0, + "step": 648 + }, + { + "epoch": 0.08255947080524106, + "ewc_loss": 0.005427421070635319, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 2.7137105007568607e-06, + "grad_norm": 11.215984344482422, + "learning_rate": 2.74692666384061e-07, + "loss": 0.535, + "mean_token_accuracy": 0.8310533165931702, + "num_tokens": 24635629.0, + "step": 649 + }, + { + "epoch": 0.08268668108383158, + "ewc_loss": 0.005429364740848541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.714682295845705e-06, + "grad_norm": 11.335116386413574, + "learning_rate": 2.751165748198389e-07, + "loss": 0.5602, + "mean_token_accuracy": 0.8231247663497925, + "num_tokens": 24676087.0, + "step": 650 + }, + { + "epoch": 0.08281389136242208, + "ewc_loss": 0.00544277299195528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7213864086661488e-06, + "grad_norm": 11.24670124053955, + "learning_rate": 2.755404832556168e-07, + "loss": 0.5806, + "mean_token_accuracy": 0.8195056915283203, + "num_tokens": 24713652.0, + "step": 651 + }, + { + "epoch": 0.0829411016410126, + "ewc_loss": 0.005430885124951601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7154426334163873e-06, + "grad_norm": 11.180442810058594, + "learning_rate": 2.7596439169139465e-07, + "loss": 0.5628, + "mean_token_accuracy": 0.8266787528991699, + "num_tokens": 24750130.0, + "step": 652 + }, + { + "epoch": 0.08306831191960311, + "ewc_loss": 0.005440047010779381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.720023530855542e-06, + "grad_norm": 11.354799270629883, + "learning_rate": 2.763883001271725e-07, + "loss": 0.6132, + "mean_token_accuracy": 0.8087068200111389, + "num_tokens": 24787337.0, + "step": 653 + }, + { + "epoch": 0.08319552219819361, + "ewc_loss": 0.005484495311975479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.74224771601439e-06, + "grad_norm": 11.377352714538574, + "learning_rate": 2.768122085629504e-07, + "loss": 0.6047, + "mean_token_accuracy": 0.8052178621292114, + "num_tokens": 24822736.0, + "step": 654 + }, + { + "epoch": 0.08332273247678412, + "ewc_loss": 0.005487698595970869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.743849336184212e-06, + "grad_norm": 11.410882949829102, + "learning_rate": 2.772361169987283e-07, + "loss": 0.5438, + "mean_token_accuracy": 0.833212673664093, + "num_tokens": 24858743.0, + "step": 655 + }, + { + "epoch": 0.08344994275537464, + "ewc_loss": 0.005493070464581251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.746535301412223e-06, + "grad_norm": 11.272499084472656, + "learning_rate": 2.7766002543450614e-07, + "loss": 0.5527, + "mean_token_accuracy": 0.8246853351593018, + "num_tokens": 24901286.0, + "step": 656 + }, + { + "epoch": 0.08357715303396514, + "ewc_loss": 0.005477119702845812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.738559942372376e-06, + "grad_norm": 11.256720542907715, + "learning_rate": 2.78083933870284e-07, + "loss": 0.6254, + "mean_token_accuracy": 0.8077734708786011, + "num_tokens": 24943457.0, + "step": 657 + }, + { + "epoch": 0.08370436331255565, + "ewc_loss": 0.005505503620952368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7527519250725163e-06, + "grad_norm": 11.357855796813965, + "learning_rate": 2.785078423060619e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8226243853569031, + "num_tokens": 24979247.0, + "step": 658 + }, + { + "epoch": 0.08383157359114617, + "ewc_loss": 0.00552499620243907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.762498070296715e-06, + "grad_norm": 11.342408180236816, + "learning_rate": 2.789317507418398e-07, + "loss": 0.5747, + "mean_token_accuracy": 0.819422721862793, + "num_tokens": 25017456.0, + "step": 659 + }, + { + "epoch": 0.08395878386973668, + "ewc_loss": 0.005516231991350651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7581158974498976e-06, + "grad_norm": 11.368319511413574, + "learning_rate": 2.7935565917761763e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.8305288553237915, + "num_tokens": 25054682.0, + "step": 660 + }, + { + "epoch": 0.08408599414832718, + "ewc_loss": 0.00552136218175292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.760681127256248e-06, + 
"grad_norm": 11.466876983642578, + "learning_rate": 2.797795676133955e-07, + "loss": 0.6068, + "mean_token_accuracy": 0.8103408813476562, + "num_tokens": 25092501.0, + "step": 661 + }, + { + "epoch": 0.0842132044269177, + "ewc_loss": 0.005543003790080547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7715018404705916e-06, + "grad_norm": 11.299896240234375, + "learning_rate": 2.802034760491734e-07, + "loss": 0.4709, + "mean_token_accuracy": 0.8500860929489136, + "num_tokens": 25129253.0, + "step": 662 + }, + { + "epoch": 0.08434041470550821, + "ewc_loss": 0.005527647212147713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.763823658824549e-06, + "grad_norm": 11.235068321228027, + "learning_rate": 2.806273844849512e-07, + "loss": 0.511, + "mean_token_accuracy": 0.8398624658584595, + "num_tokens": 25169119.0, + "step": 663 + }, + { + "epoch": 0.08446762498409871, + "ewc_loss": 0.005534607917070389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7673040676745586e-06, + "grad_norm": 11.321056365966797, + "learning_rate": 2.810512929207291e-07, + "loss": 0.5831, + "mean_token_accuracy": 0.8122686147689819, + "num_tokens": 25207654.0, + "step": 664 + }, + { + "epoch": 0.08459483526268922, + "ewc_loss": 0.0055681555531919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7840778784593567e-06, + "grad_norm": 11.325520515441895, + "learning_rate": 2.8147520135650697e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8307449817657471, + "num_tokens": 25243258.0, + "step": 665 + }, + { + "epoch": 0.08472204554127974, + "ewc_loss": 0.0055632502771914005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.7816251986223506e-06, + "grad_norm": 11.340898513793945, + "learning_rate": 2.8189910979228487e-07, + "loss": 0.5681, + "mean_token_accuracy": 0.8242018818855286, + "num_tokens": 25281301.0, + "step": 666 + }, + { + "epoch": 0.08484925581987024, + "ewc_loss": 0.005580978933721781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.790489361359505e-06, + "grad_norm": 
11.388788223266602, + "learning_rate": 2.823230182280627e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8373658657073975, + "num_tokens": 25313500.0, + "step": 667 + }, + { + "epoch": 0.08497646609846075, + "ewc_loss": 0.00559424702078104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.797123443087912e-06, + "grad_norm": 11.332844734191895, + "learning_rate": 2.827469266638406e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.8250387907028198, + "num_tokens": 25355121.0, + "step": 668 + }, + { + "epoch": 0.08510367637705127, + "ewc_loss": 0.005592887289822102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.796443595798337e-06, + "grad_norm": 11.306378364562988, + "learning_rate": 2.8317083509961846e-07, + "loss": 0.5713, + "mean_token_accuracy": 0.8233135938644409, + "num_tokens": 25398868.0, + "step": 669 + }, + { + "epoch": 0.08523088665564178, + "ewc_loss": 0.005607522092759609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8037611627951264e-06, + "grad_norm": 11.284611701965332, + "learning_rate": 2.8359474353539636e-07, + "loss": 0.5497, + "mean_token_accuracy": 0.8266607522964478, + "num_tokens": 25438670.0, + "step": 670 + }, + { + "epoch": 0.08535809693423228, + "ewc_loss": 0.005613056942820549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.806528527798946e-06, + "grad_norm": 11.490181922912598, + "learning_rate": 2.840186519711742e-07, + "loss": 0.5974, + "mean_token_accuracy": 0.8182306289672852, + "num_tokens": 25479243.0, + "step": 671 + }, + { + "epoch": 0.0854853072128228, + "ewc_loss": 0.005637155380100012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8185777409817092e-06, + "grad_norm": 11.333622932434082, + "learning_rate": 2.844425604069521e-07, + "loss": 0.5803, + "mean_token_accuracy": 0.8169493079185486, + "num_tokens": 25514615.0, + "step": 672 + }, + { + "epoch": 0.08561251749141331, + "ewc_loss": 0.00561023922637105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8051197205058997e-06, + "grad_norm": 11.357234954833984, + 
"learning_rate": 2.8486646884272995e-07, + "loss": 0.559, + "mean_token_accuracy": 0.8244109153747559, + "num_tokens": 25552753.0, + "step": 673 + }, + { + "epoch": 0.08573972777000381, + "ewc_loss": 0.005635987967252731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8179940727568464e-06, + "grad_norm": 11.38831901550293, + "learning_rate": 2.8529037727850785e-07, + "loss": 0.4943, + "mean_token_accuracy": 0.8424568772315979, + "num_tokens": 25587415.0, + "step": 674 + }, + { + "epoch": 0.08586693804859433, + "ewc_loss": 0.005643020384013653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8215101792739006e-06, + "grad_norm": 11.339491844177246, + "learning_rate": 2.857142857142857e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8365230560302734, + "num_tokens": 25625962.0, + "step": 675 + }, + { + "epoch": 0.08599414832718484, + "ewc_loss": 0.005649104714393616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.824552439051331e-06, + "grad_norm": 11.395065307617188, + "learning_rate": 2.861381941500636e-07, + "loss": 0.5741, + "mean_token_accuracy": 0.8205641508102417, + "num_tokens": 25669680.0, + "step": 676 + }, + { + "epoch": 0.08612135860577534, + "ewc_loss": 0.005658426787704229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.829213372024242e-06, + "grad_norm": 11.419456481933594, + "learning_rate": 2.8656210258584144e-07, + "loss": 0.6109, + "mean_token_accuracy": 0.8069543838500977, + "num_tokens": 25709221.0, + "step": 677 + }, + { + "epoch": 0.08624856888436586, + "ewc_loss": 0.005664785858243704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.83239296550164e-06, + "grad_norm": 11.505317687988281, + "learning_rate": 2.869860110216193e-07, + "loss": 0.6101, + "mean_token_accuracy": 0.8070273399353027, + "num_tokens": 25741880.0, + "step": 678 + }, + { + "epoch": 0.08637577916295637, + "ewc_loss": 0.0056812958791852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.840647994162282e-06, + "grad_norm": 11.323237419128418, + "learning_rate": 
2.874099194573972e-07, + "loss": 0.5413, + "mean_token_accuracy": 0.8292951583862305, + "num_tokens": 25786282.0, + "step": 679 + }, + { + "epoch": 0.08650298944154687, + "ewc_loss": 0.005665402393788099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.832701284205541e-06, + "grad_norm": 11.428396224975586, + "learning_rate": 2.878338278931751e-07, + "loss": 0.5898, + "mean_token_accuracy": 0.8177576065063477, + "num_tokens": 25820229.0, + "step": 680 + }, + { + "epoch": 0.08663019972013739, + "ewc_loss": 0.005710831377655268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8554156870086445e-06, + "grad_norm": 11.553864479064941, + "learning_rate": 2.8825773632895293e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8387492895126343, + "num_tokens": 25856465.0, + "step": 681 + }, + { + "epoch": 0.0867574099987279, + "ewc_loss": 0.0057235476560890675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8617737370950636e-06, + "grad_norm": 11.429256439208984, + "learning_rate": 2.886816447647308e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8354797959327698, + "num_tokens": 25897407.0, + "step": 682 + }, + { + "epoch": 0.08688462027731841, + "ewc_loss": 0.0056915609166026115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.845780500138062e-06, + "grad_norm": 11.403107643127441, + "learning_rate": 2.891055532005087e-07, + "loss": 0.5454, + "mean_token_accuracy": 0.8297766447067261, + "num_tokens": 25938825.0, + "step": 683 + }, + { + "epoch": 0.08701183055590891, + "ewc_loss": 0.00572510901838541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8625545382965356e-06, + "grad_norm": 11.440363883972168, + "learning_rate": 2.8952946163628657e-07, + "loss": 0.578, + "mean_token_accuracy": 0.8161144852638245, + "num_tokens": 25974944.0, + "step": 684 + }, + { + "epoch": 0.08713904083449943, + "ewc_loss": 0.005738528911024332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.869264562832541e-06, + "grad_norm": 11.498136520385742, + "learning_rate": 
2.899533700720644e-07, + "loss": 0.5354, + "mean_token_accuracy": 0.8297090530395508, + "num_tokens": 26012864.0, + "step": 685 + }, + { + "epoch": 0.08726625111308994, + "ewc_loss": 0.005745175760239363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.872587856472819e-06, + "grad_norm": 11.407893180847168, + "learning_rate": 2.9037727850784227e-07, + "loss": 0.5824, + "mean_token_accuracy": 0.8193343877792358, + "num_tokens": 26049779.0, + "step": 686 + }, + { + "epoch": 0.08739346139168044, + "ewc_loss": 0.005751427728682756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8757137897628127e-06, + "grad_norm": 11.504993438720703, + "learning_rate": 2.9080118694362016e-07, + "loss": 0.5519, + "mean_token_accuracy": 0.828696072101593, + "num_tokens": 26089095.0, + "step": 687 + }, + { + "epoch": 0.08752067167027096, + "ewc_loss": 0.0057851714082062244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.892585598601727e-06, + "grad_norm": 11.435425758361816, + "learning_rate": 2.9122509537939806e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.8278048634529114, + "num_tokens": 26132553.0, + "step": 688 + }, + { + "epoch": 0.08764788194886147, + "ewc_loss": 0.005747829098254442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8739145818690304e-06, + "grad_norm": 11.454157829284668, + "learning_rate": 2.916490038151759e-07, + "loss": 0.6013, + "mean_token_accuracy": 0.8098485469818115, + "num_tokens": 26171301.0, + "step": 689 + }, + { + "epoch": 0.08777509222745197, + "ewc_loss": 0.005785519257187843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.892759539463441e-06, + "grad_norm": 11.54537296295166, + "learning_rate": 2.9207291225095376e-07, + "loss": 0.5894, + "mean_token_accuracy": 0.8151947855949402, + "num_tokens": 26217871.0, + "step": 690 + }, + { + "epoch": 0.08790230250604249, + "ewc_loss": 0.005788744427263737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8943723009433597e-06, + "grad_norm": 11.471572875976562, + "learning_rate": 
2.9249682068673166e-07, + "loss": 0.5527, + "mean_token_accuracy": 0.8248882293701172, + "num_tokens": 26255927.0, + "step": 691 + }, + { + "epoch": 0.088029512784633, + "ewc_loss": 0.00578324543312192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8916226710862247e-06, + "grad_norm": 11.662236213684082, + "learning_rate": 2.9292072912250955e-07, + "loss": 0.5362, + "mean_token_accuracy": 0.8337932229042053, + "num_tokens": 26296041.0, + "step": 692 + }, + { + "epoch": 0.0881567230632235, + "ewc_loss": 0.005809086840599775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.904543407566962e-06, + "grad_norm": 11.51551342010498, + "learning_rate": 2.933446375582874e-07, + "loss": 0.6187, + "mean_token_accuracy": 0.8082907199859619, + "num_tokens": 26333785.0, + "step": 693 + }, + { + "epoch": 0.08828393334181402, + "ewc_loss": 0.005768727511167526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8843637664976995e-06, + "grad_norm": 11.500516891479492, + "learning_rate": 2.9376854599406525e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.8323816061019897, + "num_tokens": 26368837.0, + "step": 694 + }, + { + "epoch": 0.08841114362040453, + "ewc_loss": 0.0058028302155435085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.901415200540214e-06, + "grad_norm": 11.662487030029297, + "learning_rate": 2.9419245442984315e-07, + "loss": 0.6004, + "mean_token_accuracy": 0.8153785467147827, + "num_tokens": 26402151.0, + "step": 695 + }, + { + "epoch": 0.08853835389899505, + "ewc_loss": 0.00582801504060626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.914007609433611e-06, + "grad_norm": 11.514610290527344, + "learning_rate": 2.9461636286562104e-07, + "loss": 0.5514, + "mean_token_accuracy": 0.8261693716049194, + "num_tokens": 26435550.0, + "step": 696 + }, + { + "epoch": 0.08866556417758555, + "ewc_loss": 0.005785664077848196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.8928320716659073e-06, + "grad_norm": 11.435937881469727, + "learning_rate": 
2.9504027130139884e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.8305484056472778, + "num_tokens": 26479299.0, + "step": 697 + }, + { + "epoch": 0.08879277445617606, + "ewc_loss": 0.005815105978399515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9075529255351285e-06, + "grad_norm": 11.49157428741455, + "learning_rate": 2.9546417973717674e-07, + "loss": 0.5776, + "mean_token_accuracy": 0.8190442323684692, + "num_tokens": 26519380.0, + "step": 698 + }, + { + "epoch": 0.08891998473476657, + "ewc_loss": 0.00582378264516592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9118912152625853e-06, + "grad_norm": 11.478804588317871, + "learning_rate": 2.9588808817295464e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8389405012130737, + "num_tokens": 26557306.0, + "step": 699 + }, + { + "epoch": 0.08904719501335707, + "ewc_loss": 0.005830670706927776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.915335244324524e-06, + "grad_norm": 11.834774017333984, + "learning_rate": 2.9631199660873253e-07, + "loss": 0.5754, + "mean_token_accuracy": 0.8200992941856384, + "num_tokens": 26593549.0, + "step": 700 + }, + { + "epoch": 0.08917440529194759, + "ewc_loss": 0.005886910483241081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.943455228887615e-06, + "grad_norm": 11.596132278442383, + "learning_rate": 2.9673590504451033e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8441873788833618, + "num_tokens": 26629926.0, + "step": 701 + }, + { + "epoch": 0.0893016155705381, + "ewc_loss": 0.005804379470646381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9021896352787735e-06, + "grad_norm": 11.405269622802734, + "learning_rate": 2.9715981348028823e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8343173265457153, + "num_tokens": 26668652.0, + "step": 702 + }, + { + "epoch": 0.0894288258491286, + "ewc_loss": 0.005850575398653746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9252876174723497e-06, + "grad_norm": 11.767586708068848, + "learning_rate": 
2.975837219160661e-07, + "loss": 0.5943, + "mean_token_accuracy": 0.8139408826828003, + "num_tokens": 26708084.0, + "step": 703 + }, + { + "epoch": 0.08955603612771912, + "ewc_loss": 0.005911510903388262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9557554626080673e-06, + "grad_norm": 11.578049659729004, + "learning_rate": 2.98007630351844e-07, + "loss": 0.5595, + "mean_token_accuracy": 0.8263846039772034, + "num_tokens": 26750134.0, + "step": 704 + }, + { + "epoch": 0.08968324640630963, + "ewc_loss": 0.005852935370057821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9264676868479e-06, + "grad_norm": 11.666360855102539, + "learning_rate": 2.984315387876218e-07, + "loss": 0.5674, + "mean_token_accuracy": 0.8210987448692322, + "num_tokens": 26785410.0, + "step": 705 + }, + { + "epoch": 0.08981045668490013, + "ewc_loss": 0.005901082884520292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9505415568564786e-06, + "grad_norm": 11.62846565246582, + "learning_rate": 2.988554472233997e-07, + "loss": 0.5709, + "mean_token_accuracy": 0.8241261839866638, + "num_tokens": 26825226.0, + "step": 706 + }, + { + "epoch": 0.08993766696349065, + "ewc_loss": 0.005897302646189928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.948651399492519e-06, + "grad_norm": 11.545939445495605, + "learning_rate": 2.992793556591776e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8226593732833862, + "num_tokens": 26865401.0, + "step": 707 + }, + { + "epoch": 0.09006487724208116, + "ewc_loss": 0.0058852373622357845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.942618721135659e-06, + "grad_norm": 11.550122261047363, + "learning_rate": 2.997032640949555e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8210783004760742, + "num_tokens": 26905365.0, + "step": 708 + }, + { + "epoch": 0.09019208752067168, + "ewc_loss": 0.005933335050940514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.966667580039939e-06, + "grad_norm": 11.557209968566895, + "learning_rate": 3.001271725307333e-07, + 
"loss": 0.5152, + "mean_token_accuracy": 0.8376485705375671, + "num_tokens": 26941348.0, + "step": 709 + }, + { + "epoch": 0.09031929779926218, + "ewc_loss": 0.0059317233972251415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.965861767734168e-06, + "grad_norm": 11.669419288635254, + "learning_rate": 3.005510809665112e-07, + "loss": 0.5505, + "mean_token_accuracy": 0.8276986479759216, + "num_tokens": 26975803.0, + "step": 710 + }, + { + "epoch": 0.09044650807785269, + "ewc_loss": 0.005949388258159161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9746940981567604e-06, + "grad_norm": 11.693826675415039, + "learning_rate": 3.009749894022891e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.8435413837432861, + "num_tokens": 27014931.0, + "step": 711 + }, + { + "epoch": 0.0905737183564432, + "ewc_loss": 0.005960449576377869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9802247354382416e-06, + "grad_norm": 11.725461959838867, + "learning_rate": 3.01398897838067e-07, + "loss": 0.5546, + "mean_token_accuracy": 0.827649712562561, + "num_tokens": 27055505.0, + "step": 712 + }, + { + "epoch": 0.0907009286350337, + "ewc_loss": 0.0059587243013083935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.97936207971361e-06, + "grad_norm": 11.59937572479248, + "learning_rate": 3.018228062738448e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8211933970451355, + "num_tokens": 27092121.0, + "step": 713 + }, + { + "epoch": 0.09082813891362422, + "ewc_loss": 0.005975589156150818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.987794687214773e-06, + "grad_norm": 11.682046890258789, + "learning_rate": 3.022467147096227e-07, + "loss": 0.5179, + "mean_token_accuracy": 0.8370579481124878, + "num_tokens": 27129827.0, + "step": 714 + }, + { + "epoch": 0.09095534919221474, + "ewc_loss": 0.0059896609745919704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9948305382276885e-06, + "grad_norm": 11.670689582824707, + "learning_rate": 3.026706231454006e-07, + "loss": 0.5352, + 
"mean_token_accuracy": 0.8314616084098816, + "num_tokens": 27174977.0, + "step": 715 + }, + { + "epoch": 0.09108255947080524, + "ewc_loss": 0.0059889936819672585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.994496753672138e-06, + "grad_norm": 11.791644096374512, + "learning_rate": 3.0309453158117844e-07, + "loss": 0.5412, + "mean_token_accuracy": 0.8285825252532959, + "num_tokens": 27211472.0, + "step": 716 + }, + { + "epoch": 0.09120976974939575, + "ewc_loss": 0.006010682787746191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.005341341122403e-06, + "grad_norm": 11.801396369934082, + "learning_rate": 3.035184400169563e-07, + "loss": 0.6256, + "mean_token_accuracy": 0.8087555766105652, + "num_tokens": 27249160.0, + "step": 717 + }, + { + "epoch": 0.09133698002798626, + "ewc_loss": 0.005989694502204657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9948473638796713e-06, + "grad_norm": 11.635600090026855, + "learning_rate": 3.039423484527342e-07, + "loss": 0.5322, + "mean_token_accuracy": 0.8317052721977234, + "num_tokens": 27287946.0, + "step": 718 + }, + { + "epoch": 0.09146419030657676, + "ewc_loss": 0.005993937607854605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.9969687602715567e-06, + "grad_norm": 11.621926307678223, + "learning_rate": 3.043662568885121e-07, + "loss": 0.5671, + "mean_token_accuracy": 0.8177648782730103, + "num_tokens": 27323388.0, + "step": 719 + }, + { + "epoch": 0.09159140058516728, + "ewc_loss": 0.006013914477080107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.006957285833778e-06, + "grad_norm": 11.741307258605957, + "learning_rate": 3.0479016532428993e-07, + "loss": 0.6136, + "mean_token_accuracy": 0.8075833320617676, + "num_tokens": 27357524.0, + "step": 720 + }, + { + "epoch": 0.0917186108637578, + "ewc_loss": 0.0060357809998095036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0178905490174657e-06, + "grad_norm": 11.69965934753418, + "learning_rate": 3.052140737600678e-07, + "loss": 0.5202, + 
"mean_token_accuracy": 0.8349843621253967, + "num_tokens": 27392037.0, + "step": 721 + }, + { + "epoch": 0.09184582114234831, + "ewc_loss": 0.006052385084331036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0261926440289244e-06, + "grad_norm": 11.696019172668457, + "learning_rate": 3.056379821958457e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.8323866128921509, + "num_tokens": 27429149.0, + "step": 722 + }, + { + "epoch": 0.09197303142093881, + "ewc_loss": 0.006063446402549744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0317232813104056e-06, + "grad_norm": 11.771525382995605, + "learning_rate": 3.060618906316236e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.8332773447036743, + "num_tokens": 27465023.0, + "step": 723 + }, + { + "epoch": 0.09210024169952932, + "ewc_loss": 0.006085332483053207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0426663215621375e-06, + "grad_norm": 11.757281303405762, + "learning_rate": 3.064857990674014e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8341648578643799, + "num_tokens": 27501268.0, + "step": 724 + }, + { + "epoch": 0.09222745197811984, + "ewc_loss": 0.0060761175118386745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.038058821402956e-06, + "grad_norm": 11.905233383178711, + "learning_rate": 3.0690970750317927e-07, + "loss": 0.5814, + "mean_token_accuracy": 0.8185176849365234, + "num_tokens": 27539937.0, + "step": 725 + }, + { + "epoch": 0.09235466225671034, + "ewc_loss": 0.006103095132857561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.05154753732495e-06, + "grad_norm": 11.720846176147461, + "learning_rate": 3.0733361593895717e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8295373320579529, + "num_tokens": 27574576.0, + "step": 726 + }, + { + "epoch": 0.09248187253530085, + "ewc_loss": 0.00605970760807395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0298538149509113e-06, + "grad_norm": 11.766992568969727, + "learning_rate": 3.0775752437473507e-07, + "loss": 0.552, + 
"mean_token_accuracy": 0.8230910301208496, + "num_tokens": 27616036.0, + "step": 727 + }, + { + "epoch": 0.09260908281389137, + "ewc_loss": 0.006107610650360584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0538053579221014e-06, + "grad_norm": 11.702468872070312, + "learning_rate": 3.081814328105129e-07, + "loss": 0.5761, + "mean_token_accuracy": 0.8197271227836609, + "num_tokens": 27662040.0, + "step": 728 + }, + { + "epoch": 0.09273629309248187, + "ewc_loss": 0.0061142886988818645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0571443403459853e-06, + "grad_norm": 11.80589485168457, + "learning_rate": 3.0860534124629076e-07, + "loss": 0.6056, + "mean_token_accuracy": 0.8114675879478455, + "num_tokens": 27705136.0, + "step": 729 + }, + { + "epoch": 0.09286350337107238, + "ewc_loss": 0.006137726828455925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.068863406952005e-06, + "grad_norm": 11.816383361816406, + "learning_rate": 3.0902924968206866e-07, + "loss": 0.5558, + "mean_token_accuracy": 0.8250967264175415, + "num_tokens": 27740564.0, + "step": 730 + }, + { + "epoch": 0.0929907136496629, + "ewc_loss": 0.0061274622566998005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0637311283499002e-06, + "grad_norm": 11.95790958404541, + "learning_rate": 3.0945315811784656e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.8318255543708801, + "num_tokens": 27774788.0, + "step": 731 + }, + { + "epoch": 0.0931179239282534, + "ewc_loss": 0.00616049999371171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0802500532445265e-06, + "grad_norm": 11.752933502197266, + "learning_rate": 3.098770665536244e-07, + "loss": 0.6728, + "mean_token_accuracy": 0.7888801097869873, + "num_tokens": 27818300.0, + "step": 732 + }, + { + "epoch": 0.09324513420684391, + "ewc_loss": 0.006126969121396542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0634846552857198e-06, + "grad_norm": 11.7686185836792, + "learning_rate": 3.1030097498940225e-07, + "loss": 0.5131, + 
"mean_token_accuracy": 0.8378991484642029, + "num_tokens": 27857739.0, + "step": 733 + }, + { + "epoch": 0.09337234448543442, + "ewc_loss": 0.006166747771203518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.083373940171441e-06, + "grad_norm": 11.903634071350098, + "learning_rate": 3.1072488342518015e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8211919665336609, + "num_tokens": 27893897.0, + "step": 734 + }, + { + "epoch": 0.09349955476402494, + "ewc_loss": 0.006187377963215113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0936889743315987e-06, + "grad_norm": 11.853193283081055, + "learning_rate": 3.11148791860958e-07, + "loss": 0.5403, + "mean_token_accuracy": 0.8270620703697205, + "num_tokens": 27930511.0, + "step": 735 + }, + { + "epoch": 0.09362676504261544, + "ewc_loss": 0.006160305347293615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.080152737311437e-06, + "grad_norm": 11.987419128417969, + "learning_rate": 3.115727002967359e-07, + "loss": 0.5593, + "mean_token_accuracy": 0.8229107856750488, + "num_tokens": 27961128.0, + "step": 736 + }, + { + "epoch": 0.09375397532120595, + "ewc_loss": 0.0062039014883339405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.101950824202504e-06, + "grad_norm": 11.7761812210083, + "learning_rate": 3.1199660873251374e-07, + "loss": 0.5594, + "mean_token_accuracy": 0.8254501223564148, + "num_tokens": 28000748.0, + "step": 737 + }, + { + "epoch": 0.09388118559979647, + "ewc_loss": 0.0061653973534703255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0826986403553747e-06, + "grad_norm": 11.784613609313965, + "learning_rate": 3.1242051716829164e-07, + "loss": 0.592, + "mean_token_accuracy": 0.8127811551094055, + "num_tokens": 28046455.0, + "step": 738 + }, + { + "epoch": 0.09400839587838697, + "ewc_loss": 0.006191709544509649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.0958547085901955e-06, + "grad_norm": 11.828987121582031, + "learning_rate": 3.128444256040695e-07, + "loss": 0.5684, + 
"mean_token_accuracy": 0.8230804204940796, + "num_tokens": 28087026.0, + "step": 739 + }, + { + "epoch": 0.09413560615697748, + "ewc_loss": 0.00622967816889286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1148390462476527e-06, + "grad_norm": 11.9083890914917, + "learning_rate": 3.132683340398474e-07, + "loss": 0.4905, + "mean_token_accuracy": 0.8422642946243286, + "num_tokens": 28126382.0, + "step": 740 + }, + { + "epoch": 0.094262816435568, + "ewc_loss": 0.006222436670213938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1112183478398947e-06, + "grad_norm": 11.783622741699219, + "learning_rate": 3.1369224247562523e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8377543687820435, + "num_tokens": 28161198.0, + "step": 741 + }, + { + "epoch": 0.0943900267141585, + "ewc_loss": 0.006217800080776215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1089000458450755e-06, + "grad_norm": 11.868812561035156, + "learning_rate": 3.1411615091140313e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8460026979446411, + "num_tokens": 28204612.0, + "step": 742 + }, + { + "epoch": 0.09451723699274901, + "ewc_loss": 0.006254938431084156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1274691991711734e-06, + "grad_norm": 11.862485885620117, + "learning_rate": 3.14540059347181e-07, + "loss": 0.5411, + "mean_token_accuracy": 0.8290954828262329, + "num_tokens": 28241050.0, + "step": 743 + }, + { + "epoch": 0.09464444727133953, + "ewc_loss": 0.006238434463739395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.119217126368312e-06, + "grad_norm": 11.813831329345703, + "learning_rate": 3.149639677829589e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.839342474937439, + "num_tokens": 28278032.0, + "step": 744 + }, + { + "epoch": 0.09477165754993004, + "ewc_loss": 0.006249801255762577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.124900558759691e-06, + "grad_norm": 11.928009033203125, + "learning_rate": 3.153878762187368e-07, + "loss": 0.5879, + "mean_token_accuracy": 
0.8201370239257812, + "num_tokens": 28315599.0, + "step": 745 + }, + { + "epoch": 0.09489886782852054, + "ewc_loss": 0.006274838466197252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1374192985822447e-06, + "grad_norm": 11.816264152526855, + "learning_rate": 3.158117846545146e-07, + "loss": 0.6111, + "mean_token_accuracy": 0.8115500807762146, + "num_tokens": 28358504.0, + "step": 746 + }, + { + "epoch": 0.09502607810711106, + "ewc_loss": 0.006262133829295635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1310669328377116e-06, + "grad_norm": 11.97059440612793, + "learning_rate": 3.1623569309029247e-07, + "loss": 0.5274, + "mean_token_accuracy": 0.8330526351928711, + "num_tokens": 28389626.0, + "step": 747 + }, + { + "epoch": 0.09515328838570157, + "ewc_loss": 0.006300721783190966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1503609534411225e-06, + "grad_norm": 11.816241264343262, + "learning_rate": 3.1665960152607037e-07, + "loss": 0.5113, + "mean_token_accuracy": 0.8375198841094971, + "num_tokens": 28430361.0, + "step": 748 + }, + { + "epoch": 0.09528049866429207, + "ewc_loss": 0.006277224514633417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1386123282572953e-06, + "grad_norm": 11.915359497070312, + "learning_rate": 3.1708350996184826e-07, + "loss": 0.5478, + "mean_token_accuracy": 0.8270859122276306, + "num_tokens": 28469792.0, + "step": 749 + }, + { + "epoch": 0.09540770894288259, + "ewc_loss": 0.006306803319603205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.153401621602825e-06, + "grad_norm": 12.018537521362305, + "learning_rate": 3.175074183976261e-07, + "loss": 0.527, + "mean_token_accuracy": 0.8365645408630371, + "num_tokens": 28507569.0, + "step": 750 + }, + { + "epoch": 0.0955349192214731, + "ewc_loss": 0.006301569286733866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1507845505984733e-06, + "grad_norm": 11.976143836975098, + "learning_rate": 3.1793132683340396e-07, + "loss": 0.5534, + "mean_token_accuracy": 0.8258985877037048, 
+ "num_tokens": 28542336.0, + "step": 751 + }, + { + "epoch": 0.0956621295000636, + "ewc_loss": 0.00632630055770278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1631502679374535e-06, + "grad_norm": 11.941495895385742, + "learning_rate": 3.1835523526918186e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.840933084487915, + "num_tokens": 28578587.0, + "step": 752 + }, + { + "epoch": 0.09578933977865411, + "ewc_loss": 0.0063324118964374065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.166205942761735e-06, + "grad_norm": 12.045459747314453, + "learning_rate": 3.1877914370495975e-07, + "loss": 0.5833, + "mean_token_accuracy": 0.8178318738937378, + "num_tokens": 28617505.0, + "step": 753 + }, + { + "epoch": 0.09591655005724463, + "ewc_loss": 0.006334708537906408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1673541798227234e-06, + "grad_norm": 11.947373390197754, + "learning_rate": 3.1920305214073755e-07, + "loss": 0.5893, + "mean_token_accuracy": 0.8169721364974976, + "num_tokens": 28658526.0, + "step": 754 + }, + { + "epoch": 0.09604376033583513, + "ewc_loss": 0.0063335527665913105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.166776423313422e-06, + "grad_norm": 11.898043632507324, + "learning_rate": 3.1962696057651545e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8337730169296265, + "num_tokens": 28698825.0, + "step": 755 + }, + { + "epoch": 0.09617097061442564, + "ewc_loss": 0.00633025448769331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1651272820454324e-06, + "grad_norm": 11.90087604522705, + "learning_rate": 3.2005086901229335e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.820233166217804, + "num_tokens": 28733238.0, + "step": 756 + }, + { + "epoch": 0.09629818089301616, + "ewc_loss": 0.006351111456751823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.175555775669636e-06, + "grad_norm": 11.949928283691406, + "learning_rate": 3.2047477744807125e-07, + "loss": 0.5688, + "mean_token_accuracy": 0.8241391777992249, + "num_tokens": 
28771552.0, + "step": 757 + }, + { + "epoch": 0.09642539117160667, + "ewc_loss": 0.006365678738802671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1828394639887847e-06, + "grad_norm": 11.926029205322266, + "learning_rate": 3.2089868588384904e-07, + "loss": 0.6014, + "mean_token_accuracy": 0.8106160163879395, + "num_tokens": 28810827.0, + "step": 758 + }, + { + "epoch": 0.09655260145019717, + "ewc_loss": 0.00637401407584548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1870069960859837e-06, + "grad_norm": 11.98727798461914, + "learning_rate": 3.2132259431962694e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8399950861930847, + "num_tokens": 28849017.0, + "step": 759 + }, + { + "epoch": 0.09667981172878769, + "ewc_loss": 0.006388203240931034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1941015095071634e-06, + "grad_norm": 11.9331636428833, + "learning_rate": 3.2174650275540484e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.8287992477416992, + "num_tokens": 28887659.0, + "step": 760 + }, + { + "epoch": 0.0968070220073782, + "ewc_loss": 0.0063806334510445595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.1903166473057354e-06, + "grad_norm": 11.954100608825684, + "learning_rate": 3.2217041119118274e-07, + "loss": 0.5283, + "mean_token_accuracy": 0.8326194286346436, + "num_tokens": 28928983.0, + "step": 761 + }, + { + "epoch": 0.0969342322859687, + "ewc_loss": 0.006394014693796635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.197007345079328e-06, + "grad_norm": 12.033846855163574, + "learning_rate": 3.2259431962696053e-07, + "loss": 0.563, + "mean_token_accuracy": 0.8244976997375488, + "num_tokens": 28962195.0, + "step": 762 + }, + { + "epoch": 0.09706144256455922, + "ewc_loss": 0.006415863987058401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.207931968063349e-06, + "grad_norm": 11.956218719482422, + "learning_rate": 3.2301822806273843e-07, + "loss": 0.5445, + "mean_token_accuracy": 0.8276212215423584, + "num_tokens": 28997351.0, + "step": 
763 + }, + { + "epoch": 0.09718865284314973, + "ewc_loss": 0.006404623854905367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2023119729274185e-06, + "grad_norm": 12.00650691986084, + "learning_rate": 3.2344213649851633e-07, + "loss": 0.5719, + "mean_token_accuracy": 0.8221163153648376, + "num_tokens": 29040988.0, + "step": 764 + }, + { + "epoch": 0.09731586312174023, + "ewc_loss": 0.006429630797356367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2148154787137173e-06, + "grad_norm": 11.922638893127441, + "learning_rate": 3.238660449342942e-07, + "loss": 0.5329, + "mean_token_accuracy": 0.8302074670791626, + "num_tokens": 29081873.0, + "step": 765 + }, + { + "epoch": 0.09744307340033075, + "ewc_loss": 0.006418670993298292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.20933554576186e-06, + "grad_norm": 12.0604829788208, + "learning_rate": 3.24289953370072e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8376554250717163, + "num_tokens": 29115360.0, + "step": 766 + }, + { + "epoch": 0.09757028367892126, + "ewc_loss": 0.006445659324526787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.222829718652065e-06, + "grad_norm": 11.970026969909668, + "learning_rate": 3.247138618058499e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8230733871459961, + "num_tokens": 29160139.0, + "step": 767 + }, + { + "epoch": 0.09769749395751176, + "ewc_loss": 0.006424406543374062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.21220318255655e-06, + "grad_norm": 12.010567665100098, + "learning_rate": 3.251377702416278e-07, + "loss": 0.5785, + "mean_token_accuracy": 0.8197982907295227, + "num_tokens": 29202872.0, + "step": 768 + }, + { + "epoch": 0.09782470423610228, + "ewc_loss": 0.0064560845494270325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.228042260161601e-06, + "grad_norm": 11.990036964416504, + "learning_rate": 3.255616786774057e-07, + "loss": 0.5253, + "mean_token_accuracy": 0.834190845489502, + "num_tokens": 29246965.0, + "step": 769 + }, + { + "epoch": 
0.09795191451469279, + "ewc_loss": 0.006469881162047386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2349405501008732e-06, + "grad_norm": 11.989995956420898, + "learning_rate": 3.259855871131835e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8517642617225647, + "num_tokens": 29289531.0, + "step": 770 + }, + { + "epoch": 0.0980791247932833, + "ewc_loss": 0.006469287443906069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.23464382745442e-06, + "grad_norm": 11.944729804992676, + "learning_rate": 3.264094955489614e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8348984718322754, + "num_tokens": 29330484.0, + "step": 771 + }, + { + "epoch": 0.0982063350718738, + "ewc_loss": 0.00647332426160574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2366622235713294e-06, + "grad_norm": 12.037476539611816, + "learning_rate": 3.268334039847393e-07, + "loss": 0.5498, + "mean_token_accuracy": 0.825150191783905, + "num_tokens": 29369595.0, + "step": 772 + }, + { + "epoch": 0.09833354535046432, + "ewc_loss": 0.006512132473289967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.256066293033655e-06, + "grad_norm": 12.06771469116211, + "learning_rate": 3.2725731242051715e-07, + "loss": 0.5376, + "mean_token_accuracy": 0.8336413502693176, + "num_tokens": 29411856.0, + "step": 773 + }, + { + "epoch": 0.09846075562905483, + "ewc_loss": 0.006494714878499508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.247357426516828e-06, + "grad_norm": 12.038925170898438, + "learning_rate": 3.27681220856295e-07, + "loss": 0.5203, + "mean_token_accuracy": 0.8371672034263611, + "num_tokens": 29454754.0, + "step": 774 + }, + { + "epoch": 0.09858796590764533, + "ewc_loss": 0.0065019624307751656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2509813081560424e-06, + "grad_norm": 12.112387657165527, + "learning_rate": 3.281051292920729e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.8343147039413452, + "num_tokens": 29489131.0, + "step": 775 + }, + { + "epoch": 0.09871517618623585, + 
"ewc_loss": 0.006524414289742708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2622072012600256e-06, + "grad_norm": 12.094151496887207, + "learning_rate": 3.285290377278508e-07, + "loss": 0.57, + "mean_token_accuracy": 0.8180156946182251, + "num_tokens": 29528292.0, + "step": 776 + }, + { + "epoch": 0.09884238646482636, + "ewc_loss": 0.006511457730084658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.255728870499297e-06, + "grad_norm": 12.155024528503418, + "learning_rate": 3.2895294616362864e-07, + "loss": 0.5016, + "mean_token_accuracy": 0.8414174318313599, + "num_tokens": 29564878.0, + "step": 777 + }, + { + "epoch": 0.09896959674341686, + "ewc_loss": 0.0065282974392175674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2641487450746354e-06, + "grad_norm": 12.10726547241211, + "learning_rate": 3.293768545994065e-07, + "loss": 0.5232, + "mean_token_accuracy": 0.8301510810852051, + "num_tokens": 29602826.0, + "step": 778 + }, + { + "epoch": 0.09909680702200738, + "ewc_loss": 0.0065223947167396545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2611974347673822e-06, + "grad_norm": 12.105180740356445, + "learning_rate": 3.298007630351844e-07, + "loss": 0.5908, + "mean_token_accuracy": 0.8172543048858643, + "num_tokens": 29640323.0, + "step": 779 + }, + { + "epoch": 0.09922401730059789, + "ewc_loss": 0.006523855496197939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.261927759012906e-06, + "grad_norm": 12.105371475219727, + "learning_rate": 3.302246714709623e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.834348738193512, + "num_tokens": 29683302.0, + "step": 780 + }, + { + "epoch": 0.09935122757918839, + "ewc_loss": 0.006525087170302868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.262543486926006e-06, + "grad_norm": 12.104351043701172, + "learning_rate": 3.3064857990674013e-07, + "loss": 0.5461, + "mean_token_accuracy": 0.8299594521522522, + "num_tokens": 29721085.0, + "step": 781 + }, + { + "epoch": 0.0994784378577789, + "ewc_loss": 
0.006541003938764334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.27050202031387e-06, + "grad_norm": 12.163230895996094, + "learning_rate": 3.31072488342518e-07, + "loss": 0.497, + "mean_token_accuracy": 0.8404562473297119, + "num_tokens": 29753810.0, + "step": 782 + }, + { + "epoch": 0.09960564813636942, + "ewc_loss": 0.006549245212227106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.274622713433928e-06, + "grad_norm": 12.072494506835938, + "learning_rate": 3.314963967782959e-07, + "loss": 0.5258, + "mean_token_accuracy": 0.8340047597885132, + "num_tokens": 29796488.0, + "step": 783 + }, + { + "epoch": 0.09973285841495994, + "ewc_loss": 0.0065582264214754105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2791131161502562e-06, + "grad_norm": 12.07417106628418, + "learning_rate": 3.319203052140738e-07, + "loss": 0.535, + "mean_token_accuracy": 0.830941379070282, + "num_tokens": 29838181.0, + "step": 784 + }, + { + "epoch": 0.09986006869355044, + "ewc_loss": 0.006576157175004482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.2880786875466583e-06, + "grad_norm": 12.363142967224121, + "learning_rate": 3.323442136498516e-07, + "loss": 0.5163, + "mean_token_accuracy": 0.8367090821266174, + "num_tokens": 29868893.0, + "step": 785 + }, + { + "epoch": 0.09998727897214095, + "ewc_loss": 0.006626861169934273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3134306249849033e-06, + "grad_norm": 12.2491455078125, + "learning_rate": 3.3276812208562947e-07, + "loss": 0.5336, + "mean_token_accuracy": 0.8295363187789917, + "num_tokens": 29903571.0, + "step": 786 + }, + { + "epoch": 0.10011448925073146, + "ewc_loss": 0.006580060813575983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.290030463176663e-06, + "grad_norm": 12.144720077514648, + "learning_rate": 3.3319203052140737e-07, + "loss": 0.5908, + "mean_token_accuracy": 0.8206892013549805, + "num_tokens": 29940438.0, + "step": 787 + }, + { + "epoch": 0.10024169952932196, + "ewc_loss": 0.006596414837986231, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.298207502666628e-06, + "grad_norm": 12.149848937988281, + "learning_rate": 3.336159389571852e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8418537378311157, + "num_tokens": 29976538.0, + "step": 788 + }, + { + "epoch": 0.10036890980791248, + "ewc_loss": 0.0066258348524570465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.312917442599428e-06, + "grad_norm": 12.162020683288574, + "learning_rate": 3.340398473929631e-07, + "loss": 0.5833, + "mean_token_accuracy": 0.817502498626709, + "num_tokens": 30019814.0, + "step": 789 + }, + { + "epoch": 0.100496120086503, + "ewc_loss": 0.006619373802095652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3096869174187304e-06, + "grad_norm": 12.200891494750977, + "learning_rate": 3.3446375582874096e-07, + "loss": 0.5328, + "mean_token_accuracy": 0.8329863548278809, + "num_tokens": 30057578.0, + "step": 790 + }, + { + "epoch": 0.1006233303650935, + "ewc_loss": 0.006636306643486023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3181534035975346e-06, + "grad_norm": 12.189864158630371, + "learning_rate": 3.3488766426451886e-07, + "loss": 0.6061, + "mean_token_accuracy": 0.8188556432723999, + "num_tokens": 30094869.0, + "step": 791 + }, + { + "epoch": 0.10075054064368401, + "ewc_loss": 0.006623784080147743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.311891987323179e-06, + "grad_norm": 12.113360404968262, + "learning_rate": 3.353115727002967e-07, + "loss": 0.5664, + "mean_token_accuracy": 0.823236346244812, + "num_tokens": 30136446.0, + "step": 792 + }, + { + "epoch": 0.10087775092227452, + "ewc_loss": 0.006630961317569017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3154806260426994e-06, + "grad_norm": 12.12206745147705, + "learning_rate": 3.357354811360746e-07, + "loss": 0.56, + "mean_token_accuracy": 0.8202242255210876, + "num_tokens": 30179226.0, + "step": 793 + }, + { + "epoch": 0.10100496120086502, + "ewc_loss": 0.006673469673842192, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 3.3367348351021064e-06, + "grad_norm": 12.258956909179688, + "learning_rate": 3.3615938957185245e-07, + "loss": 0.5519, + "mean_token_accuracy": 0.8262234330177307, + "num_tokens": 30213890.0, + "step": 794 + }, + { + "epoch": 0.10113217147945554, + "ewc_loss": 0.006672114599496126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3360572615492856e-06, + "grad_norm": 12.24244213104248, + "learning_rate": 3.3658329800763035e-07, + "loss": 0.6014, + "mean_token_accuracy": 0.8073199987411499, + "num_tokens": 30245691.0, + "step": 795 + }, + { + "epoch": 0.10125938175804605, + "ewc_loss": 0.006668453570455313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.334226676088292e-06, + "grad_norm": 12.183706283569336, + "learning_rate": 3.370072064434082e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.8180795311927795, + "num_tokens": 30281738.0, + "step": 796 + }, + { + "epoch": 0.10138659203663657, + "ewc_loss": 0.00667256535962224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.33628258886165e-06, + "grad_norm": 12.1404447555542, + "learning_rate": 3.374311148791861e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8336504101753235, + "num_tokens": 30320707.0, + "step": 797 + }, + { + "epoch": 0.10151380231522707, + "ewc_loss": 0.006688574329018593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3442870517319534e-06, + "grad_norm": 12.292211532592773, + "learning_rate": 3.3785502331496394e-07, + "loss": 0.531, + "mean_token_accuracy": 0.829165518283844, + "num_tokens": 30351379.0, + "step": 798 + }, + { + "epoch": 0.10164101259381758, + "ewc_loss": 0.00673361960798502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.366809778526658e-06, + "grad_norm": 12.270169258117676, + "learning_rate": 3.3827893175074184e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8235752582550049, + "num_tokens": 30382395.0, + "step": 799 + }, + { + "epoch": 0.1017682228724081, + "ewc_loss": 0.006731044966727495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.3655223887762986e-06, + "grad_norm": 12.306097984313965, + "learning_rate": 3.387028401865197e-07, + "loss": 0.5597, + "mean_token_accuracy": 0.8272498250007629, + "num_tokens": 30419758.0, + "step": 800 + }, + { + "epoch": 0.1018954331509986, + "ewc_loss": 0.0067489370703697205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.374468633410288e-06, + "grad_norm": 12.214406967163086, + "learning_rate": 3.391267486222976e-07, + "loss": 0.542, + "mean_token_accuracy": 0.8258774280548096, + "num_tokens": 30458115.0, + "step": 801 + }, + { + "epoch": 0.10202264342958911, + "ewc_loss": 0.0067519573494791985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3759786219889065e-06, + "grad_norm": 12.228368759155273, + "learning_rate": 3.3955065705807543e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8160476684570312, + "num_tokens": 30489679.0, + "step": 802 + }, + { + "epoch": 0.10214985370817962, + "ewc_loss": 0.006782863289117813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.391431619093055e-06, + "grad_norm": 12.319602012634277, + "learning_rate": 3.3997456549385333e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8373037576675415, + "num_tokens": 30524874.0, + "step": 803 + }, + { + "epoch": 0.10227706398677013, + "ewc_loss": 0.006784670054912567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.392334974705591e-06, + "grad_norm": 12.2446870803833, + "learning_rate": 3.403984739296312e-07, + "loss": 0.5888, + "mean_token_accuracy": 0.8168799877166748, + "num_tokens": 30558354.0, + "step": 804 + }, + { + "epoch": 0.10240427426536064, + "ewc_loss": 0.006794332060962915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.3971659831877332e-06, + "grad_norm": 12.290258407592773, + "learning_rate": 3.408223823654091e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8353030681610107, + "num_tokens": 30594827.0, + "step": 805 + }, + { + "epoch": 0.10253148454395115, + "ewc_loss": 0.006813463289290667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.40673159371363e-06, 
+ "grad_norm": 12.32155990600586, + "learning_rate": 3.412462908011869e-07, + "loss": 0.5576, + "mean_token_accuracy": 0.830294132232666, + "num_tokens": 30635259.0, + "step": 806 + }, + { + "epoch": 0.10265869482254165, + "ewc_loss": 0.006808269303292036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.404134758966393e-06, + "grad_norm": 12.212638854980469, + "learning_rate": 3.4167019923696477e-07, + "loss": 0.551, + "mean_token_accuracy": 0.8319716453552246, + "num_tokens": 30681850.0, + "step": 807 + }, + { + "epoch": 0.10278590510113217, + "ewc_loss": 0.006812172941863537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4060865345963975e-06, + "grad_norm": 12.323315620422363, + "learning_rate": 3.4209410767274267e-07, + "loss": 0.519, + "mean_token_accuracy": 0.8401871919631958, + "num_tokens": 30721302.0, + "step": 808 + }, + { + "epoch": 0.10291311537972268, + "ewc_loss": 0.006858558859676123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4292793316126335e-06, + "grad_norm": 12.359591484069824, + "learning_rate": 3.4251801610852057e-07, + "loss": 0.5941, + "mean_token_accuracy": 0.8117759227752686, + "num_tokens": 30761975.0, + "step": 809 + }, + { + "epoch": 0.1030403256583132, + "ewc_loss": 0.006844229996204376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4221150144730927e-06, + "grad_norm": 12.38278865814209, + "learning_rate": 3.429419245442984e-07, + "loss": 0.5478, + "mean_token_accuracy": 0.8259362578392029, + "num_tokens": 30797754.0, + "step": 810 + }, + { + "epoch": 0.1031675359369037, + "ewc_loss": 0.00684766611084342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.423833049964742e-06, + "grad_norm": 12.315576553344727, + "learning_rate": 3.4336583298007626e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8371814489364624, + "num_tokens": 30834183.0, + "step": 811 + }, + { + "epoch": 0.10329474621549421, + "ewc_loss": 0.006849867291748524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.424933538553887e-06, + "grad_norm": 
12.275416374206543, + "learning_rate": 3.4378974141585416e-07, + "loss": 0.5453, + "mean_token_accuracy": 0.8299552202224731, + "num_tokens": 30874160.0, + "step": 812 + }, + { + "epoch": 0.10342195649408473, + "ewc_loss": 0.006868312135338783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.43415604220354e-06, + "grad_norm": 12.3152437210083, + "learning_rate": 3.4421364985163206e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8454126715660095, + "num_tokens": 30916827.0, + "step": 813 + }, + { + "epoch": 0.10354916677267523, + "ewc_loss": 0.006868538446724415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4342692742939107e-06, + "grad_norm": 12.316306114196777, + "learning_rate": 3.446375582874099e-07, + "loss": 0.5568, + "mean_token_accuracy": 0.8263331055641174, + "num_tokens": 30960063.0, + "step": 814 + }, + { + "epoch": 0.10367637705126574, + "ewc_loss": 0.006875552702695131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.437776285863947e-06, + "grad_norm": 12.358154296875, + "learning_rate": 3.4506146672318775e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.8374611139297485, + "num_tokens": 30995172.0, + "step": 815 + }, + { + "epoch": 0.10380358732985626, + "ewc_loss": 0.0068906028755009174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4453014450264163e-06, + "grad_norm": 12.46129035949707, + "learning_rate": 3.4548537515896565e-07, + "loss": 0.5662, + "mean_token_accuracy": 0.8226426839828491, + "num_tokens": 31035089.0, + "step": 816 + }, + { + "epoch": 0.10393079760844676, + "ewc_loss": 0.006902626249939203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4513132050051354e-06, + "grad_norm": 12.390892028808594, + "learning_rate": 3.4590928359474355e-07, + "loss": 0.5446, + "mean_token_accuracy": 0.8276358842849731, + "num_tokens": 31074836.0, + "step": 817 + }, + { + "epoch": 0.10405800788703727, + "ewc_loss": 0.0068789515644311905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.43947567671421e-06, + "grad_norm": 12.32568359375, + 
"learning_rate": 3.463331920305214e-07, + "loss": 0.565, + "mean_token_accuracy": 0.8212270736694336, + "num_tokens": 31114700.0, + "step": 818 + }, + { + "epoch": 0.10418521816562779, + "ewc_loss": 0.0068873632699251175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4436816349625587e-06, + "grad_norm": 12.463088035583496, + "learning_rate": 3.4675710046629924e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8366096019744873, + "num_tokens": 31148258.0, + "step": 819 + }, + { + "epoch": 0.1043124284442183, + "ewc_loss": 0.006922710221260786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4613551633810857e-06, + "grad_norm": 12.374212265014648, + "learning_rate": 3.4718100890207714e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8258594274520874, + "num_tokens": 31185720.0, + "step": 820 + }, + { + "epoch": 0.1044396387228088, + "ewc_loss": 0.006915259640663862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.457629873082624e-06, + "grad_norm": 12.475946426391602, + "learning_rate": 3.4760491733785504e-07, + "loss": 0.565, + "mean_token_accuracy": 0.819568395614624, + "num_tokens": 31220353.0, + "step": 821 + }, + { + "epoch": 0.10456684900139931, + "ewc_loss": 0.0069463374093174934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4731688174360897e-06, + "grad_norm": 12.412400245666504, + "learning_rate": 3.480288257736329e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8265316486358643, + "num_tokens": 31256477.0, + "step": 822 + }, + { + "epoch": 0.10469405927998983, + "ewc_loss": 0.006937792524695396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.468896238700836e-06, + "grad_norm": 12.36457633972168, + "learning_rate": 3.4845273420941073e-07, + "loss": 0.539, + "mean_token_accuracy": 0.8307477831840515, + "num_tokens": 31294365.0, + "step": 823 + }, + { + "epoch": 0.10482126955858033, + "ewc_loss": 0.006948616821318865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4743084142974112e-06, + "grad_norm": 12.453362464904785, + "learning_rate": 
3.4887664264518863e-07, + "loss": 0.5336, + "mean_token_accuracy": 0.8274878263473511, + "num_tokens": 31329357.0, + "step": 824 + }, + { + "epoch": 0.10494847983717084, + "ewc_loss": 0.006993679795414209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4968400086654583e-06, + "grad_norm": 12.412801742553711, + "learning_rate": 3.4930055108096653e-07, + "loss": 0.5389, + "mean_token_accuracy": 0.8301693797111511, + "num_tokens": 31372180.0, + "step": 825 + }, + { + "epoch": 0.10507569011576136, + "ewc_loss": 0.006974257528781891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4871286516136024e-06, + "grad_norm": 12.351335525512695, + "learning_rate": 3.497244595167443e-07, + "loss": 0.5236, + "mean_token_accuracy": 0.8346399664878845, + "num_tokens": 31414866.0, + "step": 826 + }, + { + "epoch": 0.10520290039435186, + "ewc_loss": 0.006971611641347408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4858057915698737e-06, + "grad_norm": 12.511208534240723, + "learning_rate": 3.501483679525222e-07, + "loss": 0.5932, + "mean_token_accuracy": 0.8148021101951599, + "num_tokens": 31450583.0, + "step": 827 + }, + { + "epoch": 0.10533011067294237, + "ewc_loss": 0.007013694383203983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.506847178869066e-06, + "grad_norm": 12.41904067993164, + "learning_rate": 3.505722763883001e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8330065011978149, + "num_tokens": 31490371.0, + "step": 828 + }, + { + "epoch": 0.10545732095153289, + "ewc_loss": 0.006985206622630358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.4926033549709246e-06, + "grad_norm": 12.369327545166016, + "learning_rate": 3.50996184824078e-07, + "loss": 0.5669, + "mean_token_accuracy": 0.8211448192596436, + "num_tokens": 31525951.0, + "step": 829 + }, + { + "epoch": 0.10558453123012339, + "ewc_loss": 0.007012047804892063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.506023858790286e-06, + "grad_norm": 12.34195613861084, + "learning_rate": 3.514200932598558e-07, 
+ "loss": 0.606, + "mean_token_accuracy": 0.8113499879837036, + "num_tokens": 31570109.0, + "step": 830 + }, + { + "epoch": 0.1057117415087139, + "ewc_loss": 0.007027740124613047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.513870069582481e-06, + "grad_norm": 12.563826560974121, + "learning_rate": 3.518440016956337e-07, + "loss": 0.5471, + "mean_token_accuracy": 0.8282817006111145, + "num_tokens": 31610569.0, + "step": 831 + }, + { + "epoch": 0.10583895178730442, + "ewc_loss": 0.0070731122978031635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.536556050676154e-06, + "grad_norm": 12.3826265335083, + "learning_rate": 3.522679101314116e-07, + "loss": 0.5582, + "mean_token_accuracy": 0.8274699449539185, + "num_tokens": 31653585.0, + "step": 832 + }, + { + "epoch": 0.10596616206589493, + "ewc_loss": 0.00702320970594883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.51160474565404e-06, + "grad_norm": 12.39887809753418, + "learning_rate": 3.526918185671895e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8195576667785645, + "num_tokens": 31696353.0, + "step": 833 + }, + { + "epoch": 0.10609337234448543, + "ewc_loss": 0.00706017529591918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.530087724357145e-06, + "grad_norm": 12.432520866394043, + "learning_rate": 3.531157270029673e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8415096998214722, + "num_tokens": 31730620.0, + "step": 834 + }, + { + "epoch": 0.10622058262307595, + "ewc_loss": 0.007069853600114584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5349266909179278e-06, + "grad_norm": 12.451003074645996, + "learning_rate": 3.535396354387452e-07, + "loss": 0.5143, + "mean_token_accuracy": 0.8345166444778442, + "num_tokens": 31770538.0, + "step": 835 + }, + { + "epoch": 0.10634779290166646, + "ewc_loss": 0.007094412110745907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5472060062602395e-06, + "grad_norm": 12.48428726196289, + "learning_rate": 3.539635438745231e-07, + "loss": 0.5576, + 
"mean_token_accuracy": 0.8229125142097473, + "num_tokens": 31808687.0, + "step": 836 + }, + { + "epoch": 0.10647500318025696, + "ewc_loss": 0.007083308417350054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.541654223226942e-06, + "grad_norm": 12.423981666564941, + "learning_rate": 3.54387452310301e-07, + "loss": 0.5362, + "mean_token_accuracy": 0.831795334815979, + "num_tokens": 31852310.0, + "step": 837 + }, + { + "epoch": 0.10660221345884748, + "ewc_loss": 0.007099154870957136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5495775136951124e-06, + "grad_norm": 12.496722221374512, + "learning_rate": 3.548113607460788e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8421324491500854, + "num_tokens": 31887897.0, + "step": 838 + }, + { + "epoch": 0.10672942373743799, + "ewc_loss": 0.007120352238416672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.560176082828548e-06, + "grad_norm": 12.472735404968262, + "learning_rate": 3.552352691818567e-07, + "loss": 0.5311, + "mean_token_accuracy": 0.8328579068183899, + "num_tokens": 31926159.0, + "step": 839 + }, + { + "epoch": 0.10685663401602849, + "ewc_loss": 0.007096976973116398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5484883937897393e-06, + "grad_norm": 12.464244842529297, + "learning_rate": 3.556591776176346e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8409568667411804, + "num_tokens": 31968432.0, + "step": 840 + }, + { + "epoch": 0.106983844294619, + "ewc_loss": 0.007133455481380224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.566727627912769e-06, + "grad_norm": 12.584228515625, + "learning_rate": 3.560830860534125e-07, + "loss": 0.5398, + "mean_token_accuracy": 0.8307446241378784, + "num_tokens": 32000949.0, + "step": 841 + }, + { + "epoch": 0.10711105457320952, + "ewc_loss": 0.0071439724415540695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.57198632627842e-06, + "grad_norm": 12.603716850280762, + "learning_rate": 3.565069944891903e-07, + "loss": 0.5835, + "mean_token_accuracy": 
0.8167927861213684, + "num_tokens": 32033862.0, + "step": 842 + }, + { + "epoch": 0.10723826485180002, + "ewc_loss": 0.00712913554161787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.564567805369734e-06, + "grad_norm": 12.534942626953125, + "learning_rate": 3.569309029249682e-07, + "loss": 0.5439, + "mean_token_accuracy": 0.8296981453895569, + "num_tokens": 32068623.0, + "step": 843 + }, + { + "epoch": 0.10736547513039053, + "ewc_loss": 0.007147728931158781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5738644328375813e-06, + "grad_norm": 12.534339904785156, + "learning_rate": 3.573548113607461e-07, + "loss": 0.53, + "mean_token_accuracy": 0.8330548405647278, + "num_tokens": 32106658.0, + "step": 844 + }, + { + "epoch": 0.10749268540898105, + "ewc_loss": 0.007147717289626598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.573858748495695e-06, + "grad_norm": 12.502198219299316, + "learning_rate": 3.577787197965239e-07, + "loss": 0.5743, + "mean_token_accuracy": 0.8172662854194641, + "num_tokens": 32142031.0, + "step": 845 + }, + { + "epoch": 0.10761989568757156, + "ewc_loss": 0.007160053122788668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5800264868157683e-06, + "grad_norm": 12.46091079711914, + "learning_rate": 3.5820262823230177e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8441394567489624, + "num_tokens": 32182686.0, + "step": 846 + }, + { + "epoch": 0.10774710596616206, + "ewc_loss": 0.007175170350819826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.5875850699085277e-06, + "grad_norm": 12.53565502166748, + "learning_rate": 3.5862653666807967e-07, + "loss": 0.6006, + "mean_token_accuracy": 0.8117960095405579, + "num_tokens": 32218027.0, + "step": 847 + }, + { + "epoch": 0.10787431624475258, + "ewc_loss": 0.00720411678776145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.602058313845191e-06, + "grad_norm": 12.488693237304688, + "learning_rate": 3.5905044510385757e-07, + "loss": 0.5535, + "mean_token_accuracy": 0.8279340863227844, + 
"num_tokens": 32257854.0, + "step": 848 + }, + { + "epoch": 0.10800152652334309, + "ewc_loss": 0.007204500958323479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6022504446009407e-06, + "grad_norm": 12.546741485595703, + "learning_rate": 3.594743535396354e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.8315601348876953, + "num_tokens": 32295019.0, + "step": 849 + }, + { + "epoch": 0.10812873680193359, + "ewc_loss": 0.007215737830847502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.607868848121143e-06, + "grad_norm": 12.512375831604004, + "learning_rate": 3.5989826197541326e-07, + "loss": 0.5872, + "mean_token_accuracy": 0.815889835357666, + "num_tokens": 32335652.0, + "step": 850 + }, + { + "epoch": 0.1082559470805241, + "ewc_loss": 0.0072421180084347725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6210590224072803e-06, + "grad_norm": 12.602720260620117, + "learning_rate": 3.6032217041119116e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8271874189376831, + "num_tokens": 32377249.0, + "step": 851 + }, + { + "epoch": 0.10838315735911462, + "ewc_loss": 0.007238637190312147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6193187042954378e-06, + "grad_norm": 12.561164855957031, + "learning_rate": 3.6074607884696906e-07, + "loss": 0.5057, + "mean_token_accuracy": 0.8431459665298462, + "num_tokens": 32422467.0, + "step": 852 + }, + { + "epoch": 0.10851036763770512, + "ewc_loss": 0.00725734094157815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6286703561927425e-06, + "grad_norm": 12.590810775756836, + "learning_rate": 3.611699872827469e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8408533334732056, + "num_tokens": 32462120.0, + "step": 853 + }, + { + "epoch": 0.10863757791629564, + "ewc_loss": 0.007262782659381628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6313913369667716e-06, + "grad_norm": 12.69282341003418, + "learning_rate": 3.6159389571852475e-07, + "loss": 0.6184, + "mean_token_accuracy": 0.8089028596878052, + "num_tokens": 
32498484.0, + "step": 854 + }, + { + "epoch": 0.10876478819488615, + "ewc_loss": 0.007265066262334585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6325332075648475e-06, + "grad_norm": 12.552603721618652, + "learning_rate": 3.6201780415430265e-07, + "loss": 0.5511, + "mean_token_accuracy": 0.8273754119873047, + "num_tokens": 32541536.0, + "step": 855 + }, + { + "epoch": 0.10889199847347665, + "ewc_loss": 0.007250471506267786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.625235649451497e-06, + "grad_norm": 12.585295677185059, + "learning_rate": 3.6244171259008055e-07, + "loss": 0.5768, + "mean_token_accuracy": 0.8229671120643616, + "num_tokens": 32579750.0, + "step": 856 + }, + { + "epoch": 0.10901920875206716, + "ewc_loss": 0.007280505727976561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6402527712198207e-06, + "grad_norm": 12.647372245788574, + "learning_rate": 3.628656210258584e-07, + "loss": 0.5829, + "mean_token_accuracy": 0.816752552986145, + "num_tokens": 32617518.0, + "step": 857 + }, + { + "epoch": 0.10914641903065768, + "ewc_loss": 0.007279477547854185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6397386793396436e-06, + "grad_norm": 12.5911226272583, + "learning_rate": 3.6328952946163624e-07, + "loss": 0.5241, + "mean_token_accuracy": 0.8338401317596436, + "num_tokens": 32653931.0, + "step": 858 + }, + { + "epoch": 0.1092736293092482, + "ewc_loss": 0.007283175364136696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.641587682068348e-06, + "grad_norm": 12.608217239379883, + "learning_rate": 3.6371343789741414e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8366793990135193, + "num_tokens": 32693797.0, + "step": 859 + }, + { + "epoch": 0.1094008395878387, + "ewc_loss": 0.00731032807379961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.655163936855388e-06, + "grad_norm": 12.62689208984375, + "learning_rate": 3.6413734633319204e-07, + "loss": 0.5618, + "mean_token_accuracy": 0.8242250084877014, + "num_tokens": 32732432.0, + "step": 
860 + }, + { + "epoch": 0.10952804986642921, + "ewc_loss": 0.00730677368119359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6533867842081236e-06, + "grad_norm": 12.675210952758789, + "learning_rate": 3.645612547689699e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8334087133407593, + "num_tokens": 32769225.0, + "step": 861 + }, + { + "epoch": 0.10965526014501972, + "ewc_loss": 0.007333654444664717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.666827296910924e-06, + "grad_norm": 12.60071849822998, + "learning_rate": 3.6498516320474773e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8415701389312744, + "num_tokens": 32812027.0, + "step": 862 + }, + { + "epoch": 0.10978247042361022, + "ewc_loss": 0.007316557224839926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6582785014616093e-06, + "grad_norm": 12.590469360351562, + "learning_rate": 3.6540907164052563e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8458731174468994, + "num_tokens": 32849334.0, + "step": 863 + }, + { + "epoch": 0.10990968070220074, + "ewc_loss": 0.007337192073464394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.668596036732197e-06, + "grad_norm": 12.724743843078613, + "learning_rate": 3.658329800763035e-07, + "loss": 0.5807, + "mean_token_accuracy": 0.815802812576294, + "num_tokens": 32888422.0, + "step": 864 + }, + { + "epoch": 0.11003689098079125, + "ewc_loss": 0.007360025774687529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6800129237235524e-06, + "grad_norm": 12.703032493591309, + "learning_rate": 3.662568885120814e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8283039331436157, + "num_tokens": 32922672.0, + "step": 865 + }, + { + "epoch": 0.11016410125938175, + "ewc_loss": 0.007341957651078701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.670978912850842e-06, + "grad_norm": 12.690333366394043, + "learning_rate": 3.666807969478592e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8182989358901978, + "num_tokens": 32961694.0, + "step": 866 + }, + { + "epoch": 
0.11029131153797227, + "ewc_loss": 0.007376926485449076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6884632663714e-06, + "grad_norm": 12.69402027130127, + "learning_rate": 3.671047053836371e-07, + "loss": 0.5589, + "mean_token_accuracy": 0.8217356204986572, + "num_tokens": 32998595.0, + "step": 867 + }, + { + "epoch": 0.11041852181656278, + "ewc_loss": 0.007384079974144697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6920400816597976e-06, + "grad_norm": 12.628327369689941, + "learning_rate": 3.6752861381941497e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8342972993850708, + "num_tokens": 33035884.0, + "step": 868 + }, + { + "epoch": 0.11054573209515328, + "ewc_loss": 0.007377149537205696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.6885746794723673e-06, + "grad_norm": 12.70920181274414, + "learning_rate": 3.6795252225519287e-07, + "loss": 0.5808, + "mean_token_accuracy": 0.8202692270278931, + "num_tokens": 33073335.0, + "step": 869 + }, + { + "epoch": 0.1106729423737438, + "ewc_loss": 0.007421413436532021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7107067782926606e-06, + "grad_norm": 12.676175117492676, + "learning_rate": 3.6837643069097077e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8467292785644531, + "num_tokens": 33111443.0, + "step": 870 + }, + { + "epoch": 0.11080015265233431, + "ewc_loss": 0.007405643351376057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7028216866019648e-06, + "grad_norm": 12.636754989624023, + "learning_rate": 3.688003391267486e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8336337804794312, + "num_tokens": 33156362.0, + "step": 871 + }, + { + "epoch": 0.11092736293092482, + "ewc_loss": 0.007417826447635889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7089132547407644e-06, + "grad_norm": 12.649518966674805, + "learning_rate": 3.6922424756252646e-07, + "loss": 0.5074, + "mean_token_accuracy": 0.8388810157775879, + "num_tokens": 33204968.0, + "step": 872 + }, + { + "epoch": 0.11105457320951533, 
+ "ewc_loss": 0.007431962992995977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7159813928155927e-06, + "grad_norm": 12.773959159851074, + "learning_rate": 3.6964815599830436e-07, + "loss": 0.5569, + "mean_token_accuracy": 0.8221240639686584, + "num_tokens": 33239265.0, + "step": 873 + }, + { + "epoch": 0.11118178348810584, + "ewc_loss": 0.007441721390932798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.720860604516929e-06, + "grad_norm": 12.734028816223145, + "learning_rate": 3.7007206443408226e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8464342355728149, + "num_tokens": 33275897.0, + "step": 874 + }, + { + "epoch": 0.11130899376669635, + "ewc_loss": 0.0074264719150960445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7132358556846157e-06, + "grad_norm": 12.736872673034668, + "learning_rate": 3.704959728698601e-07, + "loss": 0.614, + "mean_token_accuracy": 0.8083761930465698, + "num_tokens": 33313966.0, + "step": 875 + }, + { + "epoch": 0.11143620404528685, + "ewc_loss": 0.007445627357810736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7228137443889864e-06, + "grad_norm": 12.72637939453125, + "learning_rate": 3.7091988130563795e-07, + "loss": 0.5563, + "mean_token_accuracy": 0.8252944946289062, + "num_tokens": 33352909.0, + "step": 876 + }, + { + "epoch": 0.11156341432387737, + "ewc_loss": 0.007469086907804012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.734543497557752e-06, + "grad_norm": 12.703666687011719, + "learning_rate": 3.7134378974141585e-07, + "loss": 0.5381, + "mean_token_accuracy": 0.8315391540527344, + "num_tokens": 33400120.0, + "step": 877 + }, + { + "epoch": 0.11169062460246788, + "ewc_loss": 0.0074623338878154755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.73116699847742e-06, + "grad_norm": 12.748340606689453, + "learning_rate": 3.7176769817719375e-07, + "loss": 0.5685, + "mean_token_accuracy": 0.8210999369621277, + "num_tokens": 33436703.0, + "step": 878 + }, + { + "epoch": 0.11181783488105838, + "ewc_loss": 
0.007473940495401621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7369702567957575e-06, + "grad_norm": 12.768104553222656, + "learning_rate": 3.7219160661297154e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8500434756278992, + "num_tokens": 33473515.0, + "step": 879 + }, + { + "epoch": 0.1119450451596489, + "ewc_loss": 0.0074831112287938595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.741555701708421e-06, + "grad_norm": 12.783543586730957, + "learning_rate": 3.7261551504874944e-07, + "loss": 0.5392, + "mean_token_accuracy": 0.8328201770782471, + "num_tokens": 33507659.0, + "step": 880 + }, + { + "epoch": 0.11207225543823941, + "ewc_loss": 0.007485735230147839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7428676478157286e-06, + "grad_norm": 12.73939323425293, + "learning_rate": 3.7303942348452734e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.8444001078605652, + "num_tokens": 33542430.0, + "step": 881 + }, + { + "epoch": 0.11219946571682991, + "ewc_loss": 0.007499235682189465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7496179174922872e-06, + "grad_norm": 12.774510383605957, + "learning_rate": 3.7346333192030524e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.8488978743553162, + "num_tokens": 33582579.0, + "step": 882 + }, + { + "epoch": 0.11232667599542043, + "ewc_loss": 0.007491149473935366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7455747587955557e-06, + "grad_norm": 12.663633346557617, + "learning_rate": 3.7388724035608303e-07, + "loss": 0.514, + "mean_token_accuracy": 0.841295063495636, + "num_tokens": 33625928.0, + "step": 883 + }, + { + "epoch": 0.11245388627401094, + "ewc_loss": 0.007485017646104097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.742508852155879e-06, + "grad_norm": 12.830879211425781, + "learning_rate": 3.7431114879186093e-07, + "loss": 0.5567, + "mean_token_accuracy": 0.8259396553039551, + "num_tokens": 33661325.0, + "step": 884 + }, + { + "epoch": 0.11258109655260146, + "ewc_loss": 
0.007546365726739168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.773182925215224e-06, + "grad_norm": 12.833222389221191, + "learning_rate": 3.7473505722763883e-07, + "loss": 0.5793, + "mean_token_accuracy": 0.8176295161247253, + "num_tokens": 33696472.0, + "step": 885 + }, + { + "epoch": 0.11270830683119196, + "ewc_loss": 0.0075100138783454895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7550069009739673e-06, + "grad_norm": 12.783085823059082, + "learning_rate": 3.7515896566341673e-07, + "loss": 0.5347, + "mean_token_accuracy": 0.8291922807693481, + "num_tokens": 33732637.0, + "step": 886 + }, + { + "epoch": 0.11283551710978247, + "ewc_loss": 0.007522647734731436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.761323796425131e-06, + "grad_norm": 12.820009231567383, + "learning_rate": 3.755828740991945e-07, + "loss": 0.5576, + "mean_token_accuracy": 0.8245173692703247, + "num_tokens": 33775106.0, + "step": 887 + }, + { + "epoch": 0.11296272738837299, + "ewc_loss": 0.007543779443949461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7718896237493027e-06, + "grad_norm": 12.815489768981934, + "learning_rate": 3.760067825349724e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.8377514481544495, + "num_tokens": 33811297.0, + "step": 888 + }, + { + "epoch": 0.11308993766696349, + "ewc_loss": 0.007548901252448559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.77445053345582e-06, + "grad_norm": 12.82276725769043, + "learning_rate": 3.764306909707503e-07, + "loss": 0.4803, + "mean_token_accuracy": 0.8462625741958618, + "num_tokens": 33854038.0, + "step": 889 + }, + { + "epoch": 0.113217147945554, + "ewc_loss": 0.007547488436102867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.773744310819893e-06, + "grad_norm": 12.862227439880371, + "learning_rate": 3.768545994065282e-07, + "loss": 0.5375, + "mean_token_accuracy": 0.8303365707397461, + "num_tokens": 33884929.0, + "step": 890 + }, + { + "epoch": 0.11334435822414451, + "ewc_loss": 0.007580569479614496, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7902848362136865e-06, + "grad_norm": 12.816621780395508, + "learning_rate": 3.77278507842306e-07, + "loss": 0.5456, + "mean_token_accuracy": 0.8305324912071228, + "num_tokens": 33926065.0, + "step": 891 + }, + { + "epoch": 0.11347156850273502, + "ewc_loss": 0.007559363264590502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7796817196067423e-06, + "grad_norm": 12.788743019104004, + "learning_rate": 3.777024162780839e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8408183455467224, + "num_tokens": 33965557.0, + "step": 892 + }, + { + "epoch": 0.11359877878132553, + "ewc_loss": 0.007571224123239517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7856120798096526e-06, + "grad_norm": 12.816158294677734, + "learning_rate": 3.781263247138618e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.8393944501876831, + "num_tokens": 34004132.0, + "step": 893 + }, + { + "epoch": 0.11372598905991604, + "ewc_loss": 0.0075975144281983376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.7987572341080522e-06, + "grad_norm": 12.862560272216797, + "learning_rate": 3.785502331496397e-07, + "loss": 0.5333, + "mean_token_accuracy": 0.8231937289237976, + "num_tokens": 34039781.0, + "step": 894 + }, + { + "epoch": 0.11385319933850654, + "ewc_loss": 0.007613050285726786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.806525228355895e-06, + "grad_norm": 12.845048904418945, + "learning_rate": 3.789741415854175e-07, + "loss": 0.5619, + "mean_token_accuracy": 0.8218370079994202, + "num_tokens": 34080487.0, + "step": 895 + }, + { + "epoch": 0.11398040961709706, + "ewc_loss": 0.007601397577673197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.800698777922662e-06, + "grad_norm": 12.903326034545898, + "learning_rate": 3.793980500211954e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.8313405513763428, + "num_tokens": 34119913.0, + "step": 896 + }, + { + "epoch": 0.11410761989568757, + "ewc_loss": 0.007627215702086687, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 3.8136079183459515e-06, + "grad_norm": 12.840145111083984, + "learning_rate": 3.798219584569733e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8496865630149841, + "num_tokens": 34155811.0, + "step": 897 + }, + { + "epoch": 0.11423483017427809, + "ewc_loss": 0.007627422921359539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.813711373368278e-06, + "grad_norm": 12.907987594604492, + "learning_rate": 3.8024586689275115e-07, + "loss": 0.5931, + "mean_token_accuracy": 0.8182259798049927, + "num_tokens": 34195022.0, + "step": 898 + }, + { + "epoch": 0.11436204045286859, + "ewc_loss": 0.007651095278561115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.825547537417151e-06, + "grad_norm": 12.84013843536377, + "learning_rate": 3.80669775328529e-07, + "loss": 0.5256, + "mean_token_accuracy": 0.8301893472671509, + "num_tokens": 34229182.0, + "step": 899 + }, + { + "epoch": 0.1144892507314591, + "ewc_loss": 0.007640493102371693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8202465475478675e-06, + "grad_norm": 12.862478256225586, + "learning_rate": 3.810936837643069e-07, + "loss": 0.4835, + "mean_token_accuracy": 0.8443882465362549, + "num_tokens": 34266931.0, + "step": 900 + }, + { + "epoch": 0.11461646101004962, + "ewc_loss": 0.007672786712646484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.836393261735793e-06, + "grad_norm": 12.906683921813965, + "learning_rate": 3.815175922000848e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8451076149940491, + "num_tokens": 34301705.0, + "step": 901 + }, + { + "epoch": 0.11474367128864012, + "ewc_loss": 0.007675084751099348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.837542408291483e-06, + "grad_norm": 12.837536811828613, + "learning_rate": 3.8194150063586264e-07, + "loss": 0.5626, + "mean_token_accuracy": 0.8275299072265625, + "num_tokens": 34340975.0, + "step": 902 + }, + { + "epoch": 0.11487088156723063, + "ewc_loss": 0.007674149703234434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
3.837074928014772e-06, + "grad_norm": 12.855280876159668, + "learning_rate": 3.823654090716405e-07, + "loss": 0.5279, + "mean_token_accuracy": 0.8312947750091553, + "num_tokens": 34380024.0, + "step": 903 + }, + { + "epoch": 0.11499809184582115, + "ewc_loss": 0.007711340207606554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.85567000193987e-06, + "grad_norm": 12.940648078918457, + "learning_rate": 3.827893175074184e-07, + "loss": 0.549, + "mean_token_accuracy": 0.8332991600036621, + "num_tokens": 34419654.0, + "step": 904 + }, + { + "epoch": 0.11512530212441165, + "ewc_loss": 0.0077267857268452644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.863392976199975e-06, + "grad_norm": 12.962165832519531, + "learning_rate": 3.832132259431963e-07, + "loss": 0.4804, + "mean_token_accuracy": 0.8489205241203308, + "num_tokens": 34453034.0, + "step": 905 + }, + { + "epoch": 0.11525251240300216, + "ewc_loss": 0.007729515433311462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8647576729999855e-06, + "grad_norm": 12.980876922607422, + "learning_rate": 3.8363713437897413e-07, + "loss": 0.588, + "mean_token_accuracy": 0.8152803778648376, + "num_tokens": 34493694.0, + "step": 906 + }, + { + "epoch": 0.11537972268159268, + "ewc_loss": 0.0077311513014137745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.86557576348423e-06, + "grad_norm": 12.90833854675293, + "learning_rate": 3.8406104281475197e-07, + "loss": 0.5801, + "mean_token_accuracy": 0.8199602365493774, + "num_tokens": 34531636.0, + "step": 907 + }, + { + "epoch": 0.11550693296018319, + "ewc_loss": 0.007749658077955246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.874828962580068e-06, + "grad_norm": 12.969491958618164, + "learning_rate": 3.8448495125052987e-07, + "loss": 0.5636, + "mean_token_accuracy": 0.8203766345977783, + "num_tokens": 34570851.0, + "step": 908 + }, + { + "epoch": 0.11563414323877369, + "ewc_loss": 0.007743431720882654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.871715762215899e-06, + 
"grad_norm": 12.914288520812988, + "learning_rate": 3.8490885968630777e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8389055728912354, + "num_tokens": 34613314.0, + "step": 909 + }, + { + "epoch": 0.1157613535173642, + "ewc_loss": 0.0077573638409376144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.878682036884129e-06, + "grad_norm": 12.927261352539062, + "learning_rate": 3.853327681220856e-07, + "loss": 0.53, + "mean_token_accuracy": 0.8308919668197632, + "num_tokens": 34656701.0, + "step": 910 + }, + { + "epoch": 0.11588856379595472, + "ewc_loss": 0.007774742785841227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.887371349264868e-06, + "grad_norm": 12.991581916809082, + "learning_rate": 3.8575667655786346e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8403865098953247, + "num_tokens": 34688595.0, + "step": 911 + }, + { + "epoch": 0.11601577407454522, + "ewc_loss": 0.00778105529025197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.8905277506273706e-06, + "grad_norm": 12.991296768188477, + "learning_rate": 3.8618058499364136e-07, + "loss": 0.5238, + "mean_token_accuracy": 0.8345954418182373, + "num_tokens": 34726609.0, + "step": 912 + }, + { + "epoch": 0.11614298435313573, + "ewc_loss": 0.007799253799021244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.899626790371258e-06, + "grad_norm": 12.993300437927246, + "learning_rate": 3.8660449342941926e-07, + "loss": 0.5822, + "mean_token_accuracy": 0.8201690316200256, + "num_tokens": 34769213.0, + "step": 913 + }, + { + "epoch": 0.11627019463172625, + "ewc_loss": 0.007788253948092461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.894127075909637e-06, + "grad_norm": 12.981578826904297, + "learning_rate": 3.870284018651971e-07, + "loss": 0.5529, + "mean_token_accuracy": 0.8257828950881958, + "num_tokens": 34806722.0, + "step": 914 + }, + { + "epoch": 0.11639740491031675, + "ewc_loss": 0.007817675359547138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.908837697963463e-06, + "grad_norm": 
12.988804817199707, + "learning_rate": 3.8745231030097495e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8415302038192749, + "num_tokens": 34846408.0, + "step": 915 + }, + { + "epoch": 0.11652461518890726, + "ewc_loss": 0.007806053850799799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.903026936313836e-06, + "grad_norm": 12.970651626586914, + "learning_rate": 3.8787621873675285e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8418079614639282, + "num_tokens": 34883791.0, + "step": 916 + }, + { + "epoch": 0.11665182546749778, + "ewc_loss": 0.007798131089657545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.899065632140264e-06, + "grad_norm": 12.964729309082031, + "learning_rate": 3.883001271725307e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8443917036056519, + "num_tokens": 34922768.0, + "step": 917 + }, + { + "epoch": 0.11677903574608828, + "ewc_loss": 0.007820826023817062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.910412942786934e-06, + "grad_norm": 12.930559158325195, + "learning_rate": 3.887240356083086e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.848360538482666, + "num_tokens": 34962184.0, + "step": 918 + }, + { + "epoch": 0.11690624602467879, + "ewc_loss": 0.007839469239115715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9197348087327555e-06, + "grad_norm": 13.091975212097168, + "learning_rate": 3.8914794404408644e-07, + "loss": 0.5169, + "mean_token_accuracy": 0.8351457118988037, + "num_tokens": 34991455.0, + "step": 919 + }, + { + "epoch": 0.1170334563032693, + "ewc_loss": 0.007871980778872967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.935990207537543e-06, + "grad_norm": 13.075904846191406, + "learning_rate": 3.8957185247986434e-07, + "loss": 0.5454, + "mean_token_accuracy": 0.8260766267776489, + "num_tokens": 35025044.0, + "step": 920 + }, + { + "epoch": 0.11716066658185982, + "ewc_loss": 0.007854104973375797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9270526031032205e-06, + "grad_norm": 13.025544166564941, + 
"learning_rate": 3.899957609156422e-07, + "loss": 0.5187, + "mean_token_accuracy": 0.8349802494049072, + "num_tokens": 35064564.0, + "step": 921 + }, + { + "epoch": 0.11728787686045032, + "ewc_loss": 0.007863468490540981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.931734227080597e-06, + "grad_norm": 13.010702133178711, + "learning_rate": 3.904196693514201e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8315960764884949, + "num_tokens": 35109786.0, + "step": 922 + }, + { + "epoch": 0.11741508713904084, + "ewc_loss": 0.007881207391619682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.940603619412286e-06, + "grad_norm": 13.036649703979492, + "learning_rate": 3.9084357778719793e-07, + "loss": 0.6001, + "mean_token_accuracy": 0.8107430934906006, + "num_tokens": 35148351.0, + "step": 923 + }, + { + "epoch": 0.11754229741763135, + "ewc_loss": 0.0078981788828969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9490896597271785e-06, + "grad_norm": 13.077170372009277, + "learning_rate": 3.9126748622297583e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8237374424934387, + "num_tokens": 35183464.0, + "step": 924 + }, + { + "epoch": 0.11766950769622185, + "ewc_loss": 0.007920434698462486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.96021732740337e-06, + "grad_norm": 12.987250328063965, + "learning_rate": 3.916913946587537e-07, + "loss": 0.5461, + "mean_token_accuracy": 0.826632559299469, + "num_tokens": 35218561.0, + "step": 925 + }, + { + "epoch": 0.11779671797481236, + "ewc_loss": 0.00792304053902626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.96152017856366e-06, + "grad_norm": 13.043577194213867, + "learning_rate": 3.921153030945316e-07, + "loss": 0.5607, + "mean_token_accuracy": 0.8219958543777466, + "num_tokens": 35257740.0, + "step": 926 + }, + { + "epoch": 0.11792392825340288, + "ewc_loss": 0.007943189702928066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.971594651375199e-06, + "grad_norm": 13.031245231628418, + "learning_rate": 
3.925392115303094e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8359481692314148, + "num_tokens": 35291632.0, + "step": 927 + }, + { + "epoch": 0.11805113853199338, + "ewc_loss": 0.007941878400743008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.970939360442571e-06, + "grad_norm": 13.102823257446289, + "learning_rate": 3.929631199660873e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8321740627288818, + "num_tokens": 35327686.0, + "step": 928 + }, + { + "epoch": 0.1181783488105839, + "ewc_loss": 0.007972892373800278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.9864462451078e-06, + "grad_norm": 13.094093322753906, + "learning_rate": 3.9338702840186517e-07, + "loss": 0.523, + "mean_token_accuracy": 0.8321114778518677, + "num_tokens": 35364480.0, + "step": 929 + }, + { + "epoch": 0.11830555908917441, + "ewc_loss": 0.007973008789122105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.986504452768713e-06, + "grad_norm": 13.008209228515625, + "learning_rate": 3.9381093683764307e-07, + "loss": 0.528, + "mean_token_accuracy": 0.8348382711410522, + "num_tokens": 35406017.0, + "step": 930 + }, + { + "epoch": 0.11843276936776491, + "ewc_loss": 0.007970488630235195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.985244347859407e-06, + "grad_norm": 13.050661087036133, + "learning_rate": 3.942348452734209e-07, + "loss": 0.5879, + "mean_token_accuracy": 0.8158271312713623, + "num_tokens": 35448447.0, + "step": 931 + }, + { + "epoch": 0.11855997964635542, + "ewc_loss": 0.00799626111984253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.998130523541477e-06, + "grad_norm": 13.08564281463623, + "learning_rate": 3.946587537091988e-07, + "loss": 0.5359, + "mean_token_accuracy": 0.8286049365997314, + "num_tokens": 35488120.0, + "step": 932 + }, + { + "epoch": 0.11868718992494594, + "ewc_loss": 0.008007449097931385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.003724370704731e-06, + "grad_norm": 13.076033592224121, + "learning_rate": 3.9508266214497666e-07, + 
"loss": 0.5084, + "mean_token_accuracy": 0.8378300070762634, + "num_tokens": 35531759.0, + "step": 933 + }, + { + "epoch": 0.11881440020353645, + "ewc_loss": 0.008004620671272278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.002310561190825e-06, + "grad_norm": 13.064592361450195, + "learning_rate": 3.9550657058075456e-07, + "loss": 0.4936, + "mean_token_accuracy": 0.8449059724807739, + "num_tokens": 35571476.0, + "step": 934 + }, + { + "epoch": 0.11894161048212695, + "ewc_loss": 0.007993080653250217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 3.996540272055427e-06, + "grad_norm": 13.108794212341309, + "learning_rate": 3.959304790165324e-07, + "loss": 0.5343, + "mean_token_accuracy": 0.8322675824165344, + "num_tokens": 35608440.0, + "step": 935 + }, + { + "epoch": 0.11906882076071747, + "ewc_loss": 0.008033210411667824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.01660508941859e-06, + "grad_norm": 13.21146011352539, + "learning_rate": 3.9635438745231025e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8328808546066284, + "num_tokens": 35641861.0, + "step": 936 + }, + { + "epoch": 0.11919603103930798, + "ewc_loss": 0.008041092194616795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.020545929961372e-06, + "grad_norm": 13.158358573913574, + "learning_rate": 3.9677829588808815e-07, + "loss": 0.5527, + "mean_token_accuracy": 0.8252456188201904, + "num_tokens": 35684626.0, + "step": 937 + }, + { + "epoch": 0.11932324131789848, + "ewc_loss": 0.008025108836591244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0125542000168934e-06, + "grad_norm": 13.121695518493652, + "learning_rate": 3.9720220432386605e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.822322428226471, + "num_tokens": 35718125.0, + "step": 938 + }, + { + "epoch": 0.119450451596489, + "ewc_loss": 0.008043073117733002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.021536369691603e-06, + "grad_norm": 13.173613548278809, + "learning_rate": 3.976261127596439e-07, + "loss": 0.5024, + 
"mean_token_accuracy": 0.8427537679672241, + "num_tokens": 35752818.0, + "step": 939 + }, + { + "epoch": 0.11957766187507951, + "ewc_loss": 0.008053107187151909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.026553597213933e-06, + "grad_norm": 13.17218017578125, + "learning_rate": 3.9805002119542174e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8460874557495117, + "num_tokens": 35787097.0, + "step": 940 + }, + { + "epoch": 0.11970487215367001, + "ewc_loss": 0.008048365823924541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.024182999273762e-06, + "grad_norm": 13.149413108825684, + "learning_rate": 3.9847392963119964e-07, + "loss": 0.5815, + "mean_token_accuracy": 0.8193567991256714, + "num_tokens": 35825887.0, + "step": 941 + }, + { + "epoch": 0.11983208243226053, + "ewc_loss": 0.00806773267686367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0338663893635385e-06, + "grad_norm": 13.132102966308594, + "learning_rate": 3.9889783806697754e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.842199444770813, + "num_tokens": 35863592.0, + "step": 942 + }, + { + "epoch": 0.11995929271085104, + "ewc_loss": 0.008077727630734444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.03886360800243e-06, + "grad_norm": 13.178544998168945, + "learning_rate": 3.993217465027554e-07, + "loss": 0.5833, + "mean_token_accuracy": 0.820066511631012, + "num_tokens": 35904483.0, + "step": 943 + }, + { + "epoch": 0.12008650298944154, + "ewc_loss": 0.008093049749732018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.046524736622814e-06, + "grad_norm": 13.203347206115723, + "learning_rate": 3.9974565493853323e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8512152433395386, + "num_tokens": 35938662.0, + "step": 944 + }, + { + "epoch": 0.12021371326803205, + "ewc_loss": 0.008096879348158836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.048439677717397e-06, + "grad_norm": 13.159537315368652, + "learning_rate": 4.0016956337431113e-07, + "loss": 0.5339, + "mean_token_accuracy": 
0.8310331106185913, + "num_tokens": 35975176.0, + "step": 945 + }, + { + "epoch": 0.12034092354662257, + "ewc_loss": 0.008113071322441101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.056535544805229e-06, + "grad_norm": 13.230297088623047, + "learning_rate": 4.0059347181008903e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8340820074081421, + "num_tokens": 36008893.0, + "step": 946 + }, + { + "epoch": 0.12046813382521308, + "ewc_loss": 0.008126141503453255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.063070718984818e-06, + "grad_norm": 13.131880760192871, + "learning_rate": 4.010173802458669e-07, + "loss": 0.479, + "mean_token_accuracy": 0.8503425121307373, + "num_tokens": 36049882.0, + "step": 947 + }, + { + "epoch": 0.12059534410380358, + "ewc_loss": 0.00812872126698494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.064360837219283e-06, + "grad_norm": 13.220417022705078, + "learning_rate": 4.014412886816447e-07, + "loss": 0.5936, + "mean_token_accuracy": 0.8168817758560181, + "num_tokens": 36090615.0, + "step": 948 + }, + { + "epoch": 0.1207225543823941, + "ewc_loss": 0.008156723342835903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0783615986583754e-06, + "grad_norm": 13.182313919067383, + "learning_rate": 4.018651971174226e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8314552307128906, + "num_tokens": 36128159.0, + "step": 949 + }, + { + "epoch": 0.12084976466098461, + "ewc_loss": 0.008159887045621872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0799436646921095e-06, + "grad_norm": 13.258821487426758, + "learning_rate": 4.022891055532005e-07, + "loss": 0.5361, + "mean_token_accuracy": 0.8309205770492554, + "num_tokens": 36172851.0, + "step": 950 + }, + { + "epoch": 0.12097697493957511, + "ewc_loss": 0.008173285983502865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.086643002665369e-06, + "grad_norm": 13.188124656677246, + "learning_rate": 4.0271301398897837e-07, + "loss": 0.5875, + "mean_token_accuracy": 0.8199115991592407, + 
"num_tokens": 36214282.0, + "step": 951 + }, + { + "epoch": 0.12110418521816563, + "ewc_loss": 0.008162050507962704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0810250538925175e-06, + "grad_norm": 13.297947883605957, + "learning_rate": 4.031369224247562e-07, + "loss": 0.5083, + "mean_token_accuracy": 0.8348604440689087, + "num_tokens": 36248619.0, + "step": 952 + }, + { + "epoch": 0.12123139549675614, + "ewc_loss": 0.008190111257135868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0950558286567684e-06, + "grad_norm": 13.239266395568848, + "learning_rate": 4.035608308605341e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8264597654342651, + "num_tokens": 36289070.0, + "step": 953 + }, + { + "epoch": 0.12135860577534664, + "ewc_loss": 0.008169584907591343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.084792635694612e-06, + "grad_norm": 13.236930847167969, + "learning_rate": 4.03984739296312e-07, + "loss": 0.5442, + "mean_token_accuracy": 0.8306894302368164, + "num_tokens": 36330883.0, + "step": 954 + }, + { + "epoch": 0.12148581605393716, + "ewc_loss": 0.008195502683520317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0977511162054725e-06, + "grad_norm": 13.33073902130127, + "learning_rate": 4.044086477320898e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.834052562713623, + "num_tokens": 36371507.0, + "step": 955 + }, + { + "epoch": 0.12161302633252767, + "ewc_loss": 0.008206748403608799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.103374067199184e-06, + "grad_norm": 13.232114791870117, + "learning_rate": 4.048325561678677e-07, + "loss": 0.5474, + "mean_token_accuracy": 0.8267530202865601, + "num_tokens": 36407262.0, + "step": 956 + }, + { + "epoch": 0.12174023661111817, + "ewc_loss": 0.008195790462195873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.0978952711157035e-06, + "grad_norm": 13.329779624938965, + "learning_rate": 4.052564646036456e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8295006155967712, + "num_tokens": 
36448089.0, + "step": 957 + }, + { + "epoch": 0.12186744688970869, + "ewc_loss": 0.008231494575738907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1157472878694534e-06, + "grad_norm": 13.242122650146484, + "learning_rate": 4.056803730394235e-07, + "loss": 0.5562, + "mean_token_accuracy": 0.8222150802612305, + "num_tokens": 36484022.0, + "step": 958 + }, + { + "epoch": 0.1219946571682992, + "ewc_loss": 0.008205359801650047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.102680122741731e-06, + "grad_norm": 13.270671844482422, + "learning_rate": 4.061042814752013e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.8331502079963684, + "num_tokens": 36519385.0, + "step": 959 + }, + { + "epoch": 0.12212186744688971, + "ewc_loss": 0.00823130365461111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.115651790925767e-06, + "grad_norm": 13.33210563659668, + "learning_rate": 4.065281899109792e-07, + "loss": 0.5354, + "mean_token_accuracy": 0.8302350044250488, + "num_tokens": 36551943.0, + "step": 960 + }, + { + "epoch": 0.12224907772548022, + "ewc_loss": 0.008256589062511921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.128294676775113e-06, + "grad_norm": 13.306134223937988, + "learning_rate": 4.069520983467571e-07, + "loss": 0.4808, + "mean_token_accuracy": 0.8474367260932922, + "num_tokens": 36590927.0, + "step": 961 + }, + { + "epoch": 0.12237628800407073, + "ewc_loss": 0.008243574760854244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1217872421839274e-06, + "grad_norm": 13.256380081176758, + "learning_rate": 4.07376006782535e-07, + "loss": 0.5156, + "mean_token_accuracy": 0.8349308371543884, + "num_tokens": 36625199.0, + "step": 962 + }, + { + "epoch": 0.12250349828266124, + "ewc_loss": 0.008255967870354652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.127984084334457e-06, + "grad_norm": 13.339021682739258, + "learning_rate": 4.077999152183128e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8471479415893555, + "num_tokens": 36658981.0, + "step": 963 + 
}, + { + "epoch": 0.12263070856125174, + "ewc_loss": 0.008284498006105423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.142249053984415e-06, + "grad_norm": 13.294885635375977, + "learning_rate": 4.082238236540907e-07, + "loss": 0.5362, + "mean_token_accuracy": 0.8337932229042053, + "num_tokens": 36704214.0, + "step": 964 + }, + { + "epoch": 0.12275791883984226, + "ewc_loss": 0.008271359838545322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.135679773753509e-06, + "grad_norm": 13.314112663269043, + "learning_rate": 4.086477320898686e-07, + "loss": 0.5054, + "mean_token_accuracy": 0.838828444480896, + "num_tokens": 36744782.0, + "step": 965 + }, + { + "epoch": 0.12288512911843277, + "ewc_loss": 0.008298303931951523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.149152118770871e-06, + "grad_norm": 13.358656883239746, + "learning_rate": 4.090716405256465e-07, + "loss": 0.542, + "mean_token_accuracy": 0.8271704912185669, + "num_tokens": 36783645.0, + "step": 966 + }, + { + "epoch": 0.12301233939702327, + "ewc_loss": 0.0082982974126935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.149148480792064e-06, + "grad_norm": 13.359517097473145, + "learning_rate": 4.094955489614243e-07, + "loss": 0.5098, + "mean_token_accuracy": 0.8374916911125183, + "num_tokens": 36817539.0, + "step": 967 + }, + { + "epoch": 0.12313954967561379, + "ewc_loss": 0.008313297294080257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.156648628850235e-06, + "grad_norm": 13.349383354187012, + "learning_rate": 4.099194573972022e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8373432159423828, + "num_tokens": 36859144.0, + "step": 968 + }, + { + "epoch": 0.1232667599542043, + "ewc_loss": 0.008292173035442829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.146086666878546e-06, + "grad_norm": 13.291096687316895, + "learning_rate": 4.1034336583298007e-07, + "loss": 0.5533, + "mean_token_accuracy": 0.8261115550994873, + "num_tokens": 36900833.0, + "step": 969 + }, + { + "epoch": 
0.1233939702327948, + "ewc_loss": 0.008312728255987167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.15636395700858e-06, + "grad_norm": 13.334639549255371, + "learning_rate": 4.1076727426875797e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.8280053734779358, + "num_tokens": 36940935.0, + "step": 970 + }, + { + "epoch": 0.12352118051138532, + "ewc_loss": 0.00832467619329691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.162337972957175e-06, + "grad_norm": 13.315455436706543, + "learning_rate": 4.1119118270453577e-07, + "loss": 0.6059, + "mean_token_accuracy": 0.8085276484489441, + "num_tokens": 36979874.0, + "step": 971 + }, + { + "epoch": 0.12364839078997583, + "ewc_loss": 0.008320885710418224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.160443040746031e-06, + "grad_norm": 13.384234428405762, + "learning_rate": 4.1161509114031366e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8476014137268066, + "num_tokens": 37017917.0, + "step": 972 + }, + { + "epoch": 0.12377560106856635, + "ewc_loss": 0.0083727752789855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.186387741356157e-06, + "grad_norm": 13.362174034118652, + "learning_rate": 4.1203899957609156e-07, + "loss": 0.5508, + "mean_token_accuracy": 0.8253811597824097, + "num_tokens": 37054914.0, + "step": 973 + }, + { + "epoch": 0.12390281134715685, + "ewc_loss": 0.008335906080901623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.167953193245921e-06, + "grad_norm": 13.346952438354492, + "learning_rate": 4.124629080118694e-07, + "loss": 0.5014, + "mean_token_accuracy": 0.8385446071624756, + "num_tokens": 37088277.0, + "step": 974 + }, + { + "epoch": 0.12403002162574736, + "ewc_loss": 0.008364923298358917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.182461452728603e-06, + "grad_norm": 13.373804092407227, + "learning_rate": 4.1288681644764726e-07, + "loss": 0.5481, + "mean_token_accuracy": 0.825398325920105, + "num_tokens": 37126963.0, + "step": 975 + }, + { + "epoch": 0.12415723190433788, + 
"ewc_loss": 0.008357521146535873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.178760718787089e-06, + "grad_norm": 13.319165229797363, + "learning_rate": 4.1331072488342515e-07, + "loss": 0.5617, + "mean_token_accuracy": 0.8246199488639832, + "num_tokens": 37161948.0, + "step": 976 + }, + { + "epoch": 0.12428444218292838, + "ewc_loss": 0.008375461213290691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1877306102833245e-06, + "grad_norm": 13.361920356750488, + "learning_rate": 4.1373463331920305e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8497414588928223, + "num_tokens": 37198552.0, + "step": 977 + }, + { + "epoch": 0.12441165246151889, + "ewc_loss": 0.008396920748054981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.198460374027491e-06, + "grad_norm": 13.472644805908203, + "learning_rate": 4.141585417549809e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8253817558288574, + "num_tokens": 37235990.0, + "step": 978 + }, + { + "epoch": 0.1245388627401094, + "ewc_loss": 0.008430741727352142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2153710637649056e-06, + "grad_norm": 13.448033332824707, + "learning_rate": 4.1458245019075875e-07, + "loss": 0.5046, + "mean_token_accuracy": 0.8386670351028442, + "num_tokens": 37269664.0, + "step": 979 + }, + { + "epoch": 0.1246660730186999, + "ewc_loss": 0.008392672054469585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.1963362491515e-06, + "grad_norm": 13.341410636901855, + "learning_rate": 4.1500635862653664e-07, + "loss": 0.5357, + "mean_token_accuracy": 0.8316946029663086, + "num_tokens": 37308647.0, + "step": 980 + }, + { + "epoch": 0.12479328329729042, + "ewc_loss": 0.00841806922107935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2090346141776536e-06, + "grad_norm": 13.395463943481445, + "learning_rate": 4.1543026706231454e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8462973833084106, + "num_tokens": 37345953.0, + "step": 981 + }, + { + "epoch": 0.12492049357588093, + "ewc_loss": 
0.008430101908743382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2150509216298815e-06, + "grad_norm": 13.390093803405762, + "learning_rate": 4.158541754980924e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8485479950904846, + "num_tokens": 37382735.0, + "step": 982 + }, + { + "epoch": 0.12504770385447145, + "ewc_loss": 0.008415314368903637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2076571844518185e-06, + "grad_norm": 13.402137756347656, + "learning_rate": 4.1627808393387024e-07, + "loss": 0.548, + "mean_token_accuracy": 0.8245931267738342, + "num_tokens": 37419605.0, + "step": 983 + }, + { + "epoch": 0.12517491413306195, + "ewc_loss": 0.00845992099493742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.229960268276045e-06, + "grad_norm": 13.470871925354004, + "learning_rate": 4.1670199236964813e-07, + "loss": 0.541, + "mean_token_accuracy": 0.824138343334198, + "num_tokens": 37455395.0, + "step": 984 + }, + { + "epoch": 0.12530212441165245, + "ewc_loss": 0.008470838889479637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.235419510223437e-06, + "grad_norm": 13.38215160369873, + "learning_rate": 4.1712590080542603e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8439346551895142, + "num_tokens": 37492208.0, + "step": 985 + }, + { + "epoch": 0.12542933469024298, + "ewc_loss": 0.008453652262687683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.226826149533736e-06, + "grad_norm": 13.42428970336914, + "learning_rate": 4.175498092412039e-07, + "loss": 0.5383, + "mean_token_accuracy": 0.8328604698181152, + "num_tokens": 37533675.0, + "step": 986 + }, + { + "epoch": 0.12555654496883348, + "ewc_loss": 0.008515429683029652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.257714863342699e-06, + "grad_norm": 13.46940803527832, + "learning_rate": 4.179737176769817e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8407161235809326, + "num_tokens": 37572792.0, + "step": 987 + }, + { + "epoch": 0.12568375524742398, + "ewc_loss": 0.008506680838763714, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.253340193827171e-06, + "grad_norm": 13.517745971679688, + "learning_rate": 4.183976261127596e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8399580121040344, + "num_tokens": 37611481.0, + "step": 988 + }, + { + "epoch": 0.1258109655260145, + "ewc_loss": 0.008501631207764149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.250815436535049e-06, + "grad_norm": 13.540068626403809, + "learning_rate": 4.1882153454853747e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.8173255920410156, + "num_tokens": 37648253.0, + "step": 989 + }, + { + "epoch": 0.125938175804605, + "ewc_loss": 0.008498530834913254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.249265202815877e-06, + "grad_norm": 13.402000427246094, + "learning_rate": 4.1924544298431537e-07, + "loss": 0.5106, + "mean_token_accuracy": 0.8391216993331909, + "num_tokens": 37687086.0, + "step": 990 + }, + { + "epoch": 0.12606538608319554, + "ewc_loss": 0.008516723290085793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.258361514075659e-06, + "grad_norm": 13.468265533447266, + "learning_rate": 4.196693514200932e-07, + "loss": 0.5049, + "mean_token_accuracy": 0.840299129486084, + "num_tokens": 37730928.0, + "step": 991 + }, + { + "epoch": 0.12619259636178604, + "ewc_loss": 0.008550703525543213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.275351784599479e-06, + "grad_norm": 13.47834587097168, + "learning_rate": 4.200932598558711e-07, + "loss": 0.5308, + "mean_token_accuracy": 0.8291574716567993, + "num_tokens": 37771857.0, + "step": 992 + }, + { + "epoch": 0.12631980664037654, + "ewc_loss": 0.008545681834220886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.272841124475235e-06, + "grad_norm": 13.54068374633789, + "learning_rate": 4.2051716829164896e-07, + "loss": 0.5333, + "mean_token_accuracy": 0.833740770816803, + "num_tokens": 37817517.0, + "step": 993 + }, + { + "epoch": 0.12644701691896706, + "ewc_loss": 0.008556129410862923, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.278064807294868e-06, + "grad_norm": 13.507143020629883, + "learning_rate": 4.2094107672742686e-07, + "loss": 0.5764, + "mean_token_accuracy": 0.8227224349975586, + "num_tokens": 37855891.0, + "step": 994 + }, + { + "epoch": 0.12657422719755757, + "ewc_loss": 0.008557737804949284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.278868800611235e-06, + "grad_norm": 13.58455753326416, + "learning_rate": 4.2136498516320476e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8432934880256653, + "num_tokens": 37888719.0, + "step": 995 + }, + { + "epoch": 0.12670143747614807, + "ewc_loss": 0.008579857647418976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.289928710932145e-06, + "grad_norm": 13.504630088806152, + "learning_rate": 4.217888935989826e-07, + "loss": 0.5481, + "mean_token_accuracy": 0.8277462720870972, + "num_tokens": 37932968.0, + "step": 996 + }, + { + "epoch": 0.1268286477547386, + "ewc_loss": 0.008558845147490501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.279422682884615e-06, + "grad_norm": 13.542169570922852, + "learning_rate": 4.2221280203476045e-07, + "loss": 0.5391, + "mean_token_accuracy": 0.8281518816947937, + "num_tokens": 37974297.0, + "step": 997 + }, + { + "epoch": 0.1269558580333291, + "ewc_loss": 0.008569195866584778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.284597707737703e-06, + "grad_norm": 13.553842544555664, + "learning_rate": 4.2263671047053835e-07, + "loss": 0.5574, + "mean_token_accuracy": 0.8239097595214844, + "num_tokens": 38020294.0, + "step": 998 + }, + { + "epoch": 0.1270830683119196, + "ewc_loss": 0.00855750311166048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.2787514757947065e-06, + "grad_norm": 13.472965240478516, + "learning_rate": 4.2306061890631625e-07, + "loss": 0.5363, + "mean_token_accuracy": 0.8315476179122925, + "num_tokens": 38063322.0, + "step": 999 + }, + { + "epoch": 0.12721027859051012, + "ewc_loss": 0.00858358945697546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
4.291794539312832e-06, + "grad_norm": 13.590080261230469, + "learning_rate": 4.234845273420941e-07, + "loss": 0.6265, + "mean_token_accuracy": 0.8113440275192261, + "num_tokens": 38101052.0, + "step": 1000 + }, + { + "epoch": 0.12733748886910062, + "ewc_loss": 0.008623462170362473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3117311179230455e-06, + "grad_norm": 13.548971176147461, + "learning_rate": 4.2390843577787194e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8276803493499756, + "num_tokens": 38145472.0, + "step": 1001 + }, + { + "epoch": 0.12746469914769112, + "ewc_loss": 0.008575573563575745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.28778685090947e-06, + "grad_norm": 13.4884672164917, + "learning_rate": 4.2433234421364984e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8350957036018372, + "num_tokens": 38182662.0, + "step": 1002 + }, + { + "epoch": 0.12759190942628165, + "ewc_loss": 0.008601774461567402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.300887212593807e-06, + "grad_norm": 13.594271659851074, + "learning_rate": 4.2475625264942774e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8399813175201416, + "num_tokens": 38224735.0, + "step": 1003 + }, + { + "epoch": 0.12771911970487215, + "ewc_loss": 0.008648951537907124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3244758671789896e-06, + "grad_norm": 13.618871688842773, + "learning_rate": 4.251801610852056e-07, + "loss": 0.5513, + "mean_token_accuracy": 0.8263156414031982, + "num_tokens": 38255847.0, + "step": 1004 + }, + { + "epoch": 0.12784632998346265, + "ewc_loss": 0.008637494407594204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.318747414799873e-06, + "grad_norm": 13.545977592468262, + "learning_rate": 4.2560406952098343e-07, + "loss": 0.5646, + "mean_token_accuracy": 0.8237000107765198, + "num_tokens": 38296921.0, + "step": 1005 + }, + { + "epoch": 0.12797354026205318, + "ewc_loss": 0.008638761006295681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
4.319380423112307e-06, + "grad_norm": 13.562226295471191, + "learning_rate": 4.2602797795676133e-07, + "loss": 0.5737, + "mean_token_accuracy": 0.8211228251457214, + "num_tokens": 38336408.0, + "step": 1006 + }, + { + "epoch": 0.12810075054064368, + "ewc_loss": 0.008683052845299244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3415266191004775e-06, + "grad_norm": 13.6010103225708, + "learning_rate": 4.2645188639253923e-07, + "loss": 0.4809, + "mean_token_accuracy": 0.845607340335846, + "num_tokens": 38372809.0, + "step": 1007 + }, + { + "epoch": 0.12822796081923418, + "ewc_loss": 0.008679017424583435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.339508905104594e-06, + "grad_norm": 13.62337589263916, + "learning_rate": 4.26875794828317e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8280938863754272, + "num_tokens": 38413312.0, + "step": 1008 + }, + { + "epoch": 0.1283551710978247, + "ewc_loss": 0.008675215765833855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.337608061177889e-06, + "grad_norm": 13.59317684173584, + "learning_rate": 4.272997032640949e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8212169408798218, + "num_tokens": 38457710.0, + "step": 1009 + }, + { + "epoch": 0.1284823813764152, + "ewc_loss": 0.008697502315044403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.348751190264011e-06, + "grad_norm": 13.61783218383789, + "learning_rate": 4.277236116998728e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8359323740005493, + "num_tokens": 38499630.0, + "step": 1010 + }, + { + "epoch": 0.1286095916550057, + "ewc_loss": 0.008694492280483246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.347245976532577e-06, + "grad_norm": 13.5987548828125, + "learning_rate": 4.281475201356507e-07, + "loss": 0.5299, + "mean_token_accuracy": 0.8336817026138306, + "num_tokens": 38538133.0, + "step": 1011 + }, + { + "epoch": 0.12873680193359624, + "ewc_loss": 0.008712186478078365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.356093086244073e-06, + 
"grad_norm": 13.610498428344727, + "learning_rate": 4.285714285714285e-07, + "loss": 0.564, + "mean_token_accuracy": 0.8239216804504395, + "num_tokens": 38576816.0, + "step": 1012 + }, + { + "epoch": 0.12886401221218674, + "ewc_loss": 0.008728161454200745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3640807234623935e-06, + "grad_norm": 13.646552085876465, + "learning_rate": 4.289953370072064e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.840866208076477, + "num_tokens": 38615270.0, + "step": 1013 + }, + { + "epoch": 0.12899122249077727, + "ewc_loss": 0.008710927329957485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.355463715910446e-06, + "grad_norm": 13.629209518432617, + "learning_rate": 4.294192454429843e-07, + "loss": 0.5537, + "mean_token_accuracy": 0.8248872756958008, + "num_tokens": 38654173.0, + "step": 1014 + }, + { + "epoch": 0.12911843276936777, + "ewc_loss": 0.008736992254853249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.368496320239501e-06, + "grad_norm": 13.612256050109863, + "learning_rate": 4.298431538787622e-07, + "loss": 0.438, + "mean_token_accuracy": 0.8617850542068481, + "num_tokens": 38695758.0, + "step": 1015 + }, + { + "epoch": 0.12924564304795827, + "ewc_loss": 0.008737212046980858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.368605914351065e-06, + "grad_norm": 13.70190143585205, + "learning_rate": 4.3026706231454e-07, + "loss": 0.5847, + "mean_token_accuracy": 0.8158849477767944, + "num_tokens": 38729524.0, + "step": 1016 + }, + { + "epoch": 0.1293728533265488, + "ewc_loss": 0.008787496946752071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.3937484406342264e-06, + "grad_norm": 13.685761451721191, + "learning_rate": 4.306909707503179e-07, + "loss": 0.5326, + "mean_token_accuracy": 0.8353900909423828, + "num_tokens": 38767500.0, + "step": 1017 + }, + { + "epoch": 0.1295000636051393, + "ewc_loss": 0.0087408646941185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.370432179712225e-06, + "grad_norm": 
13.65035343170166, + "learning_rate": 4.311148791860958e-07, + "loss": 0.5196, + "mean_token_accuracy": 0.834186851978302, + "num_tokens": 38798683.0, + "step": 1018 + }, + { + "epoch": 0.1296272738837298, + "ewc_loss": 0.008783663623034954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.39183168055024e-06, + "grad_norm": 13.713038444519043, + "learning_rate": 4.315387876218737e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8221777677536011, + "num_tokens": 38833653.0, + "step": 1019 + }, + { + "epoch": 0.12975448416232033, + "ewc_loss": 0.008788611739873886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.394305960886413e-06, + "grad_norm": 13.71780014038086, + "learning_rate": 4.319626960576515e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.8187406063079834, + "num_tokens": 38869428.0, + "step": 1020 + }, + { + "epoch": 0.12988169444091083, + "ewc_loss": 0.008785282261669636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.392641130834818e-06, + "grad_norm": 13.623566627502441, + "learning_rate": 4.323866044934294e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8369595408439636, + "num_tokens": 38909502.0, + "step": 1021 + }, + { + "epoch": 0.13000890471950133, + "ewc_loss": 0.00881283264607191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.406416337587871e-06, + "grad_norm": 13.659802436828613, + "learning_rate": 4.328105129292073e-07, + "loss": 0.5277, + "mean_token_accuracy": 0.8341860771179199, + "num_tokens": 38951730.0, + "step": 1022 + }, + { + "epoch": 0.13013611499809186, + "ewc_loss": 0.008825737051665783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.412868747749599e-06, + "grad_norm": 13.684835433959961, + "learning_rate": 4.332344213649852e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.8361222743988037, + "num_tokens": 38988981.0, + "step": 1023 + }, + { + "epoch": 0.13026332527668236, + "ewc_loss": 0.008842759765684605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.42137979916879e-06, + "grad_norm": 13.734890937805176, + 
"learning_rate": 4.33658329800763e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8392930030822754, + "num_tokens": 39026289.0, + "step": 1024 + }, + { + "epoch": 0.13039053555527286, + "ewc_loss": 0.008849631063640118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.424815415404737e-06, + "grad_norm": 13.65005111694336, + "learning_rate": 4.340822382365409e-07, + "loss": 0.504, + "mean_token_accuracy": 0.837814211845398, + "num_tokens": 39069113.0, + "step": 1025 + }, + { + "epoch": 0.13051774583386339, + "ewc_loss": 0.008845271542668343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.422635811351938e-06, + "grad_norm": 13.728899002075195, + "learning_rate": 4.345061466723188e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8448116779327393, + "num_tokens": 39112991.0, + "step": 1026 + }, + { + "epoch": 0.13064495611245389, + "ewc_loss": 0.008875498548150063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.43774933955865e-06, + "grad_norm": 13.785154342651367, + "learning_rate": 4.3493005510809663e-07, + "loss": 0.551, + "mean_token_accuracy": 0.8265418410301208, + "num_tokens": 39149593.0, + "step": 1027 + }, + { + "epoch": 0.1307721663910444, + "ewc_loss": 0.008850838989019394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.425419319886714e-06, + "grad_norm": 13.743760108947754, + "learning_rate": 4.353539635438745e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8374108076095581, + "num_tokens": 39186570.0, + "step": 1028 + }, + { + "epoch": 0.13089937666963491, + "ewc_loss": 0.008869503624737263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.434751645021606e-06, + "grad_norm": 13.744039535522461, + "learning_rate": 4.357778719796524e-07, + "loss": 0.5774, + "mean_token_accuracy": 0.8154549598693848, + "num_tokens": 39220666.0, + "step": 1029 + }, + { + "epoch": 0.13102658694822542, + "ewc_loss": 0.008891346864402294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4456733121478464e-06, + "grad_norm": 13.69520378112793, + "learning_rate": 
4.362017804154303e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8441919088363647, + "num_tokens": 39258372.0, + "step": 1030 + }, + { + "epoch": 0.13115379722681592, + "ewc_loss": 0.008880720473825932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.440360044100089e-06, + "grad_norm": 13.754561424255371, + "learning_rate": 4.366256888512081e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8443751335144043, + "num_tokens": 39293647.0, + "step": 1031 + }, + { + "epoch": 0.13128100750540644, + "ewc_loss": 0.008928820490837097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4644102672464214e-06, + "grad_norm": 13.76596450805664, + "learning_rate": 4.3704959728698597e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8334389925003052, + "num_tokens": 39332811.0, + "step": 1032 + }, + { + "epoch": 0.13140821778399694, + "ewc_loss": 0.008899185806512833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.449592779565137e-06, + "grad_norm": 13.728940963745117, + "learning_rate": 4.3747350572276386e-07, + "loss": 0.5221, + "mean_token_accuracy": 0.8335368037223816, + "num_tokens": 39365971.0, + "step": 1033 + }, + { + "epoch": 0.13153542806258745, + "ewc_loss": 0.008927897550165653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.463948698685272e-06, + "grad_norm": 13.742432594299316, + "learning_rate": 4.3789741415854176e-07, + "loss": 0.5367, + "mean_token_accuracy": 0.831298828125, + "num_tokens": 39403178.0, + "step": 1034 + }, + { + "epoch": 0.13166263834117797, + "ewc_loss": 0.00894238892942667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.471194642974297e-06, + "grad_norm": 13.762323379516602, + "learning_rate": 4.383213225943196e-07, + "loss": 0.5247, + "mean_token_accuracy": 0.8322654962539673, + "num_tokens": 39441303.0, + "step": 1035 + }, + { + "epoch": 0.13178984861976847, + "ewc_loss": 0.008936229161918163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4681146391667426e-06, + "grad_norm": 13.792359352111816, + "learning_rate": 
4.3874523103009746e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8507381677627563, + "num_tokens": 39477359.0, + "step": 1036 + }, + { + "epoch": 0.13191705889835897, + "ewc_loss": 0.008937936276197433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.4689681999443565e-06, + "grad_norm": 13.762007713317871, + "learning_rate": 4.3916913946587536e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8443933129310608, + "num_tokens": 39511687.0, + "step": 1037 + }, + { + "epoch": 0.1320442691769495, + "ewc_loss": 0.008961250074207783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.48062519353698e-06, + "grad_norm": 13.872828483581543, + "learning_rate": 4.3959304790165325e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8408399224281311, + "num_tokens": 39547453.0, + "step": 1038 + }, + { + "epoch": 0.13217147945554, + "ewc_loss": 0.00895304698497057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.476523372431984e-06, + "grad_norm": 13.701632499694824, + "learning_rate": 4.400169563374311e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.8356761932373047, + "num_tokens": 39588934.0, + "step": 1039 + }, + { + "epoch": 0.13229868973413053, + "ewc_loss": 0.008930033072829247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.465016445465153e-06, + "grad_norm": 13.786604881286621, + "learning_rate": 4.4044086477320895e-07, + "loss": 0.6061, + "mean_token_accuracy": 0.8138119578361511, + "num_tokens": 39627638.0, + "step": 1040 + }, + { + "epoch": 0.13242590001272103, + "ewc_loss": 0.008985880762338638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.492940206546336e-06, + "grad_norm": 13.798323631286621, + "learning_rate": 4.4086477320898685e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.836067259311676, + "num_tokens": 39661696.0, + "step": 1041 + }, + { + "epoch": 0.13255311029131153, + "ewc_loss": 0.008985347114503384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.492673724598717e-06, + "grad_norm": 13.80221939086914, + "learning_rate": 
4.4128868164476474e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8385932445526123, + "num_tokens": 39702357.0, + "step": 1042 + }, + { + "epoch": 0.13268032056990206, + "ewc_loss": 0.009005998261272907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.502999217947945e-06, + "grad_norm": 13.772073745727539, + "learning_rate": 4.417125900805426e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8407152891159058, + "num_tokens": 39744960.0, + "step": 1043 + }, + { + "epoch": 0.13280753084849256, + "ewc_loss": 0.009017646312713623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.508823167270748e-06, + "grad_norm": 13.840703010559082, + "learning_rate": 4.4213649851632044e-07, + "loss": 0.5637, + "mean_token_accuracy": 0.8248911499977112, + "num_tokens": 39785394.0, + "step": 1044 + }, + { + "epoch": 0.13293474112708306, + "ewc_loss": 0.009043718688189983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.52185940957861e-06, + "grad_norm": 13.848423957824707, + "learning_rate": 4.4256040695209834e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.831618070602417, + "num_tokens": 39822698.0, + "step": 1045 + }, + { + "epoch": 0.1330619514056736, + "ewc_loss": 0.009045769460499287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.522884864854859e-06, + "grad_norm": 13.873945236206055, + "learning_rate": 4.429843153878762e-07, + "loss": 0.5431, + "mean_token_accuracy": 0.827008068561554, + "num_tokens": 39856874.0, + "step": 1046 + }, + { + "epoch": 0.1331891616842641, + "ewc_loss": 0.009074777364730835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.537388576864032e-06, + "grad_norm": 13.905482292175293, + "learning_rate": 4.434082238236541e-07, + "loss": 0.575, + "mean_token_accuracy": 0.8185194134712219, + "num_tokens": 39899683.0, + "step": 1047 + }, + { + "epoch": 0.1333163719628546, + "ewc_loss": 0.00904387142509222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.521935807133559e-06, + "grad_norm": 13.762951850891113, + "learning_rate": 4.4383213225943193e-07, 
+ "loss": 0.5527, + "mean_token_accuracy": 0.827561616897583, + "num_tokens": 39940565.0, + "step": 1048 + }, + { + "epoch": 0.13344358224144512, + "ewc_loss": 0.009034420363605022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.517210072663147e-06, + "grad_norm": 13.835258483886719, + "learning_rate": 4.442560406952098e-07, + "loss": 0.5518, + "mean_token_accuracy": 0.8230055570602417, + "num_tokens": 39979010.0, + "step": 1049 + }, + { + "epoch": 0.13357079252003562, + "ewc_loss": 0.009103388525545597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.551694473775569e-06, + "grad_norm": 13.811380386352539, + "learning_rate": 4.4467994913098767e-07, + "loss": 0.5008, + "mean_token_accuracy": 0.8414767980575562, + "num_tokens": 40021151.0, + "step": 1050 + }, + { + "epoch": 0.13369800279862612, + "ewc_loss": 0.00908996257930994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.544981493381783e-06, + "grad_norm": 13.920730590820312, + "learning_rate": 4.4510385756676557e-07, + "loss": 0.5611, + "mean_token_accuracy": 0.8225639462471008, + "num_tokens": 40059324.0, + "step": 1051 + }, + { + "epoch": 0.13382521307721665, + "ewc_loss": 0.009145893156528473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.572946636471897e-06, + "grad_norm": 13.905744552612305, + "learning_rate": 4.455277660025434e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8354065418243408, + "num_tokens": 40104577.0, + "step": 1052 + }, + { + "epoch": 0.13395242335580715, + "ewc_loss": 0.009101553820073605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5507767936214805e-06, + "grad_norm": 13.901398658752441, + "learning_rate": 4.459516744383213e-07, + "loss": 0.5398, + "mean_token_accuracy": 0.8268069624900818, + "num_tokens": 40139734.0, + "step": 1053 + }, + { + "epoch": 0.13407963363439765, + "ewc_loss": 0.009129165671765804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5645829231943935e-06, + "grad_norm": 13.886850357055664, + "learning_rate": 4.4637558287409916e-07, + "loss": 
0.5364, + "mean_token_accuracy": 0.8295124173164368, + "num_tokens": 40180724.0, + "step": 1054 + }, + { + "epoch": 0.13420684391298818, + "ewc_loss": 0.009122650139033794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.561325113172643e-06, + "grad_norm": 13.867147445678711, + "learning_rate": 4.4679949130987706e-07, + "loss": 0.5341, + "mean_token_accuracy": 0.8363651037216187, + "num_tokens": 40220475.0, + "step": 1055 + }, + { + "epoch": 0.13433405419157868, + "ewc_loss": 0.009137718006968498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.568858912534779e-06, + "grad_norm": 13.926718711853027, + "learning_rate": 4.472233997456549e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8269345760345459, + "num_tokens": 40258208.0, + "step": 1056 + }, + { + "epoch": 0.13446126447016918, + "ewc_loss": 0.009165364317595959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.582682322507026e-06, + "grad_norm": 13.96058464050293, + "learning_rate": 4.476473081814328e-07, + "loss": 0.5294, + "mean_token_accuracy": 0.8315211534500122, + "num_tokens": 40299127.0, + "step": 1057 + }, + { + "epoch": 0.1345884747487597, + "ewc_loss": 0.00914840679615736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5742035581497476e-06, + "grad_norm": 13.931924819946289, + "learning_rate": 4.4807121661721065e-07, + "loss": 0.4875, + "mean_token_accuracy": 0.8422691822052002, + "num_tokens": 40339331.0, + "step": 1058 + }, + { + "epoch": 0.1347156850273502, + "ewc_loss": 0.009152360260486603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.576180344884051e-06, + "grad_norm": 13.990415573120117, + "learning_rate": 4.4849512505298855e-07, + "loss": 0.5323, + "mean_token_accuracy": 0.830143928527832, + "num_tokens": 40379386.0, + "step": 1059 + }, + { + "epoch": 0.1348428953059407, + "ewc_loss": 0.009148350916802883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.574175363813993e-06, + "grad_norm": 13.943065643310547, + "learning_rate": 4.489190334887664e-07, + "loss": 0.571, + 
"mean_token_accuracy": 0.8260332345962524, + "num_tokens": 40419868.0, + "step": 1060 + }, + { + "epoch": 0.13497010558453124, + "ewc_loss": 0.009165952913463116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.582976544043049e-06, + "grad_norm": 13.943622589111328, + "learning_rate": 4.493429419245443e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.8320031762123108, + "num_tokens": 40455346.0, + "step": 1061 + }, + { + "epoch": 0.13509731586312174, + "ewc_loss": 0.00916560459882021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.58280237580766e-06, + "grad_norm": 13.988329887390137, + "learning_rate": 4.4976685036032214e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8411473035812378, + "num_tokens": 40490087.0, + "step": 1062 + }, + { + "epoch": 0.13522452614171224, + "ewc_loss": 0.009179056622087955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.5895280891272705e-06, + "grad_norm": 13.887304306030273, + "learning_rate": 4.5019075879610004e-07, + "loss": 0.5546, + "mean_token_accuracy": 0.820783257484436, + "num_tokens": 40524799.0, + "step": 1063 + }, + { + "epoch": 0.13535173642030277, + "ewc_loss": 0.009174926206469536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.587463081406895e-06, + "grad_norm": 13.964868545532227, + "learning_rate": 4.506146672318779e-07, + "loss": 0.5286, + "mean_token_accuracy": 0.8319329023361206, + "num_tokens": 40563740.0, + "step": 1064 + }, + { + "epoch": 0.13547894669889327, + "ewc_loss": 0.009203964844346046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.601982254825998e-06, + "grad_norm": 13.891129493713379, + "learning_rate": 4.5103857566765573e-07, + "loss": 0.5814, + "mean_token_accuracy": 0.823082685470581, + "num_tokens": 40604743.0, + "step": 1065 + }, + { + "epoch": 0.1356061569774838, + "ewc_loss": 0.009206068702042103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.603034540195949e-06, + "grad_norm": 13.994683265686035, + "learning_rate": 4.5146248410343363e-07, + "loss": 0.4878, + 
"mean_token_accuracy": 0.8454699516296387, + "num_tokens": 40640909.0, + "step": 1066 + }, + { + "epoch": 0.1357333672560743, + "ewc_loss": 0.009241904132068157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.620952040568227e-06, + "grad_norm": 13.970565795898438, + "learning_rate": 4.5188639253921153e-07, + "loss": 0.495, + "mean_token_accuracy": 0.844117283821106, + "num_tokens": 40678930.0, + "step": 1067 + }, + { + "epoch": 0.1358605775346648, + "ewc_loss": 0.009230168536305428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.615084435499739e-06, + "grad_norm": 13.917909622192383, + "learning_rate": 4.523103009749894e-07, + "loss": 0.4895, + "mean_token_accuracy": 0.8438236713409424, + "num_tokens": 40715609.0, + "step": 1068 + }, + { + "epoch": 0.13598778781325532, + "ewc_loss": 0.00926006119698286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.630030616681324e-06, + "grad_norm": 14.000394821166992, + "learning_rate": 4.527342094107672e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8360523581504822, + "num_tokens": 40760533.0, + "step": 1069 + }, + { + "epoch": 0.13611499809184582, + "ewc_loss": 0.0092764962464571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.638247901311843e-06, + "grad_norm": 13.993982315063477, + "learning_rate": 4.531581178465451e-07, + "loss": 0.5407, + "mean_token_accuracy": 0.829304039478302, + "num_tokens": 40800039.0, + "step": 1070 + }, + { + "epoch": 0.13624220837043632, + "ewc_loss": 0.009268153458833694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.634076503862161e-06, + "grad_norm": 13.985947608947754, + "learning_rate": 4.53582026282323e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8408092856407166, + "num_tokens": 40840207.0, + "step": 1071 + }, + { + "epoch": 0.13636941864902685, + "ewc_loss": 0.009282844141125679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.64142203782103e-06, + "grad_norm": 13.99570369720459, + "learning_rate": 4.5400593471810087e-07, + "loss": 0.5866, + "mean_token_accuracy": 
0.8153833150863647, + "num_tokens": 40879559.0, + "step": 1072 + }, + { + "epoch": 0.13649662892761735, + "ewc_loss": 0.009283015504479408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.641507985070348e-06, + "grad_norm": 13.95140552520752, + "learning_rate": 4.544298431538787e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.8383944034576416, + "num_tokens": 40919608.0, + "step": 1073 + }, + { + "epoch": 0.13662383920620785, + "ewc_loss": 0.009285581298172474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6427908273471985e-06, + "grad_norm": 14.005651473999023, + "learning_rate": 4.548537515896566e-07, + "loss": 0.5749, + "mean_token_accuracy": 0.8190098404884338, + "num_tokens": 40953648.0, + "step": 1074 + }, + { + "epoch": 0.13675104948479838, + "ewc_loss": 0.009310691617429256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.655345946957823e-06, + "grad_norm": 13.993058204650879, + "learning_rate": 4.552776600254345e-07, + "loss": 0.5459, + "mean_token_accuracy": 0.8287808895111084, + "num_tokens": 40989675.0, + "step": 1075 + }, + { + "epoch": 0.13687825976338888, + "ewc_loss": 0.009322614409029484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.6613072299805935e-06, + "grad_norm": 14.0015869140625, + "learning_rate": 4.5570156846121236e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8423683643341064, + "num_tokens": 41027992.0, + "step": 1076 + }, + { + "epoch": 0.13700547004197938, + "ewc_loss": 0.009332257322967052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.666128461394692e-06, + "grad_norm": 14.028060913085938, + "learning_rate": 4.561254768969902e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8393007516860962, + "num_tokens": 41066925.0, + "step": 1077 + }, + { + "epoch": 0.1371326803205699, + "ewc_loss": 0.00933091714978218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.665458618546836e-06, + "grad_norm": 14.003893852233887, + "learning_rate": 4.565493853327681e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8406851291656494, + 
"num_tokens": 41105090.0, + "step": 1078 + }, + { + "epoch": 0.1372598905991604, + "ewc_loss": 0.009371569380164146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.685784915636759e-06, + "grad_norm": 14.097646713256836, + "learning_rate": 4.56973293768546e-07, + "loss": 0.5476, + "mean_token_accuracy": 0.8313796520233154, + "num_tokens": 41141924.0, + "step": 1079 + }, + { + "epoch": 0.1373871008777509, + "ewc_loss": 0.009359612129628658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.679805897467304e-06, + "grad_norm": 14.105934143066406, + "learning_rate": 4.573972022043238e-07, + "loss": 0.61, + "mean_token_accuracy": 0.8081446886062622, + "num_tokens": 41181512.0, + "step": 1080 + }, + { + "epoch": 0.13751431115634144, + "ewc_loss": 0.009355809539556503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.677904598793248e-06, + "grad_norm": 13.989190101623535, + "learning_rate": 4.578211106401017e-07, + "loss": 0.5573, + "mean_token_accuracy": 0.8233332633972168, + "num_tokens": 41224015.0, + "step": 1081 + }, + { + "epoch": 0.13764152143493194, + "ewc_loss": 0.00937581155449152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.687905857281294e-06, + "grad_norm": 14.094194412231445, + "learning_rate": 4.582450190758796e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.825271725654602, + "num_tokens": 41265568.0, + "step": 1082 + }, + { + "epoch": 0.13776873171352244, + "ewc_loss": 0.00939455907791853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.697279564425116e-06, + "grad_norm": 14.049311637878418, + "learning_rate": 4.586689275116575e-07, + "loss": 0.471, + "mean_token_accuracy": 0.8502591848373413, + "num_tokens": 41303595.0, + "step": 1083 + }, + { + "epoch": 0.13789594199211297, + "ewc_loss": 0.00936843827366829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.684219220507657e-06, + "grad_norm": 14.03512954711914, + "learning_rate": 4.590928359474353e-07, + "loss": 0.5772, + "mean_token_accuracy": 0.8213293552398682, + "num_tokens": 41340119.0, + 
"step": 1084 + }, + { + "epoch": 0.13802315227070347, + "ewc_loss": 0.009423869661986828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7119347073021345e-06, + "grad_norm": 14.059184074401855, + "learning_rate": 4.595167443832132e-07, + "loss": 0.5276, + "mean_token_accuracy": 0.8362373113632202, + "num_tokens": 41380184.0, + "step": 1085 + }, + { + "epoch": 0.13815036254929397, + "ewc_loss": 0.009414845146238804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.707422704086639e-06, + "grad_norm": 14.080291748046875, + "learning_rate": 4.599406528189911e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.8485331535339355, + "num_tokens": 41419038.0, + "step": 1086 + }, + { + "epoch": 0.1382775728278845, + "ewc_loss": 0.00942806713283062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7140333663264755e-06, + "grad_norm": 14.060782432556152, + "learning_rate": 4.60364561254769e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8337230086326599, + "num_tokens": 41455767.0, + "step": 1087 + }, + { + "epoch": 0.138404783106475, + "ewc_loss": 0.009436024352908134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.718012405646732e-06, + "grad_norm": 14.09068489074707, + "learning_rate": 4.607884696905468e-07, + "loss": 0.5115, + "mean_token_accuracy": 0.8364710807800293, + "num_tokens": 41497493.0, + "step": 1088 + }, + { + "epoch": 0.1385319933850655, + "ewc_loss": 0.009444545023143291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7222724788298365e-06, + "grad_norm": 14.103615760803223, + "learning_rate": 4.612123781263247e-07, + "loss": 0.5415, + "mean_token_accuracy": 0.82908695936203, + "num_tokens": 41542940.0, + "step": 1089 + }, + { + "epoch": 0.13865920366365603, + "ewc_loss": 0.00945286825299263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.726434326585149e-06, + "grad_norm": 14.08548641204834, + "learning_rate": 4.616362865621026e-07, + "loss": 0.4426, + "mean_token_accuracy": 0.8549007177352905, + "num_tokens": 41579302.0, + "step": 1090 + }, + { + 
"epoch": 0.13878641394224653, + "ewc_loss": 0.009452566504478455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.726283350464655e-06, + "grad_norm": 14.138419151306152, + "learning_rate": 4.620601949978805e-07, + "loss": 0.5421, + "mean_token_accuracy": 0.8300639986991882, + "num_tokens": 41617555.0, + "step": 1091 + }, + { + "epoch": 0.13891362422083706, + "ewc_loss": 0.00947456806898117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7372841436299495e-06, + "grad_norm": 14.155646324157715, + "learning_rate": 4.6248410343365827e-07, + "loss": 0.5916, + "mean_token_accuracy": 0.8129134178161621, + "num_tokens": 41654660.0, + "step": 1092 + }, + { + "epoch": 0.13904083449942756, + "ewc_loss": 0.009470860473811626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7354301386803854e-06, + "grad_norm": 14.131359100341797, + "learning_rate": 4.6290801186943617e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8411056995391846, + "num_tokens": 41693773.0, + "step": 1093 + }, + { + "epoch": 0.13916804477801806, + "ewc_loss": 0.009476915001869202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.738457391795237e-06, + "grad_norm": 14.15633773803711, + "learning_rate": 4.6333192030521407e-07, + "loss": 0.5138, + "mean_token_accuracy": 0.8402119278907776, + "num_tokens": 41728864.0, + "step": 1094 + }, + { + "epoch": 0.13929525505660859, + "ewc_loss": 0.009493195451796055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.746597824123455e-06, + "grad_norm": 14.106550216674805, + "learning_rate": 4.6375582874099196e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8446203470230103, + "num_tokens": 41770596.0, + "step": 1095 + }, + { + "epoch": 0.1394224653351991, + "ewc_loss": 0.0094917556270957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.745877959067002e-06, + "grad_norm": 14.168538093566895, + "learning_rate": 4.6417973717676976e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8425617218017578, + "num_tokens": 41806585.0, + "step": 1096 + }, + { + "epoch": 
0.1395496756137896, + "ewc_loss": 0.009514013305306435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7570065362378955e-06, + "grad_norm": 14.125447273254395, + "learning_rate": 4.6460364561254766e-07, + "loss": 0.5349, + "mean_token_accuracy": 0.829821765422821, + "num_tokens": 41845488.0, + "step": 1097 + }, + { + "epoch": 0.13967688589238011, + "ewc_loss": 0.00951583031564951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.757915121444967e-06, + "grad_norm": 14.163328170776367, + "learning_rate": 4.6502755404832556e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8388386368751526, + "num_tokens": 41880674.0, + "step": 1098 + }, + { + "epoch": 0.13980409617097062, + "ewc_loss": 0.009550376795232296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.775188244821038e-06, + "grad_norm": 14.20603084564209, + "learning_rate": 4.654514624841034e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8362094759941101, + "num_tokens": 41913020.0, + "step": 1099 + }, + { + "epoch": 0.13993130644956112, + "ewc_loss": 0.009555621072649956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.7778103180462494e-06, + "grad_norm": 14.192513465881348, + "learning_rate": 4.6587537091988125e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8370234966278076, + "num_tokens": 41948483.0, + "step": 1100 + }, + { + "epoch": 0.14005851672815164, + "ewc_loss": 0.009546525776386261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.773262844537385e-06, + "grad_norm": 14.259893417358398, + "learning_rate": 4.6629927935565915e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.844287633895874, + "num_tokens": 41985659.0, + "step": 1101 + }, + { + "epoch": 0.14018572700674214, + "ewc_loss": 0.009583410806953907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.791705578099936e-06, + "grad_norm": 14.183409690856934, + "learning_rate": 4.6672318779143705e-07, + "loss": 0.5904, + "mean_token_accuracy": 0.8154910802841187, + "num_tokens": 42027746.0, + "step": 1102 + }, + { + "epoch": 
0.14031293728533265, + "ewc_loss": 0.009567297995090485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.783648819284281e-06, + "grad_norm": 14.220212936401367, + "learning_rate": 4.671470962272149e-07, + "loss": 0.5468, + "mean_token_accuracy": 0.8263266086578369, + "num_tokens": 42060903.0, + "step": 1103 + }, + { + "epoch": 0.14044014756392317, + "ewc_loss": 0.009587152861058712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.793576408701483e-06, + "grad_norm": 14.203092575073242, + "learning_rate": 4.6757100466299274e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8325966596603394, + "num_tokens": 42102722.0, + "step": 1104 + }, + { + "epoch": 0.14056735784251367, + "ewc_loss": 0.009581208229064941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.790604180016089e-06, + "grad_norm": 14.204320907592773, + "learning_rate": 4.6799491309877064e-07, + "loss": 0.5499, + "mean_token_accuracy": 0.8239631056785583, + "num_tokens": 42143418.0, + "step": 1105 + }, + { + "epoch": 0.14069456812110417, + "ewc_loss": 0.00960476603358984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8023830458987504e-06, + "grad_norm": 14.260993957519531, + "learning_rate": 4.6841882153454854e-07, + "loss": 0.4896, + "mean_token_accuracy": 0.8417244553565979, + "num_tokens": 42177313.0, + "step": 1106 + }, + { + "epoch": 0.1408217783996947, + "ewc_loss": 0.00961158238351345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8057913772936445e-06, + "grad_norm": 14.276131629943848, + "learning_rate": 4.688427299703264e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8474429845809937, + "num_tokens": 42214158.0, + "step": 1107 + }, + { + "epoch": 0.1409489886782852, + "ewc_loss": 0.009619840420782566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.809920028492343e-06, + "grad_norm": 14.25014591217041, + "learning_rate": 4.6926663840610423e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8337125778198242, + "num_tokens": 42254189.0, + "step": 1108 + }, + { + "epoch": 
0.1410761989568757, + "ewc_loss": 0.009617205709218979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8086030801641755e-06, + "grad_norm": 14.280471801757812, + "learning_rate": 4.6969054684188213e-07, + "loss": 0.5501, + "mean_token_accuracy": 0.8231134414672852, + "num_tokens": 42296853.0, + "step": 1109 + }, + { + "epoch": 0.14120340923546623, + "ewc_loss": 0.009640076197683811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8200381570495665e-06, + "grad_norm": 14.24014949798584, + "learning_rate": 4.7011445527766003e-07, + "loss": 0.5202, + "mean_token_accuracy": 0.8322756886482239, + "num_tokens": 42331983.0, + "step": 1110 + }, + { + "epoch": 0.14133061951405673, + "ewc_loss": 0.009644592180848122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.822295977646718e-06, + "grad_norm": 14.238303184509277, + "learning_rate": 4.7053836371343787e-07, + "loss": 0.5291, + "mean_token_accuracy": 0.8347753286361694, + "num_tokens": 42370496.0, + "step": 1111 + }, + { + "epoch": 0.14145782979264723, + "ewc_loss": 0.009668337181210518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.834168521483662e-06, + "grad_norm": 14.285630226135254, + "learning_rate": 4.709622721492157e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8376057147979736, + "num_tokens": 42406489.0, + "step": 1112 + }, + { + "epoch": 0.14158504007123776, + "ewc_loss": 0.00969000905752182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.845004696107935e-06, + "grad_norm": 14.317304611206055, + "learning_rate": 4.713861805849936e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.837196409702301, + "num_tokens": 42440679.0, + "step": 1113 + }, + { + "epoch": 0.14171225034982826, + "ewc_loss": 0.009688694030046463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.844347131438553e-06, + "grad_norm": 14.316288948059082, + "learning_rate": 4.718100890207715e-07, + "loss": 0.5813, + "mean_token_accuracy": 0.8166797757148743, + "num_tokens": 42476699.0, + "step": 1114 + }, + { + "epoch": 
0.1418394606284188, + "ewc_loss": 0.009677381254732609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.838690529140877e-06, + "grad_norm": 14.252989768981934, + "learning_rate": 4.7223399745654936e-07, + "loss": 0.5375, + "mean_token_accuracy": 0.8300337195396423, + "num_tokens": 42513348.0, + "step": 1115 + }, + { + "epoch": 0.1419666709070093, + "ewc_loss": 0.009715071879327297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8575357141089626e-06, + "grad_norm": 14.30615234375, + "learning_rate": 4.726579058923272e-07, + "loss": 0.5278, + "mean_token_accuracy": 0.8351281881332397, + "num_tokens": 42550633.0, + "step": 1116 + }, + { + "epoch": 0.1420938811855998, + "ewc_loss": 0.009710656479001045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.85532837046776e-06, + "grad_norm": 14.217012405395508, + "learning_rate": 4.730818143281051e-07, + "loss": 0.5518, + "mean_token_accuracy": 0.8287283182144165, + "num_tokens": 42593649.0, + "step": 1117 + }, + { + "epoch": 0.14222109146419032, + "ewc_loss": 0.00972024630755186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8601232265355065e-06, + "grad_norm": 14.28487491607666, + "learning_rate": 4.7350572276388295e-07, + "loss": 0.4996, + "mean_token_accuracy": 0.8413839340209961, + "num_tokens": 42632379.0, + "step": 1118 + }, + { + "epoch": 0.14234830174278082, + "ewc_loss": 0.009760727174580097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.880363576376112e-06, + "grad_norm": 14.24125862121582, + "learning_rate": 4.7392963119966085e-07, + "loss": 0.5257, + "mean_token_accuracy": 0.8328539133071899, + "num_tokens": 42670319.0, + "step": 1119 + }, + { + "epoch": 0.14247551202137132, + "ewc_loss": 0.009747300297021866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.873650141234975e-06, + "grad_norm": 14.272563934326172, + "learning_rate": 4.7435353963543875e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8372066020965576, + "num_tokens": 42712680.0, + "step": 1120 + }, + { + "epoch": 0.14260272229996185, + 
"ewc_loss": 0.009765218943357468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.8826095735421404e-06, + "grad_norm": 14.226621627807617, + "learning_rate": 4.747774480712166e-07, + "loss": 0.536, + "mean_token_accuracy": 0.8329750895500183, + "num_tokens": 42748241.0, + "step": 1121 + }, + { + "epoch": 0.14272993257855235, + "ewc_loss": 0.009759167209267616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.879583684669342e-06, + "grad_norm": 14.309849739074707, + "learning_rate": 4.7520135650699444e-07, + "loss": 0.5571, + "mean_token_accuracy": 0.8254579305648804, + "num_tokens": 42783208.0, + "step": 1122 + }, + { + "epoch": 0.14285714285714285, + "ewc_loss": 0.009803314693272114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.901657575828722e-06, + "grad_norm": 14.31198501586914, + "learning_rate": 4.7562526494277234e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.832974910736084, + "num_tokens": 42818570.0, + "step": 1123 + }, + { + "epoch": 0.14298435313573338, + "ewc_loss": 0.009825966320931911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.912983058602549e-06, + "grad_norm": 14.317505836486816, + "learning_rate": 4.7604917337855024e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8486739993095398, + "num_tokens": 42852891.0, + "step": 1124 + }, + { + "epoch": 0.14311156341432388, + "ewc_loss": 0.009806469082832336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.903234639641596e-06, + "grad_norm": 14.32647705078125, + "learning_rate": 4.764730818143281e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8290985226631165, + "num_tokens": 42887359.0, + "step": 1125 + }, + { + "epoch": 0.14323877369291438, + "ewc_loss": 0.009836941957473755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.918470949633047e-06, + "grad_norm": 14.317757606506348, + "learning_rate": 4.768969902501059e-07, + "loss": 0.5738, + "mean_token_accuracy": 0.8192880153656006, + "num_tokens": 42929308.0, + "step": 1126 + }, + { + "epoch": 0.1433659839715049, + "ewc_loss": 
0.009823878295719624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.911938958684914e-06, + "grad_norm": 14.327916145324707, + "learning_rate": 4.773208986858838e-07, + "loss": 0.5226, + "mean_token_accuracy": 0.8325586318969727, + "num_tokens": 42969373.0, + "step": 1127 + }, + { + "epoch": 0.1434931942500954, + "ewc_loss": 0.009846373461186886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.92318667966174e-06, + "grad_norm": 14.307252883911133, + "learning_rate": 4.777448071216617e-07, + "loss": 0.5264, + "mean_token_accuracy": 0.8346211314201355, + "num_tokens": 43010133.0, + "step": 1128 + }, + { + "epoch": 0.1436204045286859, + "ewc_loss": 0.009870345704257488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.935172910336405e-06, + "grad_norm": 14.444864273071289, + "learning_rate": 4.781687155574396e-07, + "loss": 0.5541, + "mean_token_accuracy": 0.8260529041290283, + "num_tokens": 43053743.0, + "step": 1129 + }, + { + "epoch": 0.14374761480727644, + "ewc_loss": 0.009881917387247086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9409586608817335e-06, + "grad_norm": 14.351634979248047, + "learning_rate": 4.785926239932175e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8427733182907104, + "num_tokens": 43094125.0, + "step": 1130 + }, + { + "epoch": 0.14387482508586694, + "ewc_loss": 0.009841838851571083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.92091930937022e-06, + "grad_norm": 14.375869750976562, + "learning_rate": 4.790165324289953e-07, + "loss": 0.5208, + "mean_token_accuracy": 0.8325220346450806, + "num_tokens": 43132399.0, + "step": 1131 + }, + { + "epoch": 0.14400203536445744, + "ewc_loss": 0.009888686239719391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9443433454143815e-06, + "grad_norm": 14.455394744873047, + "learning_rate": 4.794404408647732e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.8324418067932129, + "num_tokens": 43169036.0, + "step": 1132 + }, + { + "epoch": 0.14412924564304797, + "ewc_loss": 0.009873246774077415, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.936623554385733e-06, + "grad_norm": 14.333703994750977, + "learning_rate": 4.798643493005511e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8504175543785095, + "num_tokens": 43209397.0, + "step": 1133 + }, + { + "epoch": 0.14425645592163847, + "ewc_loss": 0.009891380555927753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.945690307067707e-06, + "grad_norm": 14.420368194580078, + "learning_rate": 4.80288257736329e-07, + "loss": 0.5625, + "mean_token_accuracy": 0.8241850137710571, + "num_tokens": 43246788.0, + "step": 1134 + }, + { + "epoch": 0.14438366620022897, + "ewc_loss": 0.009906020015478134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.953009920427576e-06, + "grad_norm": 14.390603065490723, + "learning_rate": 4.807121661721068e-07, + "loss": 0.4729, + "mean_token_accuracy": 0.8485621809959412, + "num_tokens": 43282774.0, + "step": 1135 + }, + { + "epoch": 0.1445108764788195, + "ewc_loss": 0.00990871712565422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.954358701070305e-06, + "grad_norm": 14.36614990234375, + "learning_rate": 4.811360746078847e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8368664979934692, + "num_tokens": 43322303.0, + "step": 1136 + }, + { + "epoch": 0.14463808675741, + "ewc_loss": 0.009912285022437572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9561426749278326e-06, + "grad_norm": 14.445779800415039, + "learning_rate": 4.815599830436625e-07, + "loss": 0.5235, + "mean_token_accuracy": 0.8341408967971802, + "num_tokens": 43354611.0, + "step": 1137 + }, + { + "epoch": 0.1447652970360005, + "ewc_loss": 0.009932282380759716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.966141204931773e-06, + "grad_norm": 14.400945663452148, + "learning_rate": 4.819838914794405e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8394832015037537, + "num_tokens": 43390640.0, + "step": 1138 + }, + { + "epoch": 0.14489250731459102, + "ewc_loss": 0.009920700453221798, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.960350452165585e-06, + "grad_norm": 14.448420524597168, + "learning_rate": 4.824077999152183e-07, + "loss": 0.5261, + "mean_token_accuracy": 0.8298512697219849, + "num_tokens": 43424513.0, + "step": 1139 + }, + { + "epoch": 0.14501971759318152, + "ewc_loss": 0.009936925023794174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.968462690158049e-06, + "grad_norm": 14.435724258422852, + "learning_rate": 4.828317083509962e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.8312188982963562, + "num_tokens": 43461699.0, + "step": 1140 + }, + { + "epoch": 0.14514692787177205, + "ewc_loss": 0.009952405467629433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.976202944817487e-06, + "grad_norm": 14.492993354797363, + "learning_rate": 4.83255616786774e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.8437352776527405, + "num_tokens": 43494681.0, + "step": 1141 + }, + { + "epoch": 0.14527413815036255, + "ewc_loss": 0.009954061359167099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9770305849961005e-06, + "grad_norm": 14.415818214416504, + "learning_rate": 4.83679525222552e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8474551439285278, + "num_tokens": 43531318.0, + "step": 1142 + }, + { + "epoch": 0.14540134842895305, + "ewc_loss": 0.009963298216462135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9816489990917034e-06, + "grad_norm": 14.456171035766602, + "learning_rate": 4.841034336583298e-07, + "loss": 0.4954, + "mean_token_accuracy": 0.8470645546913147, + "num_tokens": 43569991.0, + "step": 1143 + }, + { + "epoch": 0.14552855870754358, + "ewc_loss": 0.009988043457269669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.994021765014622e-06, + "grad_norm": 14.468841552734375, + "learning_rate": 4.845273420941076e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8482561111450195, + "num_tokens": 43609978.0, + "step": 1144 + }, + { + "epoch": 0.14565576898613408, + "ewc_loss": 0.009981243871152401, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 4.990622073819395e-06, + "grad_norm": 14.51187801361084, + "learning_rate": 4.849512505298855e-07, + "loss": 0.5338, + "mean_token_accuracy": 0.8274901509284973, + "num_tokens": 43640419.0, + "step": 1145 + }, + { + "epoch": 0.14578297926472458, + "ewc_loss": 0.009992547333240509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.996273673896212e-06, + "grad_norm": 14.42911148071289, + "learning_rate": 4.853751589656634e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8438313603401184, + "num_tokens": 43682408.0, + "step": 1146 + }, + { + "epoch": 0.1459101895433151, + "ewc_loss": 0.009996061213314533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9980308176600374e-06, + "grad_norm": 14.487724304199219, + "learning_rate": 4.857990674014413e-07, + "loss": 0.534, + "mean_token_accuracy": 0.8296401500701904, + "num_tokens": 43726158.0, + "step": 1147 + }, + { + "epoch": 0.1460373998219056, + "ewc_loss": 0.010000525042414665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.000262717658188e-06, + "grad_norm": 14.407257080078125, + "learning_rate": 4.862229758372191e-07, + "loss": 0.5651, + "mean_token_accuracy": 0.8193133473396301, + "num_tokens": 43766466.0, + "step": 1148 + }, + { + "epoch": 0.1461646101004961, + "ewc_loss": 0.009992800652980804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 4.9964000936597586e-06, + "grad_norm": 14.444337844848633, + "learning_rate": 4.86646884272997e-07, + "loss": 0.5492, + "mean_token_accuracy": 0.8243252635002136, + "num_tokens": 43807255.0, + "step": 1149 + }, + { + "epoch": 0.14629182037908664, + "ewc_loss": 0.010049975477159023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.024987785873236e-06, + "grad_norm": 14.564811706542969, + "learning_rate": 4.870707927087749e-07, + "loss": 0.5287, + "mean_token_accuracy": 0.8315012454986572, + "num_tokens": 43845419.0, + "step": 1150 + }, + { + "epoch": 0.14641903065767714, + "ewc_loss": 0.01004960760474205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.024803613196127e-06, + "grad_norm": 14.496149063110352, + "learning_rate": 4.874947011445528e-07, + "loss": 0.538, + "mean_token_accuracy": 0.8330004215240479, + "num_tokens": 43882942.0, + "step": 1151 + }, + { + "epoch": 0.14654624093626764, + "ewc_loss": 0.010019938461482525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.00996929986286e-06, + "grad_norm": 14.47654914855957, + "learning_rate": 4.879186095803306e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.855011522769928, + "num_tokens": 43923109.0, + "step": 1152 + }, + { + "epoch": 0.14667345121485817, + "ewc_loss": 0.010069204494357109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.034602054365678e-06, + "grad_norm": 14.569666862487793, + "learning_rate": 4.883425180161085e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8407739400863647, + "num_tokens": 43959752.0, + "step": 1153 + }, + { + "epoch": 0.14680066149344867, + "ewc_loss": 0.01003353763371706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.016768682253314e-06, + "grad_norm": 14.454692840576172, + "learning_rate": 4.887664264518864e-07, + "loss": 0.5422, + "mean_token_accuracy": 0.8340466022491455, + "num_tokens": 43996247.0, + "step": 1154 + }, + { + "epoch": 0.14692787177203917, + "ewc_loss": 0.010035957209765911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.017978764954023e-06, + "grad_norm": 14.425912857055664, + "learning_rate": 4.891903348876643e-07, + "loss": 0.5225, + "mean_token_accuracy": 0.8376102447509766, + "num_tokens": 44035751.0, + "step": 1155 + }, + { + "epoch": 0.1470550820506297, + "ewc_loss": 0.01007908396422863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.039541974838357e-06, + "grad_norm": 14.503690719604492, + "learning_rate": 4.896142433234421e-07, + "loss": 0.558, + "mean_token_accuracy": 0.8288617134094238, + "num_tokens": 44073225.0, + "step": 1156 + }, + { + "epoch": 0.1471822923292202, + "ewc_loss": 0.010083384811878204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.0416924750606995e-06, + 
"grad_norm": 14.534591674804688, + "learning_rate": 4.9003815175922e-07, + "loss": 0.5616, + "mean_token_accuracy": 0.8291875123977661, + "num_tokens": 44111761.0, + "step": 1157 + }, + { + "epoch": 0.1473095026078107, + "ewc_loss": 0.010092586278915405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.046293154009618e-06, + "grad_norm": 14.511506080627441, + "learning_rate": 4.904620601949979e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8377161622047424, + "num_tokens": 44151311.0, + "step": 1158 + }, + { + "epoch": 0.14743671288640123, + "ewc_loss": 0.010097612626850605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.048806542617967e-06, + "grad_norm": 14.547115325927734, + "learning_rate": 4.908859686307758e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8268142938613892, + "num_tokens": 44184623.0, + "step": 1159 + }, + { + "epoch": 0.14756392316499173, + "ewc_loss": 0.010097092017531395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.048545972385909e-06, + "grad_norm": 14.425410270690918, + "learning_rate": 4.913098770665536e-07, + "loss": 0.5433, + "mean_token_accuracy": 0.8319092988967896, + "num_tokens": 44221753.0, + "step": 1160 + }, + { + "epoch": 0.14769113344358223, + "ewc_loss": 0.010096831247210503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.048415459896205e-06, + "grad_norm": 14.534953117370605, + "learning_rate": 4.917337855023314e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8386896848678589, + "num_tokens": 44262542.0, + "step": 1161 + }, + { + "epoch": 0.14781834372217276, + "ewc_loss": 0.01013443898409605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.06721926285536e-06, + "grad_norm": 14.563644409179688, + "learning_rate": 4.921576939381094e-07, + "loss": 0.5201, + "mean_token_accuracy": 0.839801549911499, + "num_tokens": 44300294.0, + "step": 1162 + }, + { + "epoch": 0.14794555400076326, + "ewc_loss": 0.010135144926607609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.067572601546999e-06, + "grad_norm": 
14.570244789123535, + "learning_rate": 4.925816023738872e-07, + "loss": 0.5651, + "mean_token_accuracy": 0.8218849301338196, + "num_tokens": 44338476.0, + "step": 1163 + }, + { + "epoch": 0.14807276427935376, + "ewc_loss": 0.010150887072086334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.075443368696142e-06, + "grad_norm": 14.611701011657715, + "learning_rate": 4.930055108096651e-07, + "loss": 0.4687, + "mean_token_accuracy": 0.849895715713501, + "num_tokens": 44367437.0, + "step": 1164 + }, + { + "epoch": 0.1481999745579443, + "ewc_loss": 0.010158331133425236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.079165475763148e-06, + "grad_norm": 14.57099437713623, + "learning_rate": 4.934294192454429e-07, + "loss": 0.5773, + "mean_token_accuracy": 0.8190500140190125, + "num_tokens": 44402636.0, + "step": 1165 + }, + { + "epoch": 0.1483271848365348, + "ewc_loss": 0.010162544436752796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.081272320239805e-06, + "grad_norm": 14.564095497131348, + "learning_rate": 4.938533276812209e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8368186354637146, + "num_tokens": 44438187.0, + "step": 1166 + }, + { + "epoch": 0.14845439511512531, + "ewc_loss": 0.010193060152232647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.096530003356747e-06, + "grad_norm": 14.615513801574707, + "learning_rate": 4.942772361169987e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.853522539138794, + "num_tokens": 44478345.0, + "step": 1167 + }, + { + "epoch": 0.14858160539371582, + "ewc_loss": 0.010189751163125038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.094875632494222e-06, + "grad_norm": 14.525199890136719, + "learning_rate": 4.947011445527766e-07, + "loss": 0.5243, + "mean_token_accuracy": 0.8346248865127563, + "num_tokens": 44520272.0, + "step": 1168 + }, + { + "epoch": 0.14870881567230632, + "ewc_loss": 0.010184810496866703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.092405444884207e-06, + "grad_norm": 14.510246276855469, + 
"learning_rate": 4.951250529885544e-07, + "loss": 0.4728, + "mean_token_accuracy": 0.8511554002761841, + "num_tokens": 44561058.0, + "step": 1169 + }, + { + "epoch": 0.14883602595089684, + "ewc_loss": 0.01021288987249136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.106444859848125e-06, + "grad_norm": 14.622576713562012, + "learning_rate": 4.955489614243324e-07, + "loss": 0.6086, + "mean_token_accuracy": 0.8073527216911316, + "num_tokens": 44599371.0, + "step": 1170 + }, + { + "epoch": 0.14896323622948734, + "ewc_loss": 0.010223513469099998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.11175676365383e-06, + "grad_norm": 14.563676834106445, + "learning_rate": 4.959728698601102e-07, + "loss": 0.5178, + "mean_token_accuracy": 0.8281967043876648, + "num_tokens": 44640499.0, + "step": 1171 + }, + { + "epoch": 0.14909044650807785, + "ewc_loss": 0.010217501781880856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.10875088366447e-06, + "grad_norm": 14.534806251525879, + "learning_rate": 4.963967782958881e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8403109908103943, + "num_tokens": 44683164.0, + "step": 1172 + }, + { + "epoch": 0.14921765678666837, + "ewc_loss": 0.010267015546560287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.13350778419408e-06, + "grad_norm": 14.673696517944336, + "learning_rate": 4.968206867316659e-07, + "loss": 0.5199, + "mean_token_accuracy": 0.8328410387039185, + "num_tokens": 44721938.0, + "step": 1173 + }, + { + "epoch": 0.14934486706525887, + "ewc_loss": 0.01026912871748209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.134564162290189e-06, + "grad_norm": 14.633214950561523, + "learning_rate": 4.972445951674439e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8421601057052612, + "num_tokens": 44753506.0, + "step": 1174 + }, + { + "epoch": 0.14947207734384937, + "ewc_loss": 0.010283651761710644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.141826022736495e-06, + "grad_norm": 14.72677993774414, + "learning_rate": 
4.976685036032216e-07, + "loss": 0.5796, + "mean_token_accuracy": 0.8172636032104492, + "num_tokens": 44792170.0, + "step": 1175 + }, + { + "epoch": 0.1495992876224399, + "ewc_loss": 0.010269579477608204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.134789716976229e-06, + "grad_norm": 14.640303611755371, + "learning_rate": 4.980924120389996e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8564695119857788, + "num_tokens": 44830733.0, + "step": 1176 + }, + { + "epoch": 0.1497264979010304, + "ewc_loss": 0.010263188742101192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.13159420734155e-06, + "grad_norm": 14.692985534667969, + "learning_rate": 4.985163204747774e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8497076630592346, + "num_tokens": 44869743.0, + "step": 1177 + }, + { + "epoch": 0.1498537081796209, + "ewc_loss": 0.010296237654983997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.148118816578062e-06, + "grad_norm": 14.66529369354248, + "learning_rate": 4.989402289105554e-07, + "loss": 0.5162, + "mean_token_accuracy": 0.8414629101753235, + "num_tokens": 44913358.0, + "step": 1178 + }, + { + "epoch": 0.14998091845821143, + "ewc_loss": 0.010273168794810772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.136584604770178e-06, + "grad_norm": 14.705511093139648, + "learning_rate": 4.993641373463331e-07, + "loss": 0.576, + "mean_token_accuracy": 0.8179205656051636, + "num_tokens": 44949963.0, + "step": 1179 + }, + { + "epoch": 0.15010812873680193, + "ewc_loss": 0.010315930470824242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.157965006219456e-06, + "grad_norm": 14.654999732971191, + "learning_rate": 4.997880457821111e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.8302448391914368, + "num_tokens": 44988569.0, + "step": 1180 + }, + { + "epoch": 0.15023533901539243, + "ewc_loss": 0.010293888859450817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1469442041707225e-06, + "grad_norm": 14.625802040100098, + "learning_rate": 5.002119542178889e-07, + 
"loss": 0.5072, + "mean_token_accuracy": 0.837849497795105, + "num_tokens": 45032634.0, + "step": 1181 + }, + { + "epoch": 0.15036254929398296, + "ewc_loss": 0.010331005789339542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.16550289830775e-06, + "grad_norm": 14.727001190185547, + "learning_rate": 5.006358626536667e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8492792844772339, + "num_tokens": 45069685.0, + "step": 1182 + }, + { + "epoch": 0.15048975957257346, + "ewc_loss": 0.010329230688512325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.16461523147882e-06, + "grad_norm": 14.711939811706543, + "learning_rate": 5.010597710894446e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8412163257598877, + "num_tokens": 45107481.0, + "step": 1183 + }, + { + "epoch": 0.15061696985116396, + "ewc_loss": 0.010347909294068813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.173954832571326e-06, + "grad_norm": 14.724849700927734, + "learning_rate": 5.014836795252225e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8463059067726135, + "num_tokens": 45145747.0, + "step": 1184 + }, + { + "epoch": 0.1507441801297545, + "ewc_loss": 0.010353931225836277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.176965714781545e-06, + "grad_norm": 14.767545700073242, + "learning_rate": 5.019075879610004e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8471256494522095, + "num_tokens": 45183140.0, + "step": 1185 + }, + { + "epoch": 0.150871390408345, + "ewc_loss": 0.010342566296458244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.171283191884868e-06, + "grad_norm": 14.644693374633789, + "learning_rate": 5.023314963967783e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8330718278884888, + "num_tokens": 45224911.0, + "step": 1186 + }, + { + "epoch": 0.1509986006869355, + "ewc_loss": 0.010361934080719948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.180967036721995e-06, + "grad_norm": 14.794346809387207, + "learning_rate": 5.027554048325562e-07, + "loss": 0.5802, + 
"mean_token_accuracy": 0.8180077075958252, + "num_tokens": 45257388.0, + "step": 1187 + }, + { + "epoch": 0.15112581096552602, + "ewc_loss": 0.010399880819022655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.199940460443031e-06, + "grad_norm": 14.723896980285645, + "learning_rate": 5.03179313268334e-07, + "loss": 0.5652, + "mean_token_accuracy": 0.8217133283615112, + "num_tokens": 45297876.0, + "step": 1188 + }, + { + "epoch": 0.15125302124411652, + "ewc_loss": 0.010351542383432388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.175771093490766e-06, + "grad_norm": 14.734660148620605, + "learning_rate": 5.036032217041119e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8275936841964722, + "num_tokens": 45334040.0, + "step": 1189 + }, + { + "epoch": 0.15138023152270705, + "ewc_loss": 0.010399837046861649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.1999186325701885e-06, + "grad_norm": 14.801708221435547, + "learning_rate": 5.040271301398897e-07, + "loss": 0.524, + "mean_token_accuracy": 0.8313502669334412, + "num_tokens": 45375985.0, + "step": 1190 + }, + { + "epoch": 0.15150744180129755, + "ewc_loss": 0.010397035628557205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.198518010729458e-06, + "grad_norm": 14.789399147033691, + "learning_rate": 5.044510385756676e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.839952826499939, + "num_tokens": 45419951.0, + "step": 1191 + }, + { + "epoch": 0.15163465207988805, + "ewc_loss": 0.010385471396148205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.192735898162937e-06, + "grad_norm": 14.794001579284668, + "learning_rate": 5.048749470114455e-07, + "loss": 0.5062, + "mean_token_accuracy": 0.8402603268623352, + "num_tokens": 45456988.0, + "step": 1192 + }, + { + "epoch": 0.15176186235847858, + "ewc_loss": 0.010410581715404987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.205291017773561e-06, + "grad_norm": 14.849715232849121, + "learning_rate": 5.052988554472234e-07, + "loss": 0.5237, + 
"mean_token_accuracy": 0.8312366008758545, + "num_tokens": 45488226.0, + "step": 1193 + }, + { + "epoch": 0.15188907263706908, + "ewc_loss": 0.010394538752734661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.197269274503924e-06, + "grad_norm": 14.737082481384277, + "learning_rate": 5.057227638830013e-07, + "loss": 0.5209, + "mean_token_accuracy": 0.8364536166191101, + "num_tokens": 45524661.0, + "step": 1194 + }, + { + "epoch": 0.15201628291565958, + "ewc_loss": 0.01039558183401823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.197790869715391e-06, + "grad_norm": 14.783838272094727, + "learning_rate": 5.061466723187792e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8359779119491577, + "num_tokens": 45563491.0, + "step": 1195 + }, + { + "epoch": 0.1521434931942501, + "ewc_loss": 0.010432128794491291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.216064437263412e-06, + "grad_norm": 14.805270195007324, + "learning_rate": 5.065705807545569e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8443660736083984, + "num_tokens": 45596771.0, + "step": 1196 + }, + { + "epoch": 0.1522707034728406, + "ewc_loss": 0.010424135252833366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.212067662796471e-06, + "grad_norm": 14.784178733825684, + "learning_rate": 5.069944891903349e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8368678092956543, + "num_tokens": 45640049.0, + "step": 1197 + }, + { + "epoch": 0.1523979137514311, + "ewc_loss": 0.010444714687764645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.222357231104979e-06, + "grad_norm": 14.787969589233398, + "learning_rate": 5.074183976261127e-07, + "loss": 0.512, + "mean_token_accuracy": 0.835692286491394, + "num_tokens": 45677375.0, + "step": 1198 + }, + { + "epoch": 0.15252512403002164, + "ewc_loss": 0.010474764741957188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.237382538325619e-06, + "grad_norm": 14.795394897460938, + "learning_rate": 5.078423060618906e-07, + "loss": 0.5151, + "mean_token_accuracy": 
0.8375033140182495, + "num_tokens": 45719253.0, + "step": 1199 + }, + { + "epoch": 0.15265233430861214, + "ewc_loss": 0.010479923337697983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.239961865299847e-06, + "grad_norm": 14.800464630126953, + "learning_rate": 5.082662144976685e-07, + "loss": 0.5125, + "mean_token_accuracy": 0.834069013595581, + "num_tokens": 45756065.0, + "step": 1200 + }, + { + "epoch": 0.15277954458720264, + "ewc_loss": 0.010505066253244877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.252533355815103e-06, + "grad_norm": 14.78856086730957, + "learning_rate": 5.086901229334464e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8435958027839661, + "num_tokens": 45791707.0, + "step": 1201 + }, + { + "epoch": 0.15290675486579317, + "ewc_loss": 0.01049207616597414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.246038199402392e-06, + "grad_norm": 14.790546417236328, + "learning_rate": 5.091140313692243e-07, + "loss": 0.5159, + "mean_token_accuracy": 0.8356097936630249, + "num_tokens": 45828464.0, + "step": 1202 + }, + { + "epoch": 0.15303396514438367, + "ewc_loss": 0.010511843487620354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.255921678326558e-06, + "grad_norm": 14.821137428283691, + "learning_rate": 5.095379398050022e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.8402205109596252, + "num_tokens": 45867046.0, + "step": 1203 + }, + { + "epoch": 0.15316117542297417, + "ewc_loss": 0.010525564663112164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.262782451609382e-06, + "grad_norm": 14.86142635345459, + "learning_rate": 5.099618482407799e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.833458423614502, + "num_tokens": 45900530.0, + "step": 1204 + }, + { + "epoch": 0.1532883857015647, + "ewc_loss": 0.010548622347414494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.274311206449056e-06, + "grad_norm": 14.782593727111816, + "learning_rate": 5.103857566765578e-07, + "loss": 0.5098, + "mean_token_accuracy": 0.8397952318191528, + 
"num_tokens": 45940002.0, + "step": 1205 + }, + { + "epoch": 0.1534155959801552, + "ewc_loss": 0.010546945966780186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.273473107081372e-06, + "grad_norm": 14.855162620544434, + "learning_rate": 5.108096651123357e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8502705693244934, + "num_tokens": 45975299.0, + "step": 1206 + }, + { + "epoch": 0.1535428062587457, + "ewc_loss": 0.010570274665951729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.28513737663161e-06, + "grad_norm": 14.841928482055664, + "learning_rate": 5.112335735481135e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.8359403610229492, + "num_tokens": 46012693.0, + "step": 1207 + }, + { + "epoch": 0.15367001653733622, + "ewc_loss": 0.010566074401140213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.283037353365216e-06, + "grad_norm": 14.790352821350098, + "learning_rate": 5.116574819838915e-07, + "loss": 0.4115, + "mean_token_accuracy": 0.868578314781189, + "num_tokens": 46051558.0, + "step": 1208 + }, + { + "epoch": 0.15379722681592672, + "ewc_loss": 0.010570052079856396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.285025963530643e-06, + "grad_norm": 14.848435401916504, + "learning_rate": 5.120813904196693e-07, + "loss": 0.4965, + "mean_token_accuracy": 0.8424698114395142, + "num_tokens": 46087173.0, + "step": 1209 + }, + { + "epoch": 0.15392443709451722, + "ewc_loss": 0.010612471029162407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.306235379975988e-06, + "grad_norm": 14.887345314025879, + "learning_rate": 5.125052988554473e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8294079303741455, + "num_tokens": 46120482.0, + "step": 1210 + }, + { + "epoch": 0.15405164737310775, + "ewc_loss": 0.010605518706142902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3027592912258115e-06, + "grad_norm": 14.796713829040527, + "learning_rate": 5.12929207291225e-07, + "loss": 0.5016, + "mean_token_accuracy": 0.8381830453872681, + "num_tokens": 
46154478.0, + "step": 1211 + }, + { + "epoch": 0.15417885765169825, + "ewc_loss": 0.010598614811897278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.299307304085232e-06, + "grad_norm": 14.853104591369629, + "learning_rate": 5.133531157270029e-07, + "loss": 0.4773, + "mean_token_accuracy": 0.8478359580039978, + "num_tokens": 46195012.0, + "step": 1212 + }, + { + "epoch": 0.15430606793028875, + "ewc_loss": 0.010641472414135933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.320736363501055e-06, + "grad_norm": 14.862083435058594, + "learning_rate": 5.137770241627808e-07, + "loss": 0.4407, + "mean_token_accuracy": 0.8587251305580139, + "num_tokens": 46230641.0, + "step": 1213 + }, + { + "epoch": 0.15443327820887928, + "ewc_loss": 0.01063244603574276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.316222996043507e-06, + "grad_norm": 14.923515319824219, + "learning_rate": 5.142009325985587e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8478903770446777, + "num_tokens": 46268993.0, + "step": 1214 + }, + { + "epoch": 0.15456048848746978, + "ewc_loss": 0.010665223933756351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.332612090569455e-06, + "grad_norm": 14.905550956726074, + "learning_rate": 5.146248410343365e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8376989364624023, + "num_tokens": 46309310.0, + "step": 1215 + }, + { + "epoch": 0.1546876987660603, + "ewc_loss": 0.010656501166522503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.328250608727103e-06, + "grad_norm": 14.893601417541504, + "learning_rate": 5.150487494701145e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8543679118156433, + "num_tokens": 46352415.0, + "step": 1216 + }, + { + "epoch": 0.1548149090446508, + "ewc_loss": 0.010663923807442188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3319618018576875e-06, + "grad_norm": 14.937867164611816, + "learning_rate": 5.154726579058923e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8359035849571228, + "num_tokens": 46387838.0, + "step": 
1217 + }, + { + "epoch": 0.1549421193232413, + "ewc_loss": 0.010676514357328415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.33825732418336e-06, + "grad_norm": 14.897933006286621, + "learning_rate": 5.158965663416703e-07, + "loss": 0.5079, + "mean_token_accuracy": 0.8403131365776062, + "num_tokens": 46432669.0, + "step": 1218 + }, + { + "epoch": 0.15506932960183184, + "ewc_loss": 0.010687675327062607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3438375289260875e-06, + "grad_norm": 14.90680980682373, + "learning_rate": 5.16320474777448e-07, + "loss": 0.4625, + "mean_token_accuracy": 0.851506769657135, + "num_tokens": 46474383.0, + "step": 1219 + }, + { + "epoch": 0.15519653988042234, + "ewc_loss": 0.010706428438425064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.3532144193013664e-06, + "grad_norm": 14.93045711517334, + "learning_rate": 5.167443832132259e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8373541235923767, + "num_tokens": 46514263.0, + "step": 1220 + }, + { + "epoch": 0.15532375015901284, + "ewc_loss": 0.010695560835301876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.347780188458273e-06, + "grad_norm": 14.955421447753906, + "learning_rate": 5.171682916490038e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.847128689289093, + "num_tokens": 46549835.0, + "step": 1221 + }, + { + "epoch": 0.15545096043760337, + "ewc_loss": 0.010716646909713745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.358323505788576e-06, + "grad_norm": 14.977736473083496, + "learning_rate": 5.175922000847816e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.8519805073738098, + "num_tokens": 46584811.0, + "step": 1222 + }, + { + "epoch": 0.15557817071619387, + "ewc_loss": 0.0107074910774827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.353745564207202e-06, + "grad_norm": 14.985089302062988, + "learning_rate": 5.180161085205595e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8447859883308411, + "num_tokens": 46621441.0, + "step": 1223 + }, + { + "epoch": 
0.15570538099478437, + "ewc_loss": 0.010731982998549938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.365991455619223e-06, + "grad_norm": 14.888891220092773, + "learning_rate": 5.184400169563374e-07, + "loss": 0.5436, + "mean_token_accuracy": 0.8278319835662842, + "num_tokens": 46661572.0, + "step": 1224 + }, + { + "epoch": 0.1558325912733749, + "ewc_loss": 0.010739530436694622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.36976540388423e-06, + "grad_norm": 15.007939338684082, + "learning_rate": 5.188639253921153e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8344486951828003, + "num_tokens": 46702157.0, + "step": 1225 + }, + { + "epoch": 0.1559598015519654, + "ewc_loss": 0.010745709761977196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.372854957386153e-06, + "grad_norm": 14.917242050170898, + "learning_rate": 5.192878338278932e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8427226543426514, + "num_tokens": 46739119.0, + "step": 1226 + }, + { + "epoch": 0.1560870118305559, + "ewc_loss": 0.010727109387516975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.363554919313174e-06, + "grad_norm": 14.909449577331543, + "learning_rate": 5.19711742263671e-07, + "loss": 0.5531, + "mean_token_accuracy": 0.8252406716346741, + "num_tokens": 46783990.0, + "step": 1227 + }, + { + "epoch": 0.15621422210914643, + "ewc_loss": 0.010761052370071411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.380526090448257e-06, + "grad_norm": 14.941069602966309, + "learning_rate": 5.201356506994488e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.839043140411377, + "num_tokens": 46816131.0, + "step": 1228 + }, + { + "epoch": 0.15634143238773693, + "ewc_loss": 0.010798090137541294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.399045221565757e-06, + "grad_norm": 15.047182083129883, + "learning_rate": 5.205595591352268e-07, + "loss": 0.5758, + "mean_token_accuracy": 0.819778561592102, + "num_tokens": 46856453.0, + "step": 1229 + }, + { + "epoch": 0.15646864266632743, + 
"ewc_loss": 0.010778422467410564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.389211310102837e-06, + "grad_norm": 14.895393371582031, + "learning_rate": 5.209834675710046e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8363879919052124, + "num_tokens": 46897961.0, + "step": 1230 + }, + { + "epoch": 0.15659585294491796, + "ewc_loss": 0.01075804140418768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.379020876716822e-06, + "grad_norm": 15.016952514648438, + "learning_rate": 5.214073760067825e-07, + "loss": 0.5891, + "mean_token_accuracy": 0.8159976005554199, + "num_tokens": 46931961.0, + "step": 1231 + }, + { + "epoch": 0.15672306322350846, + "ewc_loss": 0.010823049582540989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.411524853116134e-06, + "grad_norm": 15.029881477355957, + "learning_rate": 5.218312844425604e-07, + "loss": 0.5089, + "mean_token_accuracy": 0.8411198258399963, + "num_tokens": 46967822.0, + "step": 1232 + }, + { + "epoch": 0.15685027350209896, + "ewc_loss": 0.010811097919940948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.405549018178135e-06, + "grad_norm": 14.946722984313965, + "learning_rate": 5.222551928783383e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.8445045948028564, + "num_tokens": 47004511.0, + "step": 1233 + }, + { + "epoch": 0.1569774837806895, + "ewc_loss": 0.010845777578651905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.422888989414787e-06, + "grad_norm": 15.015583038330078, + "learning_rate": 5.226791013141161e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8315447568893433, + "num_tokens": 47047244.0, + "step": 1234 + }, + { + "epoch": 0.15710469405928, + "ewc_loss": 0.010837347246706486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.418673481472069e-06, + "grad_norm": 14.98218822479248, + "learning_rate": 5.23103009749894e-07, + "loss": 0.5006, + "mean_token_accuracy": 0.8409584760665894, + "num_tokens": 47088206.0, + "step": 1235 + }, + { + "epoch": 0.1572319043378705, + "ewc_loss": 
0.010868174023926258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.434087142930366e-06, + "grad_norm": 15.045694351196289, + "learning_rate": 5.235269181856718e-07, + "loss": 0.5086, + "mean_token_accuracy": 0.8373300433158875, + "num_tokens": 47132733.0, + "step": 1236 + }, + { + "epoch": 0.15735911461646102, + "ewc_loss": 0.010857999324798584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.428999884315999e-06, + "grad_norm": 14.973329544067383, + "learning_rate": 5.239508266214498e-07, + "loss": 0.5676, + "mean_token_accuracy": 0.8178125023841858, + "num_tokens": 47166840.0, + "step": 1237 + }, + { + "epoch": 0.15748632489505152, + "ewc_loss": 0.010854470543563366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.42723546459456e-06, + "grad_norm": 14.961944580078125, + "learning_rate": 5.243747350572276e-07, + "loss": 0.5261, + "mean_token_accuracy": 0.8345366716384888, + "num_tokens": 47203444.0, + "step": 1238 + }, + { + "epoch": 0.15761353517364202, + "ewc_loss": 0.01087122317403555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4356114560505375e-06, + "grad_norm": 14.928236961364746, + "learning_rate": 5.247986434930056e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8378883004188538, + "num_tokens": 47240380.0, + "step": 1239 + }, + { + "epoch": 0.15774074545223254, + "ewc_loss": 0.010898934677243233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.449467153084697e-06, + "grad_norm": 15.004178047180176, + "learning_rate": 5.252225519287834e-07, + "loss": 0.5811, + "mean_token_accuracy": 0.817284107208252, + "num_tokens": 47277886.0, + "step": 1240 + }, + { + "epoch": 0.15786795573082305, + "ewc_loss": 0.01094429288059473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.472146312968107e-06, + "grad_norm": 15.018553733825684, + "learning_rate": 5.256464603645613e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8428455591201782, + "num_tokens": 47315261.0, + "step": 1241 + }, + { + "epoch": 0.15799516600941357, + "ewc_loss": 0.010935315862298012, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4676579566148575e-06, + "grad_norm": 14.984579086303711, + "learning_rate": 5.260703688003391e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8398182392120361, + "num_tokens": 47357489.0, + "step": 1242 + }, + { + "epoch": 0.15812237628800407, + "ewc_loss": 0.010944495908915997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.4722481763747055e-06, + "grad_norm": 15.027527809143066, + "learning_rate": 5.26494277236117e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8342618346214294, + "num_tokens": 47389851.0, + "step": 1243 + }, + { + "epoch": 0.15824958656659457, + "ewc_loss": 0.010943638160824776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.471818894875469e-06, + "grad_norm": 14.960264205932617, + "learning_rate": 5.269181856718948e-07, + "loss": 0.5502, + "mean_token_accuracy": 0.824467658996582, + "num_tokens": 47427491.0, + "step": 1244 + }, + { + "epoch": 0.1583767968451851, + "ewc_loss": 0.01096253376454115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.481266725837486e-06, + "grad_norm": 15.008622169494629, + "learning_rate": 5.273420941076727e-07, + "loss": 0.4897, + "mean_token_accuracy": 0.8456329107284546, + "num_tokens": 47462790.0, + "step": 1245 + }, + { + "epoch": 0.1585040071237756, + "ewc_loss": 0.010985031723976135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.492515811056364e-06, + "grad_norm": 15.01854419708252, + "learning_rate": 5.277660025434506e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8546907901763916, + "num_tokens": 47502240.0, + "step": 1246 + }, + { + "epoch": 0.1586312174023661, + "ewc_loss": 0.01097785122692585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.488925580721116e-06, + "grad_norm": 14.978219032287598, + "learning_rate": 5.281899109792285e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8478303551673889, + "num_tokens": 47542493.0, + "step": 1247 + }, + { + "epoch": 0.15875842768095663, + "ewc_loss": 0.010978704318404198, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 5.489352133736247e-06, + "grad_norm": 15.004693031311035, + "learning_rate": 5.286138194150064e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8391655683517456, + "num_tokens": 47583378.0, + "step": 1248 + }, + { + "epoch": 0.15888563795954713, + "ewc_loss": 0.011029074899852276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.514537406270392e-06, + "grad_norm": 15.009060859680176, + "learning_rate": 5.290377278507841e-07, + "loss": 0.5387, + "mean_token_accuracy": 0.8345550298690796, + "num_tokens": 47622671.0, + "step": 1249 + }, + { + "epoch": 0.15901284823813763, + "ewc_loss": 0.011029507964849472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.514753866009414e-06, + "grad_norm": 15.103785514831543, + "learning_rate": 5.294616362865621e-07, + "loss": 0.4987, + "mean_token_accuracy": 0.8454298973083496, + "num_tokens": 47663774.0, + "step": 1250 + }, + { + "epoch": 0.15914005851672816, + "ewc_loss": 0.011061503551900387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5307518778136e-06, + "grad_norm": 15.0338716506958, + "learning_rate": 5.298855447223399e-07, + "loss": 0.5701, + "mean_token_accuracy": 0.8290306329727173, + "num_tokens": 47706143.0, + "step": 1251 + }, + { + "epoch": 0.15926726879531866, + "ewc_loss": 0.011019541881978512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5097707445384e-06, + "grad_norm": 15.092545509338379, + "learning_rate": 5.303094531581178e-07, + "loss": 0.5321, + "mean_token_accuracy": 0.832917332649231, + "num_tokens": 47741197.0, + "step": 1252 + }, + { + "epoch": 0.15939447907390916, + "ewc_loss": 0.011067927815020084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.533963758352911e-06, + "grad_norm": 15.084270477294922, + "learning_rate": 5.307333615938957e-07, + "loss": 0.4545, + "mean_token_accuracy": 0.8544579744338989, + "num_tokens": 47779306.0, + "step": 1253 + }, + { + "epoch": 0.1595216893524997, + "ewc_loss": 0.011037376709282398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
5.518688340089284e-06, + "grad_norm": 15.149225234985352, + "learning_rate": 5.311572700296736e-07, + "loss": 0.54, + "mean_token_accuracy": 0.8306338787078857, + "num_tokens": 47811056.0, + "step": 1254 + }, + { + "epoch": 0.1596488996310902, + "ewc_loss": 0.011081727221608162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.540863639907911e-06, + "grad_norm": 15.106208801269531, + "learning_rate": 5.315811784654515e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8526498675346375, + "num_tokens": 47847075.0, + "step": 1255 + }, + { + "epoch": 0.1597761099096807, + "ewc_loss": 0.011053810827434063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.526905624719802e-06, + "grad_norm": 15.134535789489746, + "learning_rate": 5.320050869012294e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8413568139076233, + "num_tokens": 47892639.0, + "step": 1256 + }, + { + "epoch": 0.15990332018827122, + "ewc_loss": 0.011087507009506226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.543753559322795e-06, + "grad_norm": 15.033618927001953, + "learning_rate": 5.324289953370071e-07, + "loss": 0.5631, + "mean_token_accuracy": 0.8241999745368958, + "num_tokens": 47933522.0, + "step": 1257 + }, + { + "epoch": 0.16003053046686172, + "ewc_loss": 0.011052820831537247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.526410404854687e-06, + "grad_norm": 15.101261138916016, + "learning_rate": 5.328529037727851e-07, + "loss": 0.4994, + "mean_token_accuracy": 0.8405803442001343, + "num_tokens": 47977307.0, + "step": 1258 + }, + { + "epoch": 0.16015774074545222, + "ewc_loss": 0.011131771840155125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.565886112890439e-06, + "grad_norm": 15.137799263000488, + "learning_rate": 5.332768122085629e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8414750099182129, + "num_tokens": 48015129.0, + "step": 1259 + }, + { + "epoch": 0.16028495102404275, + "ewc_loss": 0.011077748611569405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.538874120247783e-06, 
+ "grad_norm": 15.09382438659668, + "learning_rate": 5.337007206443408e-07, + "loss": 0.5318, + "mean_token_accuracy": 0.828643798828125, + "num_tokens": 48051904.0, + "step": 1260 + }, + { + "epoch": 0.16041216130263325, + "ewc_loss": 0.011107826605439186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.5539135246363e-06, + "grad_norm": 15.158279418945312, + "learning_rate": 5.341246290801187e-07, + "loss": 0.4928, + "mean_token_accuracy": 0.8462687134742737, + "num_tokens": 48094001.0, + "step": 1261 + }, + { + "epoch": 0.16053937158122375, + "ewc_loss": 0.011122857220470905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.56142867935705e-06, + "grad_norm": 15.152524948120117, + "learning_rate": 5.345485375158966e-07, + "loss": 0.4848, + "mean_token_accuracy": 0.8433158993721008, + "num_tokens": 48134124.0, + "step": 1262 + }, + { + "epoch": 0.16066658185981428, + "ewc_loss": 0.011111760511994362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.555880306928884e-06, + "grad_norm": 15.084639549255371, + "learning_rate": 5.349724459516745e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8471415042877197, + "num_tokens": 48170730.0, + "step": 1263 + }, + { + "epoch": 0.16079379213840478, + "ewc_loss": 0.011127714067697525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.563857030210784e-06, + "grad_norm": 15.2337064743042, + "learning_rate": 5.353963543874522e-07, + "loss": 0.5467, + "mean_token_accuracy": 0.8291091322898865, + "num_tokens": 48212640.0, + "step": 1264 + }, + { + "epoch": 0.1609210024169953, + "ewc_loss": 0.011153174564242363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.576587227551499e-06, + "grad_norm": 15.140594482421875, + "learning_rate": 5.358202628232301e-07, + "loss": 0.526, + "mean_token_accuracy": 0.8344765305519104, + "num_tokens": 48243136.0, + "step": 1265 + }, + { + "epoch": 0.1610482126955858, + "ewc_loss": 0.011126911267638206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.563455488299951e-06, + "grad_norm": 
15.159753799438477, + "learning_rate": 5.36244171259008e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8405811786651611, + "num_tokens": 48282905.0, + "step": 1266 + }, + { + "epoch": 0.1611754229741763, + "ewc_loss": 0.011182993650436401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.591497028945014e-06, + "grad_norm": 15.272285461425781, + "learning_rate": 5.366680796947859e-07, + "loss": 0.5409, + "mean_token_accuracy": 0.8305063247680664, + "num_tokens": 48323325.0, + "step": 1267 + }, + { + "epoch": 0.16130263325276684, + "ewc_loss": 0.011184128932654858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.59206455363892e-06, + "grad_norm": 15.166367530822754, + "learning_rate": 5.370919881305637e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8391008377075195, + "num_tokens": 48358400.0, + "step": 1268 + }, + { + "epoch": 0.16142984353135734, + "ewc_loss": 0.01116303913295269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.581519417319214e-06, + "grad_norm": 15.20211410522461, + "learning_rate": 5.375158965663417e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8371560573577881, + "num_tokens": 48393018.0, + "step": 1269 + }, + { + "epoch": 0.16155705380994784, + "ewc_loss": 0.011190515011548996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.595257334789494e-06, + "grad_norm": 15.209942817687988, + "learning_rate": 5.379398050021195e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8393207788467407, + "num_tokens": 48432952.0, + "step": 1270 + }, + { + "epoch": 0.16168426408853837, + "ewc_loss": 0.0111763384193182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.588169187831227e-06, + "grad_norm": 15.237288475036621, + "learning_rate": 5.383637134378975e-07, + "loss": 0.4973, + "mean_token_accuracy": 0.8393450975418091, + "num_tokens": 48466913.0, + "step": 1271 + }, + { + "epoch": 0.16181147436712887, + "ewc_loss": 0.011203163303434849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.601581506198272e-06, + "grad_norm": 15.154584884643555, + 
"learning_rate": 5.387876218736752e-07, + "loss": 0.5768, + "mean_token_accuracy": 0.8151583671569824, + "num_tokens": 48512658.0, + "step": 1272 + }, + { + "epoch": 0.16193868464571937, + "ewc_loss": 0.011202226392924786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.601113116426859e-06, + "grad_norm": 15.225539207458496, + "learning_rate": 5.392115303094531e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.8480966091156006, + "num_tokens": 48552283.0, + "step": 1273 + }, + { + "epoch": 0.1620658949243099, + "ewc_loss": 0.01124593336135149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.622966455121059e-06, + "grad_norm": 15.213896751403809, + "learning_rate": 5.39635438745231e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8556152582168579, + "num_tokens": 48590504.0, + "step": 1274 + }, + { + "epoch": 0.1621931052029004, + "ewc_loss": 0.011226657778024673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.613328994513722e-06, + "grad_norm": 15.189828872680664, + "learning_rate": 5.400593471810089e-07, + "loss": 0.5416, + "mean_token_accuracy": 0.8297262191772461, + "num_tokens": 48629993.0, + "step": 1275 + }, + { + "epoch": 0.1623203154814909, + "ewc_loss": 0.01125610526651144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.628052804240724e-06, + "grad_norm": 15.328093528747559, + "learning_rate": 5.404832556167867e-07, + "loss": 0.5457, + "mean_token_accuracy": 0.8284871578216553, + "num_tokens": 48665074.0, + "step": 1276 + }, + { + "epoch": 0.16244752576008142, + "ewc_loss": 0.011272834613919258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.63641742701293e-06, + "grad_norm": 15.153083801269531, + "learning_rate": 5.409071640525647e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8442843556404114, + "num_tokens": 48705524.0, + "step": 1277 + }, + { + "epoch": 0.16257473603867192, + "ewc_loss": 0.011249318718910217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.624659479508409e-06, + "grad_norm": 15.224444389343262, + "learning_rate": 
5.413310724883425e-07, + "loss": 0.5494, + "mean_token_accuracy": 0.8182342052459717, + "num_tokens": 48744030.0, + "step": 1278 + }, + { + "epoch": 0.16270194631726242, + "ewc_loss": 0.011297008953988552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.648504611599492e-06, + "grad_norm": 15.175599098205566, + "learning_rate": 5.417549809241205e-07, + "loss": 0.5417, + "mean_token_accuracy": 0.8331141471862793, + "num_tokens": 48788401.0, + "step": 1279 + }, + { + "epoch": 0.16282915659585295, + "ewc_loss": 0.011301644146442413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.650822004099609e-06, + "grad_norm": 15.347881317138672, + "learning_rate": 5.421788893598982e-07, + "loss": 0.5443, + "mean_token_accuracy": 0.8288081884384155, + "num_tokens": 48823046.0, + "step": 1280 + }, + { + "epoch": 0.16295636687444345, + "ewc_loss": 0.011340772733092308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.670386144629447e-06, + "grad_norm": 15.268220901489258, + "learning_rate": 5.42602797795676e-07, + "loss": 0.55, + "mean_token_accuracy": 0.8251557946205139, + "num_tokens": 48868293.0, + "step": 1281 + }, + { + "epoch": 0.16308357715303395, + "ewc_loss": 0.011311078444123268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.655539098370355e-06, + "grad_norm": 15.232378005981445, + "learning_rate": 5.43026706231454e-07, + "loss": 0.5779, + "mean_token_accuracy": 0.8185622096061707, + "num_tokens": 48908775.0, + "step": 1282 + }, + { + "epoch": 0.16321078743162448, + "ewc_loss": 0.011341355741024017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.670677637681365e-06, + "grad_norm": 15.292078971862793, + "learning_rate": 5.434506146672319e-07, + "loss": 0.5556, + "mean_token_accuracy": 0.8248813152313232, + "num_tokens": 48957640.0, + "step": 1283 + }, + { + "epoch": 0.16333799771021498, + "ewc_loss": 0.011335887014865875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.667943696607836e-06, + "grad_norm": 15.284438133239746, + "learning_rate": 5.438745231030097e-07, 
+ "loss": 0.5459, + "mean_token_accuracy": 0.8254954814910889, + "num_tokens": 48993601.0, + "step": 1284 + }, + { + "epoch": 0.16346520798880548, + "ewc_loss": 0.011353347450494766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.676673936250154e-06, + "grad_norm": 15.32648754119873, + "learning_rate": 5.442984315387876e-07, + "loss": 0.4541, + "mean_token_accuracy": 0.8556290864944458, + "num_tokens": 49028411.0, + "step": 1285 + }, + { + "epoch": 0.163592418267396, + "ewc_loss": 0.01133719552308321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.66859762329841e-06, + "grad_norm": 15.313230514526367, + "learning_rate": 5.447223399745655e-07, + "loss": 0.5164, + "mean_token_accuracy": 0.8389966487884521, + "num_tokens": 49067449.0, + "step": 1286 + }, + { + "epoch": 0.1637196285459865, + "ewc_loss": 0.011362921446561813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.681460606865585e-06, + "grad_norm": 15.265920639038086, + "learning_rate": 5.451462484103433e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.8450338840484619, + "num_tokens": 49102650.0, + "step": 1287 + }, + { + "epoch": 0.163846838824577, + "ewc_loss": 0.011363070458173752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.68153518543113e-06, + "grad_norm": 15.294539451599121, + "learning_rate": 5.455701568461212e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8344357013702393, + "num_tokens": 49137110.0, + "step": 1288 + }, + { + "epoch": 0.16397404910316754, + "ewc_loss": 0.011376295238733292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.68814766666037e-06, + "grad_norm": 15.272340774536133, + "learning_rate": 5.45994065281899e-07, + "loss": 0.5886, + "mean_token_accuracy": 0.8131082057952881, + "num_tokens": 49177765.0, + "step": 1289 + }, + { + "epoch": 0.16410125938175804, + "ewc_loss": 0.01139145065099001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.6957251217681915e-06, + "grad_norm": 15.36406421661377, + "learning_rate": 5.46417973717677e-07, + "loss": 0.5284, + 
"mean_token_accuracy": 0.8296920657157898, + "num_tokens": 49207860.0, + "step": 1290 + }, + { + "epoch": 0.16422846966034857, + "ewc_loss": 0.01140598300844431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.702991529688006e-06, + "grad_norm": 15.24080753326416, + "learning_rate": 5.468418821534548e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.8530628681182861, + "num_tokens": 49239994.0, + "step": 1291 + }, + { + "epoch": 0.16435567993893907, + "ewc_loss": 0.011405415832996368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.702707767341053e-06, + "grad_norm": 15.318723678588867, + "learning_rate": 5.472657905892327e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8507050275802612, + "num_tokens": 49273464.0, + "step": 1292 + }, + { + "epoch": 0.16448289021752957, + "ewc_loss": 0.01146619487553835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.733097623306094e-06, + "grad_norm": 15.351716041564941, + "learning_rate": 5.476896990250106e-07, + "loss": 0.5901, + "mean_token_accuracy": 0.8157523274421692, + "num_tokens": 49313144.0, + "step": 1293 + }, + { + "epoch": 0.1646101004961201, + "ewc_loss": 0.01146110612899065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.730552857130533e-06, + "grad_norm": 15.292320251464844, + "learning_rate": 5.481136074607885e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8438786268234253, + "num_tokens": 49351392.0, + "step": 1294 + }, + { + "epoch": 0.1647373107747106, + "ewc_loss": 0.011451899074018002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.725949449697509e-06, + "grad_norm": 15.26054573059082, + "learning_rate": 5.485375158965663e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8486469984054565, + "num_tokens": 49396726.0, + "step": 1295 + }, + { + "epoch": 0.1648645210533011, + "ewc_loss": 0.011471481993794441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.735741069656797e-06, + "grad_norm": 15.28789234161377, + "learning_rate": 5.489614243323442e-07, + "loss": 0.5104, + "mean_token_accuracy": 
0.8368774652481079, + "num_tokens": 49435856.0, + "step": 1296 + }, + { + "epoch": 0.16499173133189163, + "ewc_loss": 0.011502888053655624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.751443950430257e-06, + "grad_norm": 15.313902854919434, + "learning_rate": 5.49385332768122e-07, + "loss": 0.451, + "mean_token_accuracy": 0.8540650606155396, + "num_tokens": 49473695.0, + "step": 1297 + }, + { + "epoch": 0.16511894161048213, + "ewc_loss": 0.011494950391352177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.747475370299071e-06, + "grad_norm": 15.378215789794922, + "learning_rate": 5.498092412039e-07, + "loss": 0.5788, + "mean_token_accuracy": 0.8216226100921631, + "num_tokens": 49509165.0, + "step": 1298 + }, + { + "epoch": 0.16524615188907263, + "ewc_loss": 0.011520304717123508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.760152362199733e-06, + "grad_norm": 15.341522216796875, + "learning_rate": 5.502331496396778e-07, + "loss": 0.4829, + "mean_token_accuracy": 0.8450844883918762, + "num_tokens": 49553790.0, + "step": 1299 + }, + { + "epoch": 0.16537336216766316, + "ewc_loss": 0.01152021437883377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.760107342211995e-06, + "grad_norm": 15.363103866577148, + "learning_rate": 5.506570580754557e-07, + "loss": 0.5054, + "mean_token_accuracy": 0.8400691747665405, + "num_tokens": 49593124.0, + "step": 1300 + }, + { + "epoch": 0.16550057244625366, + "ewc_loss": 0.01154803391546011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7740171541809104e-06, + "grad_norm": 15.404321670532227, + "learning_rate": 5.510809665112336e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8454455137252808, + "num_tokens": 49632016.0, + "step": 1301 + }, + { + "epoch": 0.16562778272484416, + "ewc_loss": 0.011545640416443348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.772820259153377e-06, + "grad_norm": 15.364822387695312, + "learning_rate": 5.515048749470113e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8393670320510864, + 
"num_tokens": 49670689.0, + "step": 1302 + }, + { + "epoch": 0.1657549930034347, + "ewc_loss": 0.011541057378053665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.77052878725226e-06, + "grad_norm": 15.498571395874023, + "learning_rate": 5.519287833827893e-07, + "loss": 0.4938, + "mean_token_accuracy": 0.8419557809829712, + "num_tokens": 49709230.0, + "step": 1303 + }, + { + "epoch": 0.1658822032820252, + "ewc_loss": 0.011580894701182842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.790447175968438e-06, + "grad_norm": 15.383493423461914, + "learning_rate": 5.523526918185671e-07, + "loss": 0.5517, + "mean_token_accuracy": 0.8242837190628052, + "num_tokens": 49741648.0, + "step": 1304 + }, + { + "epoch": 0.1660094135606157, + "ewc_loss": 0.011561323888599873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.7806619224720635e-06, + "grad_norm": 15.3517484664917, + "learning_rate": 5.52776600254345e-07, + "loss": 0.4691, + "mean_token_accuracy": 0.8469798564910889, + "num_tokens": 49773906.0, + "step": 1305 + }, + { + "epoch": 0.16613662383920622, + "ewc_loss": 0.011615168303251266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.807584329886595e-06, + "grad_norm": 15.397011756896973, + "learning_rate": 5.532005086901229e-07, + "loss": 0.5265, + "mean_token_accuracy": 0.8374612331390381, + "num_tokens": 49812645.0, + "step": 1306 + }, + { + "epoch": 0.16626383411779672, + "ewc_loss": 0.011624198406934738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.812099061586196e-06, + "grad_norm": 15.37232780456543, + "learning_rate": 5.536244171259008e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8435163497924805, + "num_tokens": 49854500.0, + "step": 1307 + }, + { + "epoch": 0.16639104439638722, + "ewc_loss": 0.011605738662183285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.802869509352604e-06, + "grad_norm": 15.39014720916748, + "learning_rate": 5.540483255616786e-07, + "loss": 0.5902, + "mean_token_accuracy": 0.8158482313156128, + "num_tokens": 49889408.0, + 
"step": 1308 + }, + { + "epoch": 0.16651825467497774, + "ewc_loss": 0.0116573516279459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.82867596676806e-06, + "grad_norm": 15.437967300415039, + "learning_rate": 5.544722339974566e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8384370803833008, + "num_tokens": 49923581.0, + "step": 1309 + }, + { + "epoch": 0.16664546495356825, + "ewc_loss": 0.011650861240923405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.825430434924783e-06, + "grad_norm": 15.368978500366211, + "learning_rate": 5.548961424332343e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.837151050567627, + "num_tokens": 49959433.0, + "step": 1310 + }, + { + "epoch": 0.16677267523215875, + "ewc_loss": 0.011650849133729935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.825424523209222e-06, + "grad_norm": 15.444437026977539, + "learning_rate": 5.553200508690123e-07, + "loss": 0.4704, + "mean_token_accuracy": 0.8515444397926331, + "num_tokens": 50000307.0, + "step": 1311 + }, + { + "epoch": 0.16689988551074927, + "ewc_loss": 0.011709376238286495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8546879699861165e-06, + "grad_norm": 15.435759544372559, + "learning_rate": 5.557439593047901e-07, + "loss": 0.5647, + "mean_token_accuracy": 0.8197736740112305, + "num_tokens": 50035587.0, + "step": 1312 + }, + { + "epoch": 0.16702709578933977, + "ewc_loss": 0.011683535762131214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.84176768825273e-06, + "grad_norm": 15.450504302978516, + "learning_rate": 5.56167867740568e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8418508768081665, + "num_tokens": 50079425.0, + "step": 1313 + }, + { + "epoch": 0.16715430606793028, + "ewc_loss": 0.011713124811649323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8565624385664705e-06, + "grad_norm": 15.460845947265625, + "learning_rate": 5.565917761763459e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8468345999717712, + "num_tokens": 50110674.0, + "step": 1314 + }, + { + 
"epoch": 0.1672815163465208, + "ewc_loss": 0.011691045016050339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.845522537129e-06, + "grad_norm": 15.408609390258789, + "learning_rate": 5.570156846121238e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8324893712997437, + "num_tokens": 50149033.0, + "step": 1315 + }, + { + "epoch": 0.1674087266251113, + "ewc_loss": 0.011723565869033337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.861782938154647e-06, + "grad_norm": 15.476638793945312, + "learning_rate": 5.574395930479016e-07, + "loss": 0.5427, + "mean_token_accuracy": 0.8292851448059082, + "num_tokens": 50185728.0, + "step": 1316 + }, + { + "epoch": 0.16753593690370183, + "ewc_loss": 0.011723479256033897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.861739737156313e-06, + "grad_norm": 15.459149360656738, + "learning_rate": 5.578635014836796e-07, + "loss": 0.493, + "mean_token_accuracy": 0.843727707862854, + "num_tokens": 50219671.0, + "step": 1317 + }, + { + "epoch": 0.16766314718229233, + "ewc_loss": 0.011739592999219894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.869796495971968e-06, + "grad_norm": 15.533880233764648, + "learning_rate": 5.582874099194573e-07, + "loss": 0.5559, + "mean_token_accuracy": 0.8280340433120728, + "num_tokens": 50251989.0, + "step": 1318 + }, + { + "epoch": 0.16779035746088283, + "ewc_loss": 0.011744268238544464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.872134352102876e-06, + "grad_norm": 15.474828720092773, + "learning_rate": 5.587113183552353e-07, + "loss": 0.5385, + "mean_token_accuracy": 0.8307807445526123, + "num_tokens": 50291079.0, + "step": 1319 + }, + { + "epoch": 0.16791756773947336, + "ewc_loss": 0.01174231432378292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.871157100045821e-06, + "grad_norm": 15.458295822143555, + "learning_rate": 5.591352267910131e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8442414402961731, + "num_tokens": 50332262.0, + "step": 1320 + }, + { + "epoch": 
0.16804477801806386, + "ewc_loss": 0.011757745407521725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.878872798348311e-06, + "grad_norm": 15.486876487731934, + "learning_rate": 5.59559135226791e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8374708294868469, + "num_tokens": 50371278.0, + "step": 1321 + }, + { + "epoch": 0.16817198829665436, + "ewc_loss": 0.011785793118178844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.892896751902299e-06, + "grad_norm": 15.531595230102539, + "learning_rate": 5.599830436625689e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8401118516921997, + "num_tokens": 50409463.0, + "step": 1322 + }, + { + "epoch": 0.1682991985752449, + "ewc_loss": 0.011771366000175476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.885683094675187e-06, + "grad_norm": 15.511087417602539, + "learning_rate": 5.604069520983468e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8432341814041138, + "num_tokens": 50446745.0, + "step": 1323 + }, + { + "epoch": 0.1684264088538354, + "ewc_loss": 0.011748041026294231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.874020644114353e-06, + "grad_norm": 15.464523315429688, + "learning_rate": 5.608308605341246e-07, + "loss": 0.4861, + "mean_token_accuracy": 0.8440612554550171, + "num_tokens": 50479725.0, + "step": 1324 + }, + { + "epoch": 0.1685536191324259, + "ewc_loss": 0.011815052479505539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9075264289276674e-06, + "grad_norm": 15.664867401123047, + "learning_rate": 5.612547689699024e-07, + "loss": 0.5502, + "mean_token_accuracy": 0.8264821171760559, + "num_tokens": 50514734.0, + "step": 1325 + }, + { + "epoch": 0.16868082941101642, + "ewc_loss": 0.011804268695414066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.902134489588207e-06, + "grad_norm": 15.446598052978516, + "learning_rate": 5.616786774056803e-07, + "loss": 0.5282, + "mean_token_accuracy": 0.8351759910583496, + "num_tokens": 50555165.0, + "step": 1326 + }, + { + "epoch": 0.16880803968960692, + 
"ewc_loss": 0.011779412627220154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.8897062444884796e-06, + "grad_norm": 15.544933319091797, + "learning_rate": 5.621025858414582e-07, + "loss": 0.5296, + "mean_token_accuracy": 0.8319774866104126, + "num_tokens": 50598251.0, + "step": 1327 + }, + { + "epoch": 0.16893524996819742, + "ewc_loss": 0.011804573237895966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.902286829950754e-06, + "grad_norm": 15.511157989501953, + "learning_rate": 5.625264942772361e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.8374611735343933, + "num_tokens": 50638423.0, + "step": 1328 + }, + { + "epoch": 0.16906246024678795, + "ewc_loss": 0.011808691546320915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9043459259555675e-06, + "grad_norm": 15.494392395019531, + "learning_rate": 5.629504027130139e-07, + "loss": 0.5811, + "mean_token_accuracy": 0.8261763453483582, + "num_tokens": 50678226.0, + "step": 1329 + }, + { + "epoch": 0.16918967052537845, + "ewc_loss": 0.01182242576032877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.911213065701304e-06, + "grad_norm": 15.622964859008789, + "learning_rate": 5.633743111487919e-07, + "loss": 0.5804, + "mean_token_accuracy": 0.8142658472061157, + "num_tokens": 50717542.0, + "step": 1330 + }, + { + "epoch": 0.16931688080396895, + "ewc_loss": 0.011852211318910122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.926105586695485e-06, + "grad_norm": 15.545879364013672, + "learning_rate": 5.637982195845697e-07, + "loss": 0.46, + "mean_token_accuracy": 0.8535477519035339, + "num_tokens": 50752914.0, + "step": 1331 + }, + { + "epoch": 0.16944409108255948, + "ewc_loss": 0.011820108629763126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.910054369451245e-06, + "grad_norm": 15.60257339477539, + "learning_rate": 5.642221280203476e-07, + "loss": 0.4585, + "mean_token_accuracy": 0.8553327322006226, + "num_tokens": 50789276.0, + "step": 1332 + }, + { + "epoch": 0.16957130136114998, + "ewc_loss": 
0.011852468363940716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.926234280195786e-06, + "grad_norm": 15.595565795898438, + "learning_rate": 5.646460364561254e-07, + "loss": 0.4801, + "mean_token_accuracy": 0.8452187776565552, + "num_tokens": 50820759.0, + "step": 1333 + }, + { + "epoch": 0.16969851163974048, + "ewc_loss": 0.011823119595646858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9115595831826795e-06, + "grad_norm": 15.549153327941895, + "learning_rate": 5.650699448919033e-07, + "loss": 0.5706, + "mean_token_accuracy": 0.8214854001998901, + "num_tokens": 50863241.0, + "step": 1334 + }, + { + "epoch": 0.169825721918331, + "ewc_loss": 0.011847415007650852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9237077039142605e-06, + "grad_norm": 15.581388473510742, + "learning_rate": 5.654938533276812e-07, + "loss": 0.5104, + "mean_token_accuracy": 0.838382363319397, + "num_tokens": 50904296.0, + "step": 1335 + }, + { + "epoch": 0.1699529321969215, + "ewc_loss": 0.011858857236802578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.929428425588412e-06, + "grad_norm": 15.486920356750488, + "learning_rate": 5.659177617634591e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8435672521591187, + "num_tokens": 50942180.0, + "step": 1336 + }, + { + "epoch": 0.170080142475512, + "ewc_loss": 0.011837265454232693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9186327234783676e-06, + "grad_norm": 15.592327117919922, + "learning_rate": 5.663416701992369e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8363608121871948, + "num_tokens": 50980287.0, + "step": 1337 + }, + { + "epoch": 0.17020735275410254, + "ewc_loss": 0.011884896084666252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.942447842244292e-06, + "grad_norm": 15.613829612731934, + "learning_rate": 5.667655786350149e-07, + "loss": 0.587, + "mean_token_accuracy": 0.8107329607009888, + "num_tokens": 51014218.0, + "step": 1338 + }, + { + "epoch": 0.17033456303269304, + "ewc_loss": 0.011860382743179798, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.9301914916432e-06, + "grad_norm": 15.506884574890137, + "learning_rate": 5.671894870707927e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8435165882110596, + "num_tokens": 51055915.0, + "step": 1339 + }, + { + "epoch": 0.17046177331128357, + "ewc_loss": 0.011882305145263672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.941152721788967e-06, + "grad_norm": 15.569378852844238, + "learning_rate": 5.676133955065705e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.8551386594772339, + "num_tokens": 51090684.0, + "step": 1340 + }, + { + "epoch": 0.17058898358987407, + "ewc_loss": 0.011928399093449116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.964199317531893e-06, + "grad_norm": 15.655180931091309, + "learning_rate": 5.680373039423484e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8429144620895386, + "num_tokens": 51130156.0, + "step": 1341 + }, + { + "epoch": 0.17071619386846457, + "ewc_loss": 0.01192391011863947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.961955139355268e-06, + "grad_norm": 15.585746765136719, + "learning_rate": 5.684612123781263e-07, + "loss": 0.5324, + "mean_token_accuracy": 0.8307156562805176, + "num_tokens": 51171282.0, + "step": 1342 + }, + { + "epoch": 0.1708434041470551, + "ewc_loss": 0.011911232955753803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.955616416031262e-06, + "grad_norm": 15.576237678527832, + "learning_rate": 5.688851208139042e-07, + "loss": 0.5497, + "mean_token_accuracy": 0.8251686096191406, + "num_tokens": 51210616.0, + "step": 1343 + }, + { + "epoch": 0.1709706144256456, + "ewc_loss": 0.01195745263248682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.978726221655961e-06, + "grad_norm": 15.619856834411621, + "learning_rate": 5.69309029249682e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.8360871076583862, + "num_tokens": 51253357.0, + "step": 1344 + }, + { + "epoch": 0.1710978247042361, + "ewc_loss": 0.011950389482080936, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 5.975194653728977e-06, + "grad_norm": 15.581987380981445, + "learning_rate": 5.697329376854599e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8360064029693604, + "num_tokens": 51292671.0, + "step": 1345 + }, + { + "epoch": 0.17122503498282662, + "ewc_loss": 0.011960038915276527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.980019523121882e-06, + "grad_norm": 15.622925758361816, + "learning_rate": 5.701568461212378e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8569890856742859, + "num_tokens": 51329145.0, + "step": 1346 + }, + { + "epoch": 0.17135224526141712, + "ewc_loss": 0.011966024525463581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.983012215438066e-06, + "grad_norm": 15.636496543884277, + "learning_rate": 5.705807545570157e-07, + "loss": 0.4622, + "mean_token_accuracy": 0.8517768383026123, + "num_tokens": 51371265.0, + "step": 1347 + }, + { + "epoch": 0.17147945554000762, + "ewc_loss": 0.011981294490396976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.99064742345945e-06, + "grad_norm": 15.689349174499512, + "learning_rate": 5.710046629927934e-07, + "loss": 0.5005, + "mean_token_accuracy": 0.838329553604126, + "num_tokens": 51410262.0, + "step": 1348 + }, + { + "epoch": 0.17160666581859815, + "ewc_loss": 0.011997156776487827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.99857821725891e-06, + "grad_norm": 15.646197319030762, + "learning_rate": 5.714285714285714e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8497898578643799, + "num_tokens": 51447796.0, + "step": 1349 + }, + { + "epoch": 0.17173387609718865, + "ewc_loss": 0.01197683997452259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 5.988420070934808e-06, + "grad_norm": 15.646199226379395, + "learning_rate": 5.718524798643492e-07, + "loss": 0.5351, + "mean_token_accuracy": 0.8347505331039429, + "num_tokens": 51482677.0, + "step": 1350 + }, + { + "epoch": 0.17186108637577915, + "ewc_loss": 0.012012731283903122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.00636576564284e-06, + "grad_norm": 15.733120918273926, + "learning_rate": 5.722763883001272e-07, + "loss": 0.5341, + "mean_token_accuracy": 0.833856463432312, + "num_tokens": 51524111.0, + "step": 1351 + }, + { + "epoch": 0.17198829665436968, + "ewc_loss": 0.012020952999591827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.010476681694854e-06, + "grad_norm": 15.672593116760254, + "learning_rate": 5.72700296735905e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8383631706237793, + "num_tokens": 51561008.0, + "step": 1352 + }, + { + "epoch": 0.17211550693296018, + "ewc_loss": 0.012015830725431442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0079155446146615e-06, + "grad_norm": 15.669087409973145, + "learning_rate": 5.731242051716829e-07, + "loss": 0.5615, + "mean_token_accuracy": 0.8194683790206909, + "num_tokens": 51603091.0, + "step": 1353 + }, + { + "epoch": 0.17224271721155068, + "ewc_loss": 0.012034623883664608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0173119891260285e-06, + "grad_norm": 15.760271072387695, + "learning_rate": 5.735481136074608e-07, + "loss": 0.4772, + "mean_token_accuracy": 0.8466188907623291, + "num_tokens": 51637789.0, + "step": 1354 + }, + { + "epoch": 0.1723699274901412, + "ewc_loss": 0.012058480642735958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.029240466887131e-06, + "grad_norm": 15.654821395874023, + "learning_rate": 5.739720220432386e-07, + "loss": 0.4816, + "mean_token_accuracy": 0.8436825275421143, + "num_tokens": 51674207.0, + "step": 1355 + }, + { + "epoch": 0.1724971377687317, + "ewc_loss": 0.012038817629218102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.019408829160966e-06, + "grad_norm": 15.71463680267334, + "learning_rate": 5.743959304790164e-07, + "loss": 0.5784, + "mean_token_accuracy": 0.8212875127792358, + "num_tokens": 51715109.0, + "step": 1356 + }, + { + "epoch": 0.1726243480473222, + "ewc_loss": 0.01208192016929388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.040960215614177e-06, + 
"grad_norm": 15.741072654724121, + "learning_rate": 5.748198389147944e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.8361724615097046, + "num_tokens": 51753373.0, + "step": 1357 + }, + { + "epoch": 0.17275155832591274, + "ewc_loss": 0.012081639841198921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.040819698682753e-06, + "grad_norm": 15.719200134277344, + "learning_rate": 5.752437473505722e-07, + "loss": 0.5631, + "mean_token_accuracy": 0.8266849517822266, + "num_tokens": 51795100.0, + "step": 1358 + }, + { + "epoch": 0.17287876860450324, + "ewc_loss": 0.01206494402140379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0324719015625305e-06, + "grad_norm": 15.638315200805664, + "learning_rate": 5.756676557863502e-07, + "loss": 0.5368, + "mean_token_accuracy": 0.8301790952682495, + "num_tokens": 51831840.0, + "step": 1359 + }, + { + "epoch": 0.17300597888309374, + "ewc_loss": 0.012089857831597328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0449287957453635e-06, + "grad_norm": 15.737990379333496, + "learning_rate": 5.76091564222128e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8455640077590942, + "num_tokens": 51870227.0, + "step": 1360 + }, + { + "epoch": 0.17313318916168427, + "ewc_loss": 0.012125911191105843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.062955435481854e-06, + "grad_norm": 15.750176429748535, + "learning_rate": 5.765154726579059e-07, + "loss": 0.515, + "mean_token_accuracy": 0.8377645015716553, + "num_tokens": 51908251.0, + "step": 1361 + }, + { + "epoch": 0.17326039944027477, + "ewc_loss": 0.012102671898901463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.051335731172003e-06, + "grad_norm": 15.732549667358398, + "learning_rate": 5.769393810936838e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8404229283332825, + "num_tokens": 51944531.0, + "step": 1362 + }, + { + "epoch": 0.17338760971886527, + "ewc_loss": 0.012125877663493156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.062939064577222e-06, + "grad_norm": 
15.808391571044922, + "learning_rate": 5.773632895294616e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8239003419876099, + "num_tokens": 51985742.0, + "step": 1363 + }, + { + "epoch": 0.1735148199974558, + "ewc_loss": 0.012118157930672169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0590791690628976e-06, + "grad_norm": 15.753450393676758, + "learning_rate": 5.777871979652394e-07, + "loss": 0.4945, + "mean_token_accuracy": 0.8434759378433228, + "num_tokens": 52022036.0, + "step": 1364 + }, + { + "epoch": 0.1736420302760463, + "ewc_loss": 0.012137681245803833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.068840775697026e-06, + "grad_norm": 15.788761138916016, + "learning_rate": 5.782111064010173e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8349692225456238, + "num_tokens": 52066077.0, + "step": 1365 + }, + { + "epoch": 0.17376924055463683, + "ewc_loss": 0.012102226726710796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.051113359717419e-06, + "grad_norm": 15.756195068359375, + "learning_rate": 5.786350148367952e-07, + "loss": 0.5088, + "mean_token_accuracy": 0.8390834927558899, + "num_tokens": 52111280.0, + "step": 1366 + }, + { + "epoch": 0.17389645083322733, + "ewc_loss": 0.012135457247495651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.067728463676758e-06, + "grad_norm": 15.793581008911133, + "learning_rate": 5.790589232725731e-07, + "loss": 0.5062, + "mean_token_accuracy": 0.8374831080436707, + "num_tokens": 52153148.0, + "step": 1367 + }, + { + "epoch": 0.17402366111181783, + "ewc_loss": 0.012136544100940228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0682718867610674e-06, + "grad_norm": 15.81591796875, + "learning_rate": 5.79482831708351e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8442642688751221, + "num_tokens": 52186722.0, + "step": 1368 + }, + { + "epoch": 0.17415087139040836, + "ewc_loss": 0.012165956199169159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.082977961341385e-06, + "grad_norm": 15.891120910644531, + 
"learning_rate": 5.799067401441288e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.8382498025894165, + "num_tokens": 52220837.0, + "step": 1369 + }, + { + "epoch": 0.17427808166899886, + "ewc_loss": 0.012158515863120556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.0792581280111335e-06, + "grad_norm": 15.680325508117676, + "learning_rate": 5.803306485799068e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8463807702064514, + "num_tokens": 52257049.0, + "step": 1370 + }, + { + "epoch": 0.17440529194758936, + "ewc_loss": 0.012152491137385368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.076245426811511e-06, + "grad_norm": 15.831695556640625, + "learning_rate": 5.807545570156845e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8499706983566284, + "num_tokens": 52297632.0, + "step": 1371 + }, + { + "epoch": 0.1745325022261799, + "ewc_loss": 0.012217489071190357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.108744400989963e-06, + "grad_norm": 15.819571495056152, + "learning_rate": 5.811784654514624e-07, + "loss": 0.5643, + "mean_token_accuracy": 0.8234262466430664, + "num_tokens": 52331008.0, + "step": 1372 + }, + { + "epoch": 0.1746597125047704, + "ewc_loss": 0.012196571566164494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.098285666666925e-06, + "grad_norm": 15.744474411010742, + "learning_rate": 5.816023738872403e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8442372679710388, + "num_tokens": 52369420.0, + "step": 1373 + }, + { + "epoch": 0.1747869227833609, + "ewc_loss": 0.012230915948748589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1154578361311e-06, + "grad_norm": 15.784187316894531, + "learning_rate": 5.820262823230182e-07, + "loss": 0.526, + "mean_token_accuracy": 0.8346142768859863, + "num_tokens": 52411759.0, + "step": 1374 + }, + { + "epoch": 0.17491413306195142, + "ewc_loss": 0.012247239239513874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.12361964158481e-06, + "grad_norm": 15.852933883666992, + "learning_rate": 
5.824501907587961e-07, + "loss": 0.4852, + "mean_token_accuracy": 0.8444902896881104, + "num_tokens": 52446459.0, + "step": 1375 + }, + { + "epoch": 0.17504134334054192, + "ewc_loss": 0.012251297943294048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.125649179011816e-06, + "grad_norm": 15.826135635375977, + "learning_rate": 5.82874099194574e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8200865387916565, + "num_tokens": 52484615.0, + "step": 1376 + }, + { + "epoch": 0.17516855361913242, + "ewc_loss": 0.012253702618181705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.12685153100756e-06, + "grad_norm": 15.834514617919922, + "learning_rate": 5.832980076303518e-07, + "loss": 0.5802, + "mean_token_accuracy": 0.8171328902244568, + "num_tokens": 52525195.0, + "step": 1377 + }, + { + "epoch": 0.17529576389772294, + "ewc_loss": 0.012265619821846485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.132810085546225e-06, + "grad_norm": 15.89494514465332, + "learning_rate": 5.837219160661297e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8531465530395508, + "num_tokens": 52558783.0, + "step": 1378 + }, + { + "epoch": 0.17542297417631345, + "ewc_loss": 0.012294602580368519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.147301064629573e-06, + "grad_norm": 15.808777809143066, + "learning_rate": 5.841458245019075e-07, + "loss": 0.458, + "mean_token_accuracy": 0.8531876802444458, + "num_tokens": 52599176.0, + "step": 1379 + }, + { + "epoch": 0.17555018445490395, + "ewc_loss": 0.01227351650595665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.136758202046622e-06, + "grad_norm": 15.824189186096191, + "learning_rate": 5.845697329376855e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8404996991157532, + "num_tokens": 52630865.0, + "step": 1380 + }, + { + "epoch": 0.17567739473349447, + "ewc_loss": 0.012319809757173061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.159904842206743e-06, + "grad_norm": 15.870765686035156, + "learning_rate": 5.849936413734633e-07, + 
"loss": 0.5664, + "mean_token_accuracy": 0.8222982287406921, + "num_tokens": 52671702.0, + "step": 1381 + }, + { + "epoch": 0.17580460501208497, + "ewc_loss": 0.012294139713048935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.147070052975323e-06, + "grad_norm": 15.840842247009277, + "learning_rate": 5.854175498092412e-07, + "loss": 0.4705, + "mean_token_accuracy": 0.850278377532959, + "num_tokens": 52712288.0, + "step": 1382 + }, + { + "epoch": 0.17593181529067548, + "ewc_loss": 0.012336790561676025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.168395429995144e-06, + "grad_norm": 15.869810104370117, + "learning_rate": 5.858414582450191e-07, + "loss": 0.5135, + "mean_token_accuracy": 0.8396621346473694, + "num_tokens": 52756569.0, + "step": 1383 + }, + { + "epoch": 0.176059025569266, + "ewc_loss": 0.012334587052464485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.167293577163946e-06, + "grad_norm": 15.820453643798828, + "learning_rate": 5.86265366680797e-07, + "loss": 0.5365, + "mean_token_accuracy": 0.8310695886611938, + "num_tokens": 52796195.0, + "step": 1384 + }, + { + "epoch": 0.1761862358478565, + "ewc_loss": 0.012339691631495953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.169845619297121e-06, + "grad_norm": 15.839245796203613, + "learning_rate": 5.866892751165748e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.8405770659446716, + "num_tokens": 52839687.0, + "step": 1385 + }, + { + "epoch": 0.176313446126447, + "ewc_loss": 0.012386453337967396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.193226454342948e-06, + "grad_norm": 15.920063972473145, + "learning_rate": 5.871131835523526e-07, + "loss": 0.5622, + "mean_token_accuracy": 0.8233487606048584, + "num_tokens": 52875660.0, + "step": 1386 + }, + { + "epoch": 0.17644065640503753, + "ewc_loss": 0.012386090122163296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.1930450101499446e-06, + "grad_norm": 15.861478805541992, + "learning_rate": 5.875370919881305e-07, + "loss": 0.5195, + 
"mean_token_accuracy": 0.8342692255973816, + "num_tokens": 52921220.0, + "step": 1387 + }, + { + "epoch": 0.17656786668362803, + "ewc_loss": 0.012359321117401123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.179660431371303e-06, + "grad_norm": 15.892550468444824, + "learning_rate": 5.879610004239084e-07, + "loss": 0.5728, + "mean_token_accuracy": 0.8232819437980652, + "num_tokens": 52960042.0, + "step": 1388 + }, + { + "epoch": 0.17669507696221853, + "ewc_loss": 0.01238427497446537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.192137334437575e-06, + "grad_norm": 15.871132850646973, + "learning_rate": 5.883849088596863e-07, + "loss": 0.556, + "mean_token_accuracy": 0.8259564638137817, + "num_tokens": 53002817.0, + "step": 1389 + }, + { + "epoch": 0.17682228724080906, + "ewc_loss": 0.012388081289827824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.194040452101035e-06, + "grad_norm": 15.897089004516602, + "learning_rate": 5.888088172954641e-07, + "loss": 0.5307, + "mean_token_accuracy": 0.8336613178253174, + "num_tokens": 53037751.0, + "step": 1390 + }, + { + "epoch": 0.17694949751939956, + "ewc_loss": 0.012424885295331478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.212442713149358e-06, + "grad_norm": 15.889226913452148, + "learning_rate": 5.892327257312421e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.8613495826721191, + "num_tokens": 53074286.0, + "step": 1391 + }, + { + "epoch": 0.1770767077979901, + "ewc_loss": 0.012437370605766773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.218685484782327e-06, + "grad_norm": 15.914953231811523, + "learning_rate": 5.896566341670199e-07, + "loss": 0.4635, + "mean_token_accuracy": 0.8475764393806458, + "num_tokens": 53113525.0, + "step": 1392 + }, + { + "epoch": 0.1772039180765806, + "ewc_loss": 0.012436518445611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.218259386514546e-06, + "grad_norm": 15.961649894714355, + "learning_rate": 5.900805426027977e-07, + "loss": 0.5067, + "mean_token_accuracy": 
0.8400306701660156, + "num_tokens": 53150137.0, + "step": 1393 + }, + { + "epoch": 0.1773311283551711, + "ewc_loss": 0.012474553659558296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.237276920728618e-06, + "grad_norm": 15.940620422363281, + "learning_rate": 5.905044510385756e-07, + "loss": 0.4582, + "mean_token_accuracy": 0.8536518216133118, + "num_tokens": 53189519.0, + "step": 1394 + }, + { + "epoch": 0.17745833863376162, + "ewc_loss": 0.012451584450900555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.225792276381981e-06, + "grad_norm": 15.946430206298828, + "learning_rate": 5.909283594743535e-07, + "loss": 0.5725, + "mean_token_accuracy": 0.8195475339889526, + "num_tokens": 53227452.0, + "step": 1395 + }, + { + "epoch": 0.17758554891235212, + "ewc_loss": 0.012496031820774078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.248015779419802e-06, + "grad_norm": 15.922454833984375, + "learning_rate": 5.913522679101314e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8363665342330933, + "num_tokens": 53265875.0, + "step": 1396 + }, + { + "epoch": 0.17771275919094262, + "ewc_loss": 0.012487939558923244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.243969892238965e-06, + "grad_norm": 16.007631301879883, + "learning_rate": 5.917761763459093e-07, + "loss": 0.5641, + "mean_token_accuracy": 0.8209762573242188, + "num_tokens": 53305477.0, + "step": 1397 + }, + { + "epoch": 0.17783996946953315, + "ewc_loss": 0.012497642077505589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.248821136978222e-06, + "grad_norm": 15.924221992492676, + "learning_rate": 5.922000847816871e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.848301887512207, + "num_tokens": 53342800.0, + "step": 1398 + }, + { + "epoch": 0.17796717974812365, + "ewc_loss": 0.012487280182540417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.2436402004095726e-06, + "grad_norm": 16.033111572265625, + "learning_rate": 5.926239932174651e-07, + "loss": 0.5465, + "mean_token_accuracy": 0.8289766311645508, 
+ "num_tokens": 53387476.0, + "step": 1399 + }, + { + "epoch": 0.17809439002671415, + "ewc_loss": 0.012508451007306576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.254225354496157e-06, + "grad_norm": 15.916728019714355, + "learning_rate": 5.930479016532429e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8397897481918335, + "num_tokens": 53424628.0, + "step": 1400 + }, + { + "epoch": 0.17822160030530468, + "ewc_loss": 0.012493418529629707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.246709290280705e-06, + "grad_norm": 15.988381385803223, + "learning_rate": 5.934718100890207e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8431874513626099, + "num_tokens": 53464459.0, + "step": 1401 + }, + { + "epoch": 0.17834881058389518, + "ewc_loss": 0.012533566914498806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.266783657338237e-06, + "grad_norm": 15.952740669250488, + "learning_rate": 5.938957185247986e-07, + "loss": 0.5321, + "mean_token_accuracy": 0.8288686275482178, + "num_tokens": 53501721.0, + "step": 1402 + }, + { + "epoch": 0.17847602086248568, + "ewc_loss": 0.012518896721303463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.259448582568439e-06, + "grad_norm": 16.05622673034668, + "learning_rate": 5.943196269605765e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.840400218963623, + "num_tokens": 53536607.0, + "step": 1403 + }, + { + "epoch": 0.1786032311410762, + "ewc_loss": 0.01256786659359932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.283933089434868e-06, + "grad_norm": 15.999321937561035, + "learning_rate": 5.947435353963544e-07, + "loss": 0.5458, + "mean_token_accuracy": 0.8307308554649353, + "num_tokens": 53569809.0, + "step": 1404 + }, + { + "epoch": 0.1787304414196667, + "ewc_loss": 0.012527846731245518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.2639232965011615e-06, + "grad_norm": 15.989938735961914, + "learning_rate": 5.951674438321323e-07, + "loss": 0.521, + "mean_token_accuracy": 0.8355880975723267, + "num_tokens": 
53604423.0, + "step": 1405 + }, + { + "epoch": 0.1788576516982572, + "ewc_loss": 0.012571352533996105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.2856761360308155e-06, + "grad_norm": 16.05470085144043, + "learning_rate": 5.955913522679101e-07, + "loss": 0.5174, + "mean_token_accuracy": 0.8337040543556213, + "num_tokens": 53642145.0, + "step": 1406 + }, + { + "epoch": 0.17898486197684774, + "ewc_loss": 0.01259758323431015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.298791504377732e-06, + "grad_norm": 16.043548583984375, + "learning_rate": 5.96015260703688e-07, + "loss": 0.594, + "mean_token_accuracy": 0.8113270401954651, + "num_tokens": 53682998.0, + "step": 1407 + }, + { + "epoch": 0.17911207225543824, + "ewc_loss": 0.01258821040391922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.294105332926847e-06, + "grad_norm": 16.01046371459961, + "learning_rate": 5.964391691394659e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8351619243621826, + "num_tokens": 53729764.0, + "step": 1408 + }, + { + "epoch": 0.17923928253402874, + "ewc_loss": 0.01258040964603424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.290204964898294e-06, + "grad_norm": 16.097476959228516, + "learning_rate": 5.968630775752436e-07, + "loss": 0.5604, + "mean_token_accuracy": 0.8248487710952759, + "num_tokens": 53764522.0, + "step": 1409 + }, + { + "epoch": 0.17936649281261927, + "ewc_loss": 0.012607289478182793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.303644568106392e-06, + "grad_norm": 16.099178314208984, + "learning_rate": 5.972869860110216e-07, + "loss": 0.5418, + "mean_token_accuracy": 0.8255554437637329, + "num_tokens": 53799791.0, + "step": 1410 + }, + { + "epoch": 0.17949370309120977, + "ewc_loss": 0.012578555382788181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.289277735049836e-06, + "grad_norm": 16.04193115234375, + "learning_rate": 5.977108944467994e-07, + "loss": 0.5799, + "mean_token_accuracy": 0.822258472442627, + "num_tokens": 53840181.0, + "step": 1411 + 
}, + { + "epoch": 0.17962091336980027, + "ewc_loss": 0.012629752978682518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.314876372925937e-06, + "grad_norm": 16.06656265258789, + "learning_rate": 5.981348028825774e-07, + "loss": 0.485, + "mean_token_accuracy": 0.8484709858894348, + "num_tokens": 53882813.0, + "step": 1412 + }, + { + "epoch": 0.1797481236483908, + "ewc_loss": 0.012601365335285664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.300682798610069e-06, + "grad_norm": 16.062822341918945, + "learning_rate": 5.985587113183552e-07, + "loss": 0.5345, + "mean_token_accuracy": 0.8319951891899109, + "num_tokens": 53922198.0, + "step": 1413 + }, + { + "epoch": 0.1798753339269813, + "ewc_loss": 0.012647842057049274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.323920842987718e-06, + "grad_norm": 16.10866355895996, + "learning_rate": 5.989826197541331e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8347089886665344, + "num_tokens": 53959390.0, + "step": 1414 + }, + { + "epoch": 0.18000254420557182, + "ewc_loss": 0.01264229230582714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.321145974652609e-06, + "grad_norm": 16.132856369018555, + "learning_rate": 5.99406528189911e-07, + "loss": 0.5584, + "mean_token_accuracy": 0.8295333981513977, + "num_tokens": 54005707.0, + "step": 1415 + }, + { + "epoch": 0.18012975448416232, + "ewc_loss": 0.01262367982417345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.311840024864068e-06, + "grad_norm": 16.010229110717773, + "learning_rate": 5.998304366256888e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.8341010808944702, + "num_tokens": 54047808.0, + "step": 1416 + }, + { + "epoch": 0.18025696476275282, + "ewc_loss": 0.012665348127484322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.332674274744932e-06, + "grad_norm": 16.208202362060547, + "learning_rate": 6.002543450614666e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.860421895980835, + "num_tokens": 54082929.0, + "step": 1417 + }, + { + "epoch": 
0.18038417504134335, + "ewc_loss": 0.012668553739786148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.3342768044094555e-06, + "grad_norm": 15.996745109558105, + "learning_rate": 6.006782534972446e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8401139974594116, + "num_tokens": 54120199.0, + "step": 1418 + }, + { + "epoch": 0.18051138531993385, + "ewc_loss": 0.01265519205480814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.3275961110775825e-06, + "grad_norm": 16.202878952026367, + "learning_rate": 6.011021619330224e-07, + "loss": 0.5425, + "mean_token_accuracy": 0.8272807598114014, + "num_tokens": 54158629.0, + "step": 1419 + }, + { + "epoch": 0.18063859559852435, + "ewc_loss": 0.012733694165945053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.366847173921997e-06, + "grad_norm": 16.098228454589844, + "learning_rate": 6.015260703688004e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8318520784378052, + "num_tokens": 54194772.0, + "step": 1420 + }, + { + "epoch": 0.18076580587711488, + "ewc_loss": 0.012660524807870388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.33026229479583e-06, + "grad_norm": 16.10330581665039, + "learning_rate": 6.019499788045782e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8330292105674744, + "num_tokens": 54229825.0, + "step": 1421 + }, + { + "epoch": 0.18089301615570538, + "ewc_loss": 0.012703047133982182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.3515235524391755e-06, + "grad_norm": 16.12122917175293, + "learning_rate": 6.023738872403561e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8426293134689331, + "num_tokens": 54260051.0, + "step": 1422 + }, + { + "epoch": 0.18102022643429588, + "ewc_loss": 0.012701394036412239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.350696821755264e-06, + "grad_norm": 16.04332733154297, + "learning_rate": 6.02797795676134e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8416448831558228, + "num_tokens": 54297865.0, + "step": 1423 + }, + { + "epoch": 0.1811474367128864, + 
"ewc_loss": 0.012755181640386581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.37759103483404e-06, + "grad_norm": 16.220945358276367, + "learning_rate": 6.032217041119118e-07, + "loss": 0.5282, + "mean_token_accuracy": 0.8292922973632812, + "num_tokens": 54334618.0, + "step": 1424 + }, + { + "epoch": 0.1812746469914769, + "ewc_loss": 0.012790863402187824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.395431682904018e-06, + "grad_norm": 16.100738525390625, + "learning_rate": 6.036456125476896e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.8217145204544067, + "num_tokens": 54369761.0, + "step": 1425 + }, + { + "epoch": 0.1814018572700674, + "ewc_loss": 0.012751450762152672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.375725206453353e-06, + "grad_norm": 16.148155212402344, + "learning_rate": 6.040695209834675e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.850197434425354, + "num_tokens": 54411549.0, + "step": 1426 + }, + { + "epoch": 0.18152906754865794, + "ewc_loss": 0.012806492857635021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.403246516129002e-06, + "grad_norm": 16.161060333251953, + "learning_rate": 6.044934294192454e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.8283752202987671, + "num_tokens": 54449137.0, + "step": 1427 + }, + { + "epoch": 0.18165627782724844, + "ewc_loss": 0.012776811607182026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.388405836332822e-06, + "grad_norm": 16.109508514404297, + "learning_rate": 6.049173378550233e-07, + "loss": 0.5034, + "mean_token_accuracy": 0.8424074649810791, + "num_tokens": 54494604.0, + "step": 1428 + }, + { + "epoch": 0.18178348810583894, + "ewc_loss": 0.01282150112092495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.41075075691333e-06, + "grad_norm": 16.189552307128906, + "learning_rate": 6.053412462908012e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8454164266586304, + "num_tokens": 54533533.0, + "step": 1429 + }, + { + "epoch": 0.18191069838442947, + "ewc_loss": 
0.012823679484426975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.4118398768187035e-06, + "grad_norm": 16.166696548461914, + "learning_rate": 6.05765154726579e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.825960099697113, + "num_tokens": 54575211.0, + "step": 1430 + }, + { + "epoch": 0.18203790866301997, + "ewc_loss": 0.012804490514099598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.40224516246235e-06, + "grad_norm": 16.134437561035156, + "learning_rate": 6.061890631623569e-07, + "loss": 0.4599, + "mean_token_accuracy": 0.8521186709403992, + "num_tokens": 54618308.0, + "step": 1431 + }, + { + "epoch": 0.18216511894161047, + "ewc_loss": 0.012821810320019722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.410905371012632e-06, + "grad_norm": 16.13772964477539, + "learning_rate": 6.066129715981347e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8412273526191711, + "num_tokens": 54660260.0, + "step": 1432 + }, + { + "epoch": 0.182292329220201, + "ewc_loss": 0.012820291332900524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.4101454881893005e-06, + "grad_norm": 16.21549415588379, + "learning_rate": 6.070368800339126e-07, + "loss": 0.4733, + "mean_token_accuracy": 0.8484454154968262, + "num_tokens": 54692056.0, + "step": 1433 + }, + { + "epoch": 0.1824195394987915, + "ewc_loss": 0.01283752266317606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.4187611314991955e-06, + "grad_norm": 16.20890998840332, + "learning_rate": 6.074607884696905e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8415387868881226, + "num_tokens": 54729252.0, + "step": 1434 + }, + { + "epoch": 0.182546749777382, + "ewc_loss": 0.012847112491726875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.423556442314293e-06, + "grad_norm": 16.188396453857422, + "learning_rate": 6.078846969054684e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.834595799446106, + "num_tokens": 54763422.0, + "step": 1435 + }, + { + "epoch": 0.18267396005597253, + "ewc_loss": 0.012863793410360813, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.431896508729551e-06, + "grad_norm": 16.228595733642578, + "learning_rate": 6.083086053412463e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8438870906829834, + "num_tokens": 54803494.0, + "step": 1436 + }, + { + "epoch": 0.18280117033456303, + "ewc_loss": 0.012841573916375637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.420787030947395e-06, + "grad_norm": 16.200754165649414, + "learning_rate": 6.087325137770242e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8460346460342407, + "num_tokens": 54839349.0, + "step": 1437 + }, + { + "epoch": 0.18292838061315353, + "ewc_loss": 0.012874123640358448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.43706198388827e-06, + "grad_norm": 16.219175338745117, + "learning_rate": 6.09156422212802e-07, + "loss": 0.5549, + "mean_token_accuracy": 0.8289068341255188, + "num_tokens": 54881941.0, + "step": 1438 + }, + { + "epoch": 0.18305559089174406, + "ewc_loss": 0.012858012691140175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.429006134567317e-06, + "grad_norm": 16.13370132446289, + "learning_rate": 6.095803306485799e-07, + "loss": 0.5045, + "mean_token_accuracy": 0.8373116850852966, + "num_tokens": 54922927.0, + "step": 1439 + }, + { + "epoch": 0.18318280117033456, + "ewc_loss": 0.01288670115172863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.443350685003679e-06, + "grad_norm": 16.26945686340332, + "learning_rate": 6.100042390843577e-07, + "loss": 0.5344, + "mean_token_accuracy": 0.8301223516464233, + "num_tokens": 54963885.0, + "step": 1440 + }, + { + "epoch": 0.1833100114489251, + "ewc_loss": 0.012938408181071281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.4692039813962765e-06, + "grad_norm": 16.24850082397461, + "learning_rate": 6.104281475201356e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8449094295501709, + "num_tokens": 55001284.0, + "step": 1441 + }, + { + "epoch": 0.1834372217275156, + "ewc_loss": 0.012892886064946651, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.446442966989707e-06, + "grad_norm": 16.20453643798828, + "learning_rate": 6.108520559559135e-07, + "loss": 0.5428, + "mean_token_accuracy": 0.828522264957428, + "num_tokens": 55034813.0, + "step": 1442 + }, + { + "epoch": 0.1835644320061061, + "ewc_loss": 0.012924768030643463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.462384135375032e-06, + "grad_norm": 16.195226669311523, + "learning_rate": 6.112759643916914e-07, + "loss": 0.4472, + "mean_token_accuracy": 0.8565714359283447, + "num_tokens": 55070620.0, + "step": 1443 + }, + { + "epoch": 0.18369164228469662, + "ewc_loss": 0.012919651344418526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.459825726778945e-06, + "grad_norm": 16.271026611328125, + "learning_rate": 6.116998728274693e-07, + "loss": 0.5303, + "mean_token_accuracy": 0.8381104469299316, + "num_tokens": 55108394.0, + "step": 1444 + }, + { + "epoch": 0.18381885256328712, + "ewc_loss": 0.01293694693595171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.468473657150753e-06, + "grad_norm": 16.174030303955078, + "learning_rate": 6.121237812632472e-07, + "loss": 0.5348, + "mean_token_accuracy": 0.8301469087600708, + "num_tokens": 55148170.0, + "step": 1445 + }, + { + "epoch": 0.18394606284187762, + "ewc_loss": 0.012963262386620045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.481631317001302e-06, + "grad_norm": 16.274635314941406, + "learning_rate": 6.125476896990249e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8506850004196167, + "num_tokens": 55188102.0, + "step": 1446 + }, + { + "epoch": 0.18407327312046814, + "ewc_loss": 0.012986668385565281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.493334240076365e-06, + "grad_norm": 16.256196975708008, + "learning_rate": 6.129715981348028e-07, + "loss": 0.5148, + "mean_token_accuracy": 0.8414393663406372, + "num_tokens": 55225893.0, + "step": 1447 + }, + { + "epoch": 0.18420048339905865, + "ewc_loss": 0.012950374744832516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.475187547039241e-06, + "grad_norm": 16.202991485595703, + "learning_rate": 6.133955065705807e-07, + "loss": 0.5335, + "mean_token_accuracy": 0.8295517563819885, + "num_tokens": 55265321.0, + "step": 1448 + }, + { + "epoch": 0.18432769367764915, + "ewc_loss": 0.01298442017287016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.492210104624974e-06, + "grad_norm": 16.288251876831055, + "learning_rate": 6.138194150063585e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8449699282646179, + "num_tokens": 55304179.0, + "step": 1449 + }, + { + "epoch": 0.18445490395623967, + "ewc_loss": 0.013039783574640751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.519891940115485e-06, + "grad_norm": 16.24574851989746, + "learning_rate": 6.142433234421365e-07, + "loss": 0.5281, + "mean_token_accuracy": 0.8325566649436951, + "num_tokens": 55343266.0, + "step": 1450 + }, + { + "epoch": 0.18458211423483017, + "ewc_loss": 0.012982957996428013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.491478870884748e-06, + "grad_norm": 16.2563533782959, + "learning_rate": 6.146672318779143e-07, + "loss": 0.5328, + "mean_token_accuracy": 0.8327249884605408, + "num_tokens": 55381147.0, + "step": 1451 + }, + { + "epoch": 0.18470932451342068, + "ewc_loss": 0.01303956750780344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.519783710245974e-06, + "grad_norm": 16.294649124145508, + "learning_rate": 6.150911403136923e-07, + "loss": 0.5334, + "mean_token_accuracy": 0.8340469598770142, + "num_tokens": 55414855.0, + "step": 1452 + }, + { + "epoch": 0.1848365347920112, + "ewc_loss": 0.013028604909777641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.51430264042574e-06, + "grad_norm": 16.29881477355957, + "learning_rate": 6.155150487494701e-07, + "loss": 0.4555, + "mean_token_accuracy": 0.8542364239692688, + "num_tokens": 55449699.0, + "step": 1453 + }, + { + "epoch": 0.1849637450706017, + "ewc_loss": 0.013052595779299736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.526297966047423e-06, + 
"grad_norm": 16.225723266601562, + "learning_rate": 6.159389571852479e-07, + "loss": 0.4828, + "mean_token_accuracy": 0.8477851748466492, + "num_tokens": 55493202.0, + "step": 1454 + }, + { + "epoch": 0.1850909553491922, + "ewc_loss": 0.013044482097029686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.522241164930165e-06, + "grad_norm": 16.276447296142578, + "learning_rate": 6.163628656210258e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8403871059417725, + "num_tokens": 55539873.0, + "step": 1455 + }, + { + "epoch": 0.18521816562778273, + "ewc_loss": 0.01307513564825058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.537567969644442e-06, + "grad_norm": 16.365829467773438, + "learning_rate": 6.167867740568037e-07, + "loss": 0.5658, + "mean_token_accuracy": 0.8222851157188416, + "num_tokens": 55576338.0, + "step": 1456 + }, + { + "epoch": 0.18534537590637323, + "ewc_loss": 0.0130997309461236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.549865702254465e-06, + "grad_norm": 16.2814884185791, + "learning_rate": 6.172106824925815e-07, + "loss": 0.485, + "mean_token_accuracy": 0.8454246520996094, + "num_tokens": 55618330.0, + "step": 1457 + }, + { + "epoch": 0.18547258618496373, + "ewc_loss": 0.013059896416962147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.529948223032989e-06, + "grad_norm": 16.35009002685547, + "learning_rate": 6.176345909283595e-07, + "loss": 0.5025, + "mean_token_accuracy": 0.8405296802520752, + "num_tokens": 55657394.0, + "step": 1458 + }, + { + "epoch": 0.18559979646355426, + "ewc_loss": 0.013121828436851501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.560914243891602e-06, + "grad_norm": 16.32202911376953, + "learning_rate": 6.180584993641373e-07, + "loss": 0.4886, + "mean_token_accuracy": 0.8444958925247192, + "num_tokens": 55695087.0, + "step": 1459 + }, + { + "epoch": 0.18572700674214476, + "ewc_loss": 0.013086458668112755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.543229119415628e-06, + "grad_norm": 
16.367557525634766, + "learning_rate": 6.184824077999153e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8496623039245605, + "num_tokens": 55730852.0, + "step": 1460 + }, + { + "epoch": 0.18585421702073526, + "ewc_loss": 0.013104679062962532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.552339527843287e-06, + "grad_norm": 16.338212966918945, + "learning_rate": 6.189063162356931e-07, + "loss": 0.5691, + "mean_token_accuracy": 0.8219500780105591, + "num_tokens": 55765678.0, + "step": 1461 + }, + { + "epoch": 0.1859814272993258, + "ewc_loss": 0.013124056160449982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.562027920153923e-06, + "grad_norm": 16.48838996887207, + "learning_rate": 6.193302246714709e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8456023335456848, + "num_tokens": 55804409.0, + "step": 1462 + }, + { + "epoch": 0.1861086375779163, + "ewc_loss": 0.013152788393199444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.5763942984631285e-06, + "grad_norm": 16.361555099487305, + "learning_rate": 6.197541331072488e-07, + "loss": 0.5242, + "mean_token_accuracy": 0.8344638347625732, + "num_tokens": 55839520.0, + "step": 1463 + }, + { + "epoch": 0.1862358478565068, + "ewc_loss": 0.013110142201185226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.555071195180062e-06, + "grad_norm": 16.32754135131836, + "learning_rate": 6.201780415430267e-07, + "loss": 0.5037, + "mean_token_accuracy": 0.8398892879486084, + "num_tokens": 55879089.0, + "step": 1464 + }, + { + "epoch": 0.18636305813509732, + "ewc_loss": 0.01315197255462408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.575986390089383e-06, + "grad_norm": 16.361120223999023, + "learning_rate": 6.206019499788045e-07, + "loss": 0.5487, + "mean_token_accuracy": 0.8286603689193726, + "num_tokens": 55916655.0, + "step": 1465 + }, + { + "epoch": 0.18649026841368782, + "ewc_loss": 0.013128119520843029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.564059731317684e-06, + "grad_norm": 16.351652145385742, + 
"learning_rate": 6.210258584145825e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8434033393859863, + "num_tokens": 55953828.0, + "step": 1466 + }, + { + "epoch": 0.18661747869227835, + "ewc_loss": 0.013182577677071095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.591288638446713e-06, + "grad_norm": 16.40154266357422, + "learning_rate": 6.214497668503603e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8289244174957275, + "num_tokens": 55992037.0, + "step": 1467 + }, + { + "epoch": 0.18674468897086885, + "ewc_loss": 0.013156398199498653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.578199190698797e-06, + "grad_norm": 16.3222713470459, + "learning_rate": 6.218736752861383e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8453455567359924, + "num_tokens": 56020424.0, + "step": 1468 + }, + { + "epoch": 0.18687189924945935, + "ewc_loss": 0.0131783876568079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.589193617401179e-06, + "grad_norm": 16.348779678344727, + "learning_rate": 6.22297583721916e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8403680324554443, + "num_tokens": 56060839.0, + "step": 1469 + }, + { + "epoch": 0.18699910952804988, + "ewc_loss": 0.01321929320693016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.609646789001999e-06, + "grad_norm": 16.358427047729492, + "learning_rate": 6.227214921576938e-07, + "loss": 0.5546, + "mean_token_accuracy": 0.8299879431724548, + "num_tokens": 56099095.0, + "step": 1470 + }, + { + "epoch": 0.18712631980664038, + "ewc_loss": 0.013232949189841747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.61647482047556e-06, + "grad_norm": 16.42691421508789, + "learning_rate": 6.231454005934718e-07, + "loss": 0.4485, + "mean_token_accuracy": 0.8587103486061096, + "num_tokens": 56133805.0, + "step": 1471 + }, + { + "epoch": 0.18725353008523088, + "ewc_loss": 0.013265821151435375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.632910753978649e-06, + "grad_norm": 16.39801788330078, + "learning_rate": 
6.235693090292496e-07, + "loss": 0.5697, + "mean_token_accuracy": 0.8242950439453125, + "num_tokens": 56173064.0, + "step": 1472 + }, + { + "epoch": 0.1873807403638214, + "ewc_loss": 0.013219949789345264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6099751165893395e-06, + "grad_norm": 16.322908401489258, + "learning_rate": 6.239932174650275e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.85906982421875, + "num_tokens": 56213285.0, + "step": 1473 + }, + { + "epoch": 0.1875079506424119, + "ewc_loss": 0.013258861377835274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.629430572502315e-06, + "grad_norm": 16.431482315063477, + "learning_rate": 6.244171259008054e-07, + "loss": 0.5332, + "mean_token_accuracy": 0.8301964402198792, + "num_tokens": 56248218.0, + "step": 1474 + }, + { + "epoch": 0.1876351609210024, + "ewc_loss": 0.013299615122377872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.649807346548187e-06, + "grad_norm": 16.432159423828125, + "learning_rate": 6.248410343365833e-07, + "loss": 0.4771, + "mean_token_accuracy": 0.8493520617485046, + "num_tokens": 56286098.0, + "step": 1475 + }, + { + "epoch": 0.18776237119959294, + "ewc_loss": 0.013256045058369637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6280226747039706e-06, + "grad_norm": 16.411649703979492, + "learning_rate": 6.252649427723612e-07, + "loss": 0.4844, + "mean_token_accuracy": 0.8476614356040955, + "num_tokens": 56329732.0, + "step": 1476 + }, + { + "epoch": 0.18788958147818344, + "ewc_loss": 0.013287212699651718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.643606411671499e-06, + "grad_norm": 16.431718826293945, + "learning_rate": 6.25688851208139e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8461197018623352, + "num_tokens": 56367838.0, + "step": 1477 + }, + { + "epoch": 0.18801679175677394, + "ewc_loss": 0.01328662596642971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.643313099630177e-06, + "grad_norm": 16.509801864624023, + "learning_rate": 6.261127596439168e-07, + 
"loss": 0.5418, + "mean_token_accuracy": 0.8277405500411987, + "num_tokens": 56413059.0, + "step": 1478 + }, + { + "epoch": 0.18814400203536447, + "ewc_loss": 0.01331452839076519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.657264293608023e-06, + "grad_norm": 16.485288619995117, + "learning_rate": 6.265366680796948e-07, + "loss": 0.4447, + "mean_token_accuracy": 0.8534291982650757, + "num_tokens": 56453806.0, + "step": 1479 + }, + { + "epoch": 0.18827121231395497, + "ewc_loss": 0.013287345878779888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.643672804784728e-06, + "grad_norm": 16.4462833404541, + "learning_rate": 6.269605765154726e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.8422040343284607, + "num_tokens": 56497448.0, + "step": 1480 + }, + { + "epoch": 0.18839842259254547, + "ewc_loss": 0.013343090191483498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.671545179415261e-06, + "grad_norm": 16.496013641357422, + "learning_rate": 6.273844849512505e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.8417781591415405, + "num_tokens": 56537731.0, + "step": 1481 + }, + { + "epoch": 0.188525632871136, + "ewc_loss": 0.013311320915818214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.655660399701446e-06, + "grad_norm": 16.4739933013916, + "learning_rate": 6.278083933870284e-07, + "loss": 0.4339, + "mean_token_accuracy": 0.8620628714561462, + "num_tokens": 56580944.0, + "step": 1482 + }, + { + "epoch": 0.1886528431497265, + "ewc_loss": 0.013313851319253445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6569255068316124e-06, + "grad_norm": 16.50337791442871, + "learning_rate": 6.282323018228063e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.852644681930542, + "num_tokens": 56616306.0, + "step": 1483 + }, + { + "epoch": 0.188780053428317, + "ewc_loss": 0.013319732621312141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.659866357949795e-06, + "grad_norm": 16.437456130981445, + "learning_rate": 6.286562102585841e-07, + "loss": 0.5466, + 
"mean_token_accuracy": 0.8318309187889099, + "num_tokens": 56661874.0, + "step": 1484 + }, + { + "epoch": 0.18890726370690752, + "ewc_loss": 0.013343194499611855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.671597475360613e-06, + "grad_norm": 16.56873321533203, + "learning_rate": 6.29080118694362e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8572418093681335, + "num_tokens": 56696697.0, + "step": 1485 + }, + { + "epoch": 0.18903447398549802, + "ewc_loss": 0.013331915251910686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.665957698714919e-06, + "grad_norm": 16.41847038269043, + "learning_rate": 6.295040271301398e-07, + "loss": 0.4409, + "mean_token_accuracy": 0.8570384979248047, + "num_tokens": 56732581.0, + "step": 1486 + }, + { + "epoch": 0.18916168426408853, + "ewc_loss": 0.013311725109815598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.655862762272591e-06, + "grad_norm": 16.541955947875977, + "learning_rate": 6.299279355659178e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8453464508056641, + "num_tokens": 56777905.0, + "step": 1487 + }, + { + "epoch": 0.18928889454267905, + "ewc_loss": 0.013385143131017685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.69257178742555e-06, + "grad_norm": 16.54828453063965, + "learning_rate": 6.303518440016956e-07, + "loss": 0.5106, + "mean_token_accuracy": 0.8369529247283936, + "num_tokens": 56812915.0, + "step": 1488 + }, + { + "epoch": 0.18941610482126955, + "ewc_loss": 0.013354750350117683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.677375040453626e-06, + "grad_norm": 16.491676330566406, + "learning_rate": 6.307757524374735e-07, + "loss": 0.5058, + "mean_token_accuracy": 0.8425103425979614, + "num_tokens": 56846907.0, + "step": 1489 + }, + { + "epoch": 0.18954331509986008, + "ewc_loss": 0.013360695913434029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6803481786337215e-06, + "grad_norm": 16.523942947387695, + "learning_rate": 6.311996608732514e-07, + "loss": 0.5326, + "mean_token_accuracy": 
0.8363462686538696, + "num_tokens": 56884614.0, + "step": 1490 + }, + { + "epoch": 0.18967052537845058, + "ewc_loss": 0.013377024792134762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.688512257824186e-06, + "grad_norm": 16.45964241027832, + "learning_rate": 6.316235693090292e-07, + "loss": 0.5251, + "mean_token_accuracy": 0.832677960395813, + "num_tokens": 56920508.0, + "step": 1491 + }, + { + "epoch": 0.18979773565704108, + "ewc_loss": 0.01338240597397089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.6912029978993814e-06, + "grad_norm": 16.5395450592041, + "learning_rate": 6.320474777448071e-07, + "loss": 0.5278, + "mean_token_accuracy": 0.8324117064476013, + "num_tokens": 56957629.0, + "step": 1492 + }, + { + "epoch": 0.1899249459356316, + "ewc_loss": 0.01340438611805439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.702192877128255e-06, + "grad_norm": 16.485532760620117, + "learning_rate": 6.324713861805849e-07, + "loss": 0.4863, + "mean_token_accuracy": 0.8435434103012085, + "num_tokens": 56997265.0, + "step": 1493 + }, + { + "epoch": 0.1900521562142221, + "ewc_loss": 0.013407032936811447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.703516646666685e-06, + "grad_norm": 16.515609741210938, + "learning_rate": 6.328952946163628e-07, + "loss": 0.5906, + "mean_token_accuracy": 0.8188013434410095, + "num_tokens": 57031519.0, + "step": 1494 + }, + { + "epoch": 0.1901793664928126, + "ewc_loss": 0.01346745528280735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.733727786922827e-06, + "grad_norm": 16.522336959838867, + "learning_rate": 6.333192030521407e-07, + "loss": 0.46, + "mean_token_accuracy": 0.8498347401618958, + "num_tokens": 57067264.0, + "step": 1495 + }, + { + "epoch": 0.19030657677140314, + "ewc_loss": 0.013437660411000252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.718830263707787e-06, + "grad_norm": 16.559734344482422, + "learning_rate": 6.337431114879186e-07, + "loss": 0.5146, + "mean_token_accuracy": 0.838477373123169, + 
"num_tokens": 57104906.0, + "step": 1496 + }, + { + "epoch": 0.19043378704999364, + "ewc_loss": 0.013475507497787476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.737753665220225e-06, + "grad_norm": 16.55466079711914, + "learning_rate": 6.341670199236965e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8491591215133667, + "num_tokens": 57142534.0, + "step": 1497 + }, + { + "epoch": 0.19056099732858414, + "ewc_loss": 0.013447149656713009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.723574642819585e-06, + "grad_norm": 16.530885696411133, + "learning_rate": 6.345909283594744e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8535138964653015, + "num_tokens": 57188735.0, + "step": 1498 + }, + { + "epoch": 0.19068820760717467, + "ewc_loss": 0.013484599068760872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.742299319739686e-06, + "grad_norm": 16.566926956176758, + "learning_rate": 6.350148367952522e-07, + "loss": 0.5147, + "mean_token_accuracy": 0.8318514823913574, + "num_tokens": 57224610.0, + "step": 1499 + }, + { + "epoch": 0.19081541788576517, + "ewc_loss": 0.013505960814654827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.752980425517308e-06, + "grad_norm": 16.503389358520508, + "learning_rate": 6.354387452310301e-07, + "loss": 0.4682, + "mean_token_accuracy": 0.846940279006958, + "num_tokens": 57259553.0, + "step": 1500 + }, + { + "epoch": 0.19094262816435567, + "ewc_loss": 0.013508928008377552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.754463811375899e-06, + "grad_norm": 16.504743576049805, + "learning_rate": 6.358626536668079e-07, + "loss": 0.4735, + "mean_token_accuracy": 0.8509975671768188, + "num_tokens": 57295819.0, + "step": 1501 + }, + { + "epoch": 0.1910698384429462, + "ewc_loss": 0.013550643809139729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.775321708119009e-06, + "grad_norm": 16.574125289916992, + "learning_rate": 6.362865621025858e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8555886149406433, + "num_tokens": 
57339110.0, + "step": 1502 + }, + { + "epoch": 0.1911970487215367, + "ewc_loss": 0.01356569305062294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.782846412534127e-06, + "grad_norm": 16.5351619720459, + "learning_rate": 6.367104705383637e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8441891670227051, + "num_tokens": 57377311.0, + "step": 1503 + }, + { + "epoch": 0.1913242590001272, + "ewc_loss": 0.013550452888011932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.775226665922673e-06, + "grad_norm": 16.657018661499023, + "learning_rate": 6.371343789741416e-07, + "loss": 0.5028, + "mean_token_accuracy": 0.8364895582199097, + "num_tokens": 57419281.0, + "step": 1504 + }, + { + "epoch": 0.19145146927871773, + "ewc_loss": 0.013562245294451714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.7811224653269164e-06, + "grad_norm": 16.46337890625, + "learning_rate": 6.375582874099195e-07, + "loss": 0.4888, + "mean_token_accuracy": 0.846366286277771, + "num_tokens": 57458134.0, + "step": 1505 + }, + { + "epoch": 0.19157867955730823, + "ewc_loss": 0.0135597949847579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.779897375963628e-06, + "grad_norm": 16.7213134765625, + "learning_rate": 6.379821958456974e-07, + "loss": 0.545, + "mean_token_accuracy": 0.826981782913208, + "num_tokens": 57492931.0, + "step": 1506 + }, + { + "epoch": 0.19170588983589873, + "ewc_loss": 0.013599764555692673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.799882157793036e-06, + "grad_norm": 16.588281631469727, + "learning_rate": 6.384061042814751e-07, + "loss": 0.5149, + "mean_token_accuracy": 0.8361371755599976, + "num_tokens": 57529581.0, + "step": 1507 + }, + { + "epoch": 0.19183310011448926, + "ewc_loss": 0.013572878204286098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.78643891660613e-06, + "grad_norm": 16.596025466918945, + "learning_rate": 6.38830012717253e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.8411238193511963, + "num_tokens": 57570302.0, + "step": 1508 + }, + { 
+ "epoch": 0.19196031039307976, + "ewc_loss": 0.013607637025415897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.803818450862309e-06, + "grad_norm": 16.624652862548828, + "learning_rate": 6.392539211530309e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8357625007629395, + "num_tokens": 57615974.0, + "step": 1509 + }, + { + "epoch": 0.19208752067167026, + "ewc_loss": 0.013601117767393589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.800558821851155e-06, + "grad_norm": 16.575950622558594, + "learning_rate": 6.396778295888087e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8495870232582092, + "num_tokens": 57653737.0, + "step": 1510 + }, + { + "epoch": 0.1922147309502608, + "ewc_loss": 0.013597064651548862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.798532467655605e-06, + "grad_norm": 16.564373016357422, + "learning_rate": 6.401017380245867e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8359118700027466, + "num_tokens": 57693431.0, + "step": 1511 + }, + { + "epoch": 0.1923419412288513, + "ewc_loss": 0.013621986843645573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.810993454564596e-06, + "grad_norm": 16.64924430847168, + "learning_rate": 6.405256464603645e-07, + "loss": 0.4972, + "mean_token_accuracy": 0.840422511100769, + "num_tokens": 57736413.0, + "step": 1512 + }, + { + "epoch": 0.1924691515074418, + "ewc_loss": 0.013613360933959484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.806680630688788e-06, + "grad_norm": 16.61662483215332, + "learning_rate": 6.409495548961425e-07, + "loss": 0.4813, + "mean_token_accuracy": 0.8492861390113831, + "num_tokens": 57773293.0, + "step": 1513 + }, + { + "epoch": 0.19259636178603232, + "ewc_loss": 0.013645485043525696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.822742307122098e-06, + "grad_norm": 16.659446716308594, + "learning_rate": 6.413734633319203e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8445849418640137, + "num_tokens": 57809294.0, + "step": 1514 + }, + { + "epoch": 
0.19272357206462282, + "ewc_loss": 0.013657348230481148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.828674031567061e-06, + "grad_norm": 16.705018997192383, + "learning_rate": 6.417973717676981e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8499006032943726, + "num_tokens": 57843610.0, + "step": 1515 + }, + { + "epoch": 0.19285078234321335, + "ewc_loss": 0.013664822094142437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.832411145296646e-06, + "grad_norm": 16.605093002319336, + "learning_rate": 6.42221280203476e-07, + "loss": 0.5249, + "mean_token_accuracy": 0.8314788937568665, + "num_tokens": 57893098.0, + "step": 1516 + }, + { + "epoch": 0.19297799262180385, + "ewc_loss": 0.013610966503620148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.805483280913904e-06, + "grad_norm": 16.652891159057617, + "learning_rate": 6.426451886392539e-07, + "loss": 0.5522, + "mean_token_accuracy": 0.8302069902420044, + "num_tokens": 57933098.0, + "step": 1517 + }, + { + "epoch": 0.19310520290039435, + "ewc_loss": 0.013679925352334976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.839962679805467e-06, + "grad_norm": 16.62326431274414, + "learning_rate": 6.430690970750317e-07, + "loss": 0.4839, + "mean_token_accuracy": 0.8461444973945618, + "num_tokens": 57971502.0, + "step": 1518 + }, + { + "epoch": 0.19323241317898487, + "ewc_loss": 0.013648279942572117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.824140200478723e-06, + "grad_norm": 16.651042938232422, + "learning_rate": 6.434930055108097e-07, + "loss": 0.5346, + "mean_token_accuracy": 0.8292074203491211, + "num_tokens": 58001557.0, + "step": 1519 + }, + { + "epoch": 0.19335962345757537, + "ewc_loss": 0.013728749938309193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.864374881843105e-06, + "grad_norm": 16.6738338470459, + "learning_rate": 6.439169139465875e-07, + "loss": 0.543, + "mean_token_accuracy": 0.8278023600578308, + "num_tokens": 58038058.0, + "step": 1520 + }, + { + "epoch": 0.19348683373616588, + 
"ewc_loss": 0.013733302243053913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.866651347081643e-06, + "grad_norm": 16.69149398803711, + "learning_rate": 6.443408223823655e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8363334536552429, + "num_tokens": 58073138.0, + "step": 1521 + }, + { + "epoch": 0.1936140440147564, + "ewc_loss": 0.013726480305194855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.863240287202643e-06, + "grad_norm": 16.6411190032959, + "learning_rate": 6.447647308181432e-07, + "loss": 0.5016, + "mean_token_accuracy": 0.8390963673591614, + "num_tokens": 58111793.0, + "step": 1522 + }, + { + "epoch": 0.1937412542933469, + "ewc_loss": 0.01377895288169384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.889476480864687e-06, + "grad_norm": 16.649789810180664, + "learning_rate": 6.451886392539211e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8434104323387146, + "num_tokens": 58152911.0, + "step": 1523 + }, + { + "epoch": 0.1938684645719374, + "ewc_loss": 0.013764017261564732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.882008619868429e-06, + "grad_norm": 16.65031623840332, + "learning_rate": 6.45612547689699e-07, + "loss": 0.5073, + "mean_token_accuracy": 0.8402529954910278, + "num_tokens": 58192095.0, + "step": 1524 + }, + { + "epoch": 0.19399567485052793, + "ewc_loss": 0.013792476616799831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.896238119225018e-06, + "grad_norm": 16.72513198852539, + "learning_rate": 6.460364561254769e-07, + "loss": 0.462, + "mean_token_accuracy": 0.8518518209457397, + "num_tokens": 58232651.0, + "step": 1525 + }, + { + "epoch": 0.19412288512911843, + "ewc_loss": 0.013792254962027073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.896127615618752e-06, + "grad_norm": 16.735153198242188, + "learning_rate": 6.464603645612547e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8541350960731506, + "num_tokens": 58270802.0, + "step": 1526 + }, + { + "epoch": 0.19425009540770893, + "ewc_loss": 0.013791486620903015, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.895743354107253e-06, + "grad_norm": 16.641170501708984, + "learning_rate": 6.468842729970327e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8466290831565857, + "num_tokens": 58313179.0, + "step": 1527 + }, + { + "epoch": 0.19437730568629946, + "ewc_loss": 0.013790125027298927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.895062597322976e-06, + "grad_norm": 16.625919342041016, + "learning_rate": 6.473081814328105e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.8357982039451599, + "num_tokens": 58354728.0, + "step": 1528 + }, + { + "epoch": 0.19450451596488996, + "ewc_loss": 0.013837109319865704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.918554845469771e-06, + "grad_norm": 16.80777359008789, + "learning_rate": 6.477320898685885e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.8391447067260742, + "num_tokens": 58398649.0, + "step": 1529 + }, + { + "epoch": 0.19463172624348046, + "ewc_loss": 0.013842697255313396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.921348813193617e-06, + "grad_norm": 16.651748657226562, + "learning_rate": 6.481559983043662e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8326929807662964, + "num_tokens": 58438261.0, + "step": 1530 + }, + { + "epoch": 0.194758936522071, + "ewc_loss": 0.013826785609126091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.913393008289859e-06, + "grad_norm": 16.785953521728516, + "learning_rate": 6.48579906740144e-07, + "loss": 0.47, + "mean_token_accuracy": 0.8486669063568115, + "num_tokens": 58479141.0, + "step": 1531 + }, + { + "epoch": 0.1948861468006615, + "ewc_loss": 0.013858046382665634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.929023129487177e-06, + "grad_norm": 16.661191940307617, + "learning_rate": 6.49003815175922e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8606119751930237, + "num_tokens": 58515700.0, + "step": 1532 + }, + { + "epoch": 0.195013357079252, + "ewc_loss": 0.01383751817047596, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 6.918759027030319e-06, + "grad_norm": 16.85742950439453, + "learning_rate": 6.494277236116998e-07, + "loss": 0.4451, + "mean_token_accuracy": 0.8584734201431274, + "num_tokens": 58549825.0, + "step": 1533 + }, + { + "epoch": 0.19514056735784252, + "ewc_loss": 0.013898823410272598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.949411726964172e-06, + "grad_norm": 16.653528213500977, + "learning_rate": 6.498516320474777e-07, + "loss": 0.5846, + "mean_token_accuracy": 0.8196127414703369, + "num_tokens": 58596110.0, + "step": 1534 + }, + { + "epoch": 0.19526777763643302, + "ewc_loss": 0.013818170875310898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.909085641382262e-06, + "grad_norm": 16.693620681762695, + "learning_rate": 6.502755404832556e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.8392581939697266, + "num_tokens": 58632410.0, + "step": 1535 + }, + { + "epoch": 0.19539498791502352, + "ewc_loss": 0.013889347203075886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.944673714315286e-06, + "grad_norm": 16.780323028564453, + "learning_rate": 6.506994489190335e-07, + "loss": 0.5581, + "mean_token_accuracy": 0.8294873237609863, + "num_tokens": 58671339.0, + "step": 1536 + }, + { + "epoch": 0.19552219819361405, + "ewc_loss": 0.01390053704380989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.950268470973242e-06, + "grad_norm": 16.697067260742188, + "learning_rate": 6.511233573548114e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8542627096176147, + "num_tokens": 58711786.0, + "step": 1537 + }, + { + "epoch": 0.19564940847220455, + "ewc_loss": 0.013885407708585262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.942703748791246e-06, + "grad_norm": 16.734237670898438, + "learning_rate": 6.515472657905892e-07, + "loss": 0.4999, + "mean_token_accuracy": 0.8401490449905396, + "num_tokens": 58743600.0, + "step": 1538 + }, + { + "epoch": 0.19577661875079505, + "ewc_loss": 0.01393337082117796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
6.966685305087594e-06, + "grad_norm": 16.74696159362793, + "learning_rate": 6.51971174226367e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8239984512329102, + "num_tokens": 58780622.0, + "step": 1539 + }, + { + "epoch": 0.19590382902938558, + "ewc_loss": 0.013950771652162075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.975385986152105e-06, + "grad_norm": 16.728763580322266, + "learning_rate": 6.52395082662145e-07, + "loss": 0.552, + "mean_token_accuracy": 0.8254716396331787, + "num_tokens": 58819370.0, + "step": 1540 + }, + { + "epoch": 0.19603103930797608, + "ewc_loss": 0.013960053212940693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.980026682867901e-06, + "grad_norm": 16.78314781188965, + "learning_rate": 6.528189910979228e-07, + "loss": 0.474, + "mean_token_accuracy": 0.8487945795059204, + "num_tokens": 58856270.0, + "step": 1541 + }, + { + "epoch": 0.1961582495865666, + "ewc_loss": 0.013953049667179585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.976524673518725e-06, + "grad_norm": 16.706998825073242, + "learning_rate": 6.532428995337007e-07, + "loss": 0.491, + "mean_token_accuracy": 0.8441766500473022, + "num_tokens": 58898702.0, + "step": 1542 + }, + { + "epoch": 0.1962854598651571, + "ewc_loss": 0.013940422795712948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.970211416046368e-06, + "grad_norm": 16.844493865966797, + "learning_rate": 6.536668079694786e-07, + "loss": 0.513, + "mean_token_accuracy": 0.8349345922470093, + "num_tokens": 58934602.0, + "step": 1543 + }, + { + "epoch": 0.1964126701437476, + "ewc_loss": 0.01399571355432272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.997856871748809e-06, + "grad_norm": 16.75352668762207, + "learning_rate": 6.540907164052565e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8417115807533264, + "num_tokens": 58977766.0, + "step": 1544 + }, + { + "epoch": 0.19653988042233814, + "ewc_loss": 0.01394781656563282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 6.973908057261724e-06, + 
"grad_norm": 16.703529357910156, + "learning_rate": 6.545146248410343e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.8376193046569824, + "num_tokens": 59019649.0, + "step": 1545 + }, + { + "epoch": 0.19666709070092864, + "ewc_loss": 0.014012755826115608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.006377927609719e-06, + "grad_norm": 16.802080154418945, + "learning_rate": 6.549385332768122e-07, + "loss": 0.5211, + "mean_token_accuracy": 0.8367273807525635, + "num_tokens": 59060543.0, + "step": 1546 + }, + { + "epoch": 0.19679430097951914, + "ewc_loss": 0.014026365242898464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.013182766968384e-06, + "grad_norm": 16.887760162353516, + "learning_rate": 6.5536244171259e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8341450691223145, + "num_tokens": 59100049.0, + "step": 1547 + }, + { + "epoch": 0.19692151125810967, + "ewc_loss": 0.014016012661159039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.008006377873244e-06, + "grad_norm": 16.751630783081055, + "learning_rate": 6.55786350148368e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8323144912719727, + "num_tokens": 59137370.0, + "step": 1548 + }, + { + "epoch": 0.19704872153670017, + "ewc_loss": 0.01402260735630989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0113037509145215e-06, + "grad_norm": 16.8979549407959, + "learning_rate": 6.562102585841458e-07, + "loss": 0.4791, + "mean_token_accuracy": 0.8442329168319702, + "num_tokens": 59176196.0, + "step": 1549 + }, + { + "epoch": 0.19717593181529067, + "ewc_loss": 0.014079472981393337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.039736374281347e-06, + "grad_norm": 16.82278823852539, + "learning_rate": 6.566341670199236e-07, + "loss": 0.4778, + "mean_token_accuracy": 0.8478692770004272, + "num_tokens": 59216215.0, + "step": 1550 + }, + { + "epoch": 0.1973031420938812, + "ewc_loss": 0.014019603841006756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0098017204145435e-06, + "grad_norm": 
16.864086151123047, + "learning_rate": 6.570580754557016e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.842179536819458, + "num_tokens": 59254173.0, + "step": 1551 + }, + { + "epoch": 0.1974303523724717, + "ewc_loss": 0.014075635001063347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.037817340460606e-06, + "grad_norm": 16.907384872436523, + "learning_rate": 6.574819838914794e-07, + "loss": 0.4656, + "mean_token_accuracy": 0.8509421944618225, + "num_tokens": 59288496.0, + "step": 1552 + }, + { + "epoch": 0.1975575626510622, + "ewc_loss": 0.014044986106455326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.022493264230434e-06, + "grad_norm": 16.8911190032959, + "learning_rate": 6.579058923272573e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8321027755737305, + "num_tokens": 59329043.0, + "step": 1553 + }, + { + "epoch": 0.19768477292965272, + "ewc_loss": 0.014047646895051003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.023823400231777e-06, + "grad_norm": 16.882863998413086, + "learning_rate": 6.583298007630351e-07, + "loss": 0.5167, + "mean_token_accuracy": 0.8380938172340393, + "num_tokens": 59360743.0, + "step": 1554 + }, + { + "epoch": 0.19781198320824323, + "ewc_loss": 0.014094993472099304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.047496637824224e-06, + "grad_norm": 16.835533142089844, + "learning_rate": 6.58753709198813e-07, + "loss": 0.5118, + "mean_token_accuracy": 0.8373605012893677, + "num_tokens": 59404167.0, + "step": 1555 + }, + { + "epoch": 0.19793919348683373, + "ewc_loss": 0.014080091379582882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.04004560247995e-06, + "grad_norm": 16.905200958251953, + "learning_rate": 6.591776176345909e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8441462516784668, + "num_tokens": 59447770.0, + "step": 1556 + }, + { + "epoch": 0.19806640376542425, + "ewc_loss": 0.014121843501925468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0609216891170945e-06, + "grad_norm": 16.870193481445312, + 
"learning_rate": 6.596015260703688e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.8479980826377869, + "num_tokens": 59494989.0, + "step": 1557 + }, + { + "epoch": 0.19819361404401475, + "ewc_loss": 0.014093484729528427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.046742211969104e-06, + "grad_norm": 16.826318740844727, + "learning_rate": 6.600254345061466e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.849987804889679, + "num_tokens": 59534926.0, + "step": 1558 + }, + { + "epoch": 0.19832082432260525, + "ewc_loss": 0.01409017015248537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.045085112622473e-06, + "grad_norm": 16.813058853149414, + "learning_rate": 6.604493429419246e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8331037759780884, + "num_tokens": 59577832.0, + "step": 1559 + }, + { + "epoch": 0.19844803460119578, + "ewc_loss": 0.01412199903279543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.060999450914096e-06, + "grad_norm": 16.916826248168945, + "learning_rate": 6.608732513777023e-07, + "loss": 0.5388, + "mean_token_accuracy": 0.8376680016517639, + "num_tokens": 59624995.0, + "step": 1560 + }, + { + "epoch": 0.19857524487978628, + "ewc_loss": 0.014133116230368614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066558282531332e-06, + "grad_norm": 16.80984115600586, + "learning_rate": 6.612971598134803e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8426551818847656, + "num_tokens": 59663000.0, + "step": 1561 + }, + { + "epoch": 0.19870245515837678, + "ewc_loss": 0.014104325324296951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.05216280039167e-06, + "grad_norm": 16.901750564575195, + "learning_rate": 6.617210682492581e-07, + "loss": 0.5482, + "mean_token_accuracy": 0.8268560171127319, + "num_tokens": 59700241.0, + "step": 1562 + }, + { + "epoch": 0.1988296654369673, + "ewc_loss": 0.014173975214362144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.086987807269907e-06, + "grad_norm": 16.852935791015625, + "learning_rate": 
6.62144976685036e-07, + "loss": 0.4643, + "mean_token_accuracy": 0.8543477654457092, + "num_tokens": 59737244.0, + "step": 1563 + }, + { + "epoch": 0.1989568757155578, + "ewc_loss": 0.014133545570075512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.066772923280951e-06, + "grad_norm": 16.87872314453125, + "learning_rate": 6.625688851208139e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8374598026275635, + "num_tokens": 59775538.0, + "step": 1564 + }, + { + "epoch": 0.19908408599414834, + "ewc_loss": 0.014204775914549828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.102387826307677e-06, + "grad_norm": 16.87604522705078, + "learning_rate": 6.629927935565918e-07, + "loss": 0.5084, + "mean_token_accuracy": 0.8368881940841675, + "num_tokens": 59810373.0, + "step": 1565 + }, + { + "epoch": 0.19921129627273884, + "ewc_loss": 0.01417367160320282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.0868359216547105e-06, + "grad_norm": 16.94093894958496, + "learning_rate": 6.634167019923696e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8332647085189819, + "num_tokens": 59849992.0, + "step": 1566 + }, + { + "epoch": 0.19933850655132934, + "ewc_loss": 0.014179602265357971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.089801329129841e-06, + "grad_norm": 16.866025924682617, + "learning_rate": 6.638406104281476e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8393279314041138, + "num_tokens": 59882552.0, + "step": 1567 + }, + { + "epoch": 0.19946571682991987, + "ewc_loss": 0.014199662022292614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.099830781953642e-06, + "grad_norm": 16.902132034301758, + "learning_rate": 6.642645188639253e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8466941118240356, + "num_tokens": 59925090.0, + "step": 1568 + }, + { + "epoch": 0.19959292710851037, + "ewc_loss": 0.01421351544559002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.106757948349696e-06, + "grad_norm": 16.968141555786133, + "learning_rate": 6.646884272997032e-07, + 
"loss": 0.5267, + "mean_token_accuracy": 0.8330914378166199, + "num_tokens": 59965406.0, + "step": 1569 + }, + { + "epoch": 0.19972013738710087, + "ewc_loss": 0.014260947704315186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1304739321931265e-06, + "grad_norm": 16.930524826049805, + "learning_rate": 6.651123357354811e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8380403518676758, + "num_tokens": 60001770.0, + "step": 1570 + }, + { + "epoch": 0.1998473476656914, + "ewc_loss": 0.014246628619730473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.12331438990077e-06, + "grad_norm": 16.973302841186523, + "learning_rate": 6.655362441712589e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8316287398338318, + "num_tokens": 60036911.0, + "step": 1571 + }, + { + "epoch": 0.1999745579442819, + "ewc_loss": 0.014262269251048565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1311346800939646e-06, + "grad_norm": 16.91572380065918, + "learning_rate": 6.659601526070369e-07, + "loss": 0.5565, + "mean_token_accuracy": 0.8281443119049072, + "num_tokens": 60080210.0, + "step": 1572 + }, + { + "epoch": 0.2001017682228724, + "ewc_loss": 0.01425626128911972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.128130619094009e-06, + "grad_norm": 16.967620849609375, + "learning_rate": 6.663840610428147e-07, + "loss": 0.5922, + "mean_token_accuracy": 0.820456862449646, + "num_tokens": 60111533.0, + "step": 1573 + }, + { + "epoch": 0.20022897850146293, + "ewc_loss": 0.014320676214993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.160338100220542e-06, + "grad_norm": 16.949108123779297, + "learning_rate": 6.668079694785926e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8524167537689209, + "num_tokens": 60150733.0, + "step": 1574 + }, + { + "epoch": 0.20035618878005343, + "ewc_loss": 0.014308867044746876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.1544336606166326e-06, + "grad_norm": 17.009645462036133, + "learning_rate": 6.672318779143704e-07, + "loss": 0.4863, + 
"mean_token_accuracy": 0.8457736968994141, + "num_tokens": 60185034.0, + "step": 1575 + }, + { + "epoch": 0.20048339905864393, + "ewc_loss": 0.014345254749059677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172627192630898e-06, + "grad_norm": 17.035253524780273, + "learning_rate": 6.676557863501483e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8367956876754761, + "num_tokens": 60216865.0, + "step": 1576 + }, + { + "epoch": 0.20061060933723446, + "ewc_loss": 0.01432857010513544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.164284852478886e-06, + "grad_norm": 16.90959930419922, + "learning_rate": 6.680796947859262e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8387616872787476, + "num_tokens": 60257685.0, + "step": 1577 + }, + { + "epoch": 0.20073781961582496, + "ewc_loss": 0.014327479526400566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.163739610405173e-06, + "grad_norm": 17.033414840698242, + "learning_rate": 6.685036032217041e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8420554399490356, + "num_tokens": 60294258.0, + "step": 1578 + }, + { + "epoch": 0.20086502989441546, + "ewc_loss": 0.014375189319252968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.187594746937975e-06, + "grad_norm": 16.962566375732422, + "learning_rate": 6.689275116574819e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8473931550979614, + "num_tokens": 60330319.0, + "step": 1579 + }, + { + "epoch": 0.200992240173006, + "ewc_loss": 0.014344881288707256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.172440746217035e-06, + "grad_norm": 16.956600189208984, + "learning_rate": 6.693514200932599e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8422061204910278, + "num_tokens": 60370482.0, + "step": 1580 + }, + { + "epoch": 0.2011194504515965, + "ewc_loss": 0.01442711241543293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2135562732000835e-06, + "grad_norm": 17.025161743164062, + "learning_rate": 6.697753285290377e-07, + "loss": 0.5141, + "mean_token_accuracy": 
0.8364978432655334, + "num_tokens": 60412436.0, + "step": 1581 + }, + { + "epoch": 0.201246660730187, + "ewc_loss": 0.014409787952899933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.204893790913047e-06, + "grad_norm": 16.92348861694336, + "learning_rate": 6.701992369648156e-07, + "loss": 0.4365, + "mean_token_accuracy": 0.8609610199928284, + "num_tokens": 60448982.0, + "step": 1582 + }, + { + "epoch": 0.20137387100877752, + "ewc_loss": 0.014429135248064995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.214567631308455e-06, + "grad_norm": 17.037853240966797, + "learning_rate": 6.706231454005934e-07, + "loss": 0.4544, + "mean_token_accuracy": 0.8539556860923767, + "num_tokens": 60489878.0, + "step": 1583 + }, + { + "epoch": 0.20150108128736802, + "ewc_loss": 0.014463474042713642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.231737072288524e-06, + "grad_norm": 17.02857208251953, + "learning_rate": 6.710470538363713e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.847475528717041, + "num_tokens": 60528979.0, + "step": 1584 + }, + { + "epoch": 0.20162829156595852, + "ewc_loss": 0.01442642044275999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.213210210466059e-06, + "grad_norm": 17.022897720336914, + "learning_rate": 6.714709622721492e-07, + "loss": 0.5172, + "mean_token_accuracy": 0.8358898162841797, + "num_tokens": 60561670.0, + "step": 1585 + }, + { + "epoch": 0.20175550184454905, + "ewc_loss": 0.01448056846857071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2402840487484355e-06, + "grad_norm": 16.96489143371582, + "learning_rate": 6.718948707079271e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8417191505432129, + "num_tokens": 60602061.0, + "step": 1586 + }, + { + "epoch": 0.20188271212313955, + "ewc_loss": 0.014484153129160404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.24207666280563e-06, + "grad_norm": 17.1731014251709, + "learning_rate": 6.723187791437049e-07, + "loss": 0.4958, + "mean_token_accuracy": 0.8417057991027832, + 
"num_tokens": 60639805.0, + "step": 1587 + }, + { + "epoch": 0.20200992240173005, + "ewc_loss": 0.014503194019198418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.251596798596438e-06, + "grad_norm": 16.988544464111328, + "learning_rate": 6.727426875794829e-07, + "loss": 0.5224, + "mean_token_accuracy": 0.8359718322753906, + "num_tokens": 60681816.0, + "step": 1588 + }, + { + "epoch": 0.20213713268032057, + "ewc_loss": 0.014438003301620483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.219001417979598e-06, + "grad_norm": 17.020713806152344, + "learning_rate": 6.731665960152607e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8366421461105347, + "num_tokens": 60726343.0, + "step": 1589 + }, + { + "epoch": 0.20226434295891108, + "ewc_loss": 0.014524195343255997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.262097824423108e-06, + "grad_norm": 17.18193244934082, + "learning_rate": 6.735905044510385e-07, + "loss": 0.5505, + "mean_token_accuracy": 0.8213057518005371, + "num_tokens": 60764814.0, + "step": 1590 + }, + { + "epoch": 0.2023915532375016, + "ewc_loss": 0.01450987160205841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.254936008393997e-06, + "grad_norm": 17.087675094604492, + "learning_rate": 6.740144128868164e-07, + "loss": 0.5182, + "mean_token_accuracy": 0.8367283344268799, + "num_tokens": 60802384.0, + "step": 1591 + }, + { + "epoch": 0.2025187635160921, + "ewc_loss": 0.014467381872236729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2336911216552835e-06, + "grad_norm": 17.04964256286621, + "learning_rate": 6.744383213225942e-07, + "loss": 0.5456, + "mean_token_accuracy": 0.8269600868225098, + "num_tokens": 60842085.0, + "step": 1592 + }, + { + "epoch": 0.2026459737946826, + "ewc_loss": 0.014495529234409332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.247764642670518e-06, + "grad_norm": 17.186922073364258, + "learning_rate": 6.748622297583722e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8428905606269836, + "num_tokens": 
60874835.0, + "step": 1593 + }, + { + "epoch": 0.20277318407327313, + "ewc_loss": 0.0144985131919384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.249256668728776e-06, + "grad_norm": 17.06734848022461, + "learning_rate": 6.7528613819415e-07, + "loss": 0.5081, + "mean_token_accuracy": 0.8377978205680847, + "num_tokens": 60910682.0, + "step": 1594 + }, + { + "epoch": 0.20290039435186363, + "ewc_loss": 0.014495531097054482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2477655521652196e-06, + "grad_norm": 17.096174240112305, + "learning_rate": 6.757100466299279e-07, + "loss": 0.5029, + "mean_token_accuracy": 0.8431437015533447, + "num_tokens": 60951352.0, + "step": 1595 + }, + { + "epoch": 0.20302760463045413, + "ewc_loss": 0.014498245902359486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.249122973007616e-06, + "grad_norm": 17.10371971130371, + "learning_rate": 6.761339550657058e-07, + "loss": 0.5321, + "mean_token_accuracy": 0.8298746943473816, + "num_tokens": 60979055.0, + "step": 1596 + }, + { + "epoch": 0.20315481490904466, + "ewc_loss": 0.014546504244208336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.273252322193002e-06, + "grad_norm": 17.100921630859375, + "learning_rate": 6.765578635014837e-07, + "loss": 0.5091, + "mean_token_accuracy": 0.841315746307373, + "num_tokens": 61017437.0, + "step": 1597 + }, + { + "epoch": 0.20328202518763516, + "ewc_loss": 0.014526051469147205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.2630259637662675e-06, + "grad_norm": 17.035995483398438, + "learning_rate": 6.769817719372614e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8452205657958984, + "num_tokens": 61052286.0, + "step": 1598 + }, + { + "epoch": 0.20340923546622566, + "ewc_loss": 0.014561623334884644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.280811587406788e-06, + "grad_norm": 17.13001823425293, + "learning_rate": 6.774056803730394e-07, + "loss": 0.5306, + "mean_token_accuracy": 0.8315691947937012, + "num_tokens": 61086110.0, + "step": 
1599 + }, + { + "epoch": 0.2035364457448162, + "ewc_loss": 0.014576632529497147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.288316282938467e-06, + "grad_norm": 17.061735153198242, + "learning_rate": 6.778295888088172e-07, + "loss": 0.5022, + "mean_token_accuracy": 0.8404970765113831, + "num_tokens": 61124181.0, + "step": 1600 + }, + { + "epoch": 0.2036636560234067, + "ewc_loss": 0.014581824652850628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.290912435564678e-06, + "grad_norm": 17.130340576171875, + "learning_rate": 6.782534972445952e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8326674103736877, + "num_tokens": 61160881.0, + "step": 1601 + }, + { + "epoch": 0.2037908663019972, + "ewc_loss": 0.014638293534517288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.319146789086517e-06, + "grad_norm": 17.098508834838867, + "learning_rate": 6.78677405680373e-07, + "loss": 0.5203, + "mean_token_accuracy": 0.8378476500511169, + "num_tokens": 61200755.0, + "step": 1602 + }, + { + "epoch": 0.20391807658058772, + "ewc_loss": 0.014621060341596603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.31053023628192e-06, + "grad_norm": 17.050872802734375, + "learning_rate": 6.791013141161509e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8364199995994568, + "num_tokens": 61240045.0, + "step": 1603 + }, + { + "epoch": 0.20404528685917822, + "ewc_loss": 0.014665679074823856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.332839686569059e-06, + "grad_norm": 17.16219711303711, + "learning_rate": 6.795252225519288e-07, + "loss": 0.5364, + "mean_token_accuracy": 0.8292578458786011, + "num_tokens": 61282832.0, + "step": 1604 + }, + { + "epoch": 0.20417249713776872, + "ewc_loss": 0.014639412984251976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.319706583075458e-06, + "grad_norm": 17.024978637695312, + "learning_rate": 6.799491309877067e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.8369139432907104, + "num_tokens": 61319841.0, + "step": 1605 + }, + { + "epoch": 
0.20429970741635925, + "ewc_loss": 0.014629291370511055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.314645699807443e-06, + "grad_norm": 17.12000274658203, + "learning_rate": 6.803730394234844e-07, + "loss": 0.4793, + "mean_token_accuracy": 0.8484736680984497, + "num_tokens": 61352732.0, + "step": 1606 + }, + { + "epoch": 0.20442691769494975, + "ewc_loss": 0.014690498821437359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.3452492870274e-06, + "grad_norm": 17.13084602355957, + "learning_rate": 6.807969478592624e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8400040864944458, + "num_tokens": 61388324.0, + "step": 1607 + }, + { + "epoch": 0.20455412797354025, + "ewc_loss": 0.014695028774440289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.3475143835821655e-06, + "grad_norm": 17.20317840576172, + "learning_rate": 6.812208562950402e-07, + "loss": 0.5356, + "mean_token_accuracy": 0.8362952470779419, + "num_tokens": 61431222.0, + "step": 1608 + }, + { + "epoch": 0.20468133825213078, + "ewc_loss": 0.014700688421726227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.350344276346732e-06, + "grad_norm": 17.149656295776367, + "learning_rate": 6.816447647308182e-07, + "loss": 0.5668, + "mean_token_accuracy": 0.8223052024841309, + "num_tokens": 61475617.0, + "step": 1609 + }, + { + "epoch": 0.20480854853072128, + "ewc_loss": 0.014681504108011723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.340751835727133e-06, + "grad_norm": 17.18368148803711, + "learning_rate": 6.82068673166596e-07, + "loss": 0.4912, + "mean_token_accuracy": 0.8457914590835571, + "num_tokens": 61511338.0, + "step": 1610 + }, + { + "epoch": 0.20493575880931178, + "ewc_loss": 0.014747913926839828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.373957032541512e-06, + "grad_norm": 17.285892486572266, + "learning_rate": 6.824925816023738e-07, + "loss": 0.5139, + "mean_token_accuracy": 0.8354668617248535, + "num_tokens": 61549093.0, + "step": 1611 + }, + { + "epoch": 0.2050629690879023, + 
"ewc_loss": 0.014683817513287067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.341908712987788e-06, + "grad_norm": 17.08258056640625, + "learning_rate": 6.829164900381518e-07, + "loss": 0.4805, + "mean_token_accuracy": 0.8458938598632812, + "num_tokens": 61593843.0, + "step": 1612 + }, + { + "epoch": 0.2051901793664928, + "ewc_loss": 0.01469105388969183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.345527137658792e-06, + "grad_norm": 17.268970489501953, + "learning_rate": 6.833403984739295e-07, + "loss": 0.4931, + "mean_token_accuracy": 0.841812014579773, + "num_tokens": 61629362.0, + "step": 1613 + }, + { + "epoch": 0.2053173896450833, + "ewc_loss": 0.014756962656974792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.378481313935481e-06, + "grad_norm": 17.15102195739746, + "learning_rate": 6.837643069097074e-07, + "loss": 0.5736, + "mean_token_accuracy": 0.8199945688247681, + "num_tokens": 61667037.0, + "step": 1614 + }, + { + "epoch": 0.20544459992367384, + "ewc_loss": 0.014719505794346333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.359752999036573e-06, + "grad_norm": 17.21825408935547, + "learning_rate": 6.841882153454853e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8422516584396362, + "num_tokens": 61701976.0, + "step": 1615 + }, + { + "epoch": 0.20557181020226434, + "ewc_loss": 0.014765286818146706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.382643616438145e-06, + "grad_norm": 17.157386779785156, + "learning_rate": 6.846121237812632e-07, + "loss": 0.4605, + "mean_token_accuracy": 0.8538377285003662, + "num_tokens": 61739068.0, + "step": 1616 + }, + { + "epoch": 0.20569902048085487, + "ewc_loss": 0.014730005525052547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.365002602455206e-06, + "grad_norm": 17.16922950744629, + "learning_rate": 6.850360322170411e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8498937487602234, + "num_tokens": 61781515.0, + "step": 1617 + }, + { + "epoch": 0.20582623075944537, + "ewc_loss": 
0.0147892776876688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.394638942059828e-06, + "grad_norm": 17.246122360229492, + "learning_rate": 6.85459940652819e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8416151404380798, + "num_tokens": 61815201.0, + "step": 1618 + }, + { + "epoch": 0.20595344103803587, + "ewc_loss": 0.01477795373648405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.388976882793941e-06, + "grad_norm": 17.28618812561035, + "learning_rate": 6.858838490885968e-07, + "loss": 0.5463, + "mean_token_accuracy": 0.8271937370300293, + "num_tokens": 61856639.0, + "step": 1619 + }, + { + "epoch": 0.2060806513166264, + "ewc_loss": 0.01480855606496334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.404278221656568e-06, + "grad_norm": 17.226774215698242, + "learning_rate": 6.863077575243748e-07, + "loss": 0.5594, + "mean_token_accuracy": 0.8277428150177002, + "num_tokens": 61894313.0, + "step": 1620 + }, + { + "epoch": 0.2062078615952169, + "ewc_loss": 0.01478141825646162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.390709015453467e-06, + "grad_norm": 17.20486068725586, + "learning_rate": 6.867316659601525e-07, + "loss": 0.4944, + "mean_token_accuracy": 0.8402910232543945, + "num_tokens": 61930748.0, + "step": 1621 + }, + { + "epoch": 0.2063350718738074, + "ewc_loss": 0.014846055768430233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.423027909680968e-06, + "grad_norm": 17.277645111083984, + "learning_rate": 6.871555743959304e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8407424688339233, + "num_tokens": 61972002.0, + "step": 1622 + }, + { + "epoch": 0.20646228215239792, + "ewc_loss": 0.014840065501630306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.42003294362803e-06, + "grad_norm": 17.252897262573242, + "learning_rate": 6.875794828317083e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8469146490097046, + "num_tokens": 62010141.0, + "step": 1623 + }, + { + "epoch": 0.20658949243098843, + "ewc_loss": 0.014840641058981419, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.42032034395379e-06, + "grad_norm": 17.230348587036133, + "learning_rate": 6.880033912674862e-07, + "loss": 0.4736, + "mean_token_accuracy": 0.8519755601882935, + "num_tokens": 62055907.0, + "step": 1624 + }, + { + "epoch": 0.20671670270957893, + "ewc_loss": 0.014853758737444878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.426879164995626e-06, + "grad_norm": 17.293405532836914, + "learning_rate": 6.884272997032641e-07, + "loss": 0.58, + "mean_token_accuracy": 0.8182679414749146, + "num_tokens": 62095885.0, + "step": 1625 + }, + { + "epoch": 0.20684391298816945, + "ewc_loss": 0.014877942390739918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.438971351803048e-06, + "grad_norm": 17.250953674316406, + "learning_rate": 6.88851208139042e-07, + "loss": 0.4722, + "mean_token_accuracy": 0.8514495491981506, + "num_tokens": 62130497.0, + "step": 1626 + }, + { + "epoch": 0.20697112326675995, + "ewc_loss": 0.014881863258779049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.440931767632719e-06, + "grad_norm": 17.358625411987305, + "learning_rate": 6.892751165748198e-07, + "loss": 0.534, + "mean_token_accuracy": 0.8304533958435059, + "num_tokens": 62170296.0, + "step": 1627 + }, + { + "epoch": 0.20709833354535045, + "ewc_loss": 0.014881998300552368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.44099907024065e-06, + "grad_norm": 17.21115493774414, + "learning_rate": 6.896990250105978e-07, + "loss": 0.4675, + "mean_token_accuracy": 0.8549338579177856, + "num_tokens": 62210168.0, + "step": 1628 + }, + { + "epoch": 0.20722554382394098, + "ewc_loss": 0.014863638207316399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.431819085468305e-06, + "grad_norm": 17.326066970825195, + "learning_rate": 6.901229334463755e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8404760360717773, + "num_tokens": 62245251.0, + "step": 1629 + }, + { + "epoch": 0.20735275410253148, + "ewc_loss": 0.014930706471204758, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.465353064617375e-06, + "grad_norm": 17.34263038635254, + "learning_rate": 6.905468418821534e-07, + "loss": 0.4935, + "mean_token_accuracy": 0.8475821018218994, + "num_tokens": 62286124.0, + "step": 1630 + }, + { + "epoch": 0.20747996438112198, + "ewc_loss": 0.014873028732836246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.436514351866208e-06, + "grad_norm": 17.3066463470459, + "learning_rate": 6.909707503179313e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.838469386100769, + "num_tokens": 62325604.0, + "step": 1631 + }, + { + "epoch": 0.2076071746597125, + "ewc_loss": 0.014882019720971584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.441009984177072e-06, + "grad_norm": 17.345399856567383, + "learning_rate": 6.913946587537091e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.842986524105072, + "num_tokens": 62364467.0, + "step": 1632 + }, + { + "epoch": 0.207734384938303, + "ewc_loss": 0.014897956512868404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.448978067259304e-06, + "grad_norm": 17.331085205078125, + "learning_rate": 6.918185671894871e-07, + "loss": 0.5665, + "mean_token_accuracy": 0.8217884302139282, + "num_tokens": 62402075.0, + "step": 1633 + }, + { + "epoch": 0.2078615952168935, + "ewc_loss": 0.01492755301296711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.4637764555518515e-06, + "grad_norm": 17.49025535583496, + "learning_rate": 6.922424756252649e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8375726938247681, + "num_tokens": 62437802.0, + "step": 1634 + }, + { + "epoch": 0.20798880549548404, + "ewc_loss": 0.014895005151629448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.447502412105678e-06, + "grad_norm": 17.265422821044922, + "learning_rate": 6.926663840610428e-07, + "loss": 0.4517, + "mean_token_accuracy": 0.8541181087493896, + "num_tokens": 62476176.0, + "step": 1635 + }, + { + "epoch": 0.20811601577407454, + "ewc_loss": 0.0148903988301754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.445199571520789e-06, + "grad_norm": 17.347721099853516, + "learning_rate": 6.930902924968206e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8383846282958984, + "num_tokens": 62514378.0, + "step": 1636 + }, + { + "epoch": 0.20824322605266504, + "ewc_loss": 0.01494026929140091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.470134733011946e-06, + "grad_norm": 17.420795440673828, + "learning_rate": 6.935142009325985e-07, + "loss": 0.52, + "mean_token_accuracy": 0.835720956325531, + "num_tokens": 62555302.0, + "step": 1637 + }, + { + "epoch": 0.20837043633125557, + "ewc_loss": 0.014906631782650948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.45331590223941e-06, + "grad_norm": 17.32222557067871, + "learning_rate": 6.939381093683764e-07, + "loss": 0.5386, + "mean_token_accuracy": 0.8310578465461731, + "num_tokens": 62594356.0, + "step": 1638 + }, + { + "epoch": 0.20849764660984607, + "ewc_loss": 0.01489986665546894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.449933491443517e-06, + "grad_norm": 17.340951919555664, + "learning_rate": 6.943620178041543e-07, + "loss": 0.4553, + "mean_token_accuracy": 0.8552827835083008, + "num_tokens": 62632903.0, + "step": 1639 + }, + { + "epoch": 0.2086248568884366, + "ewc_loss": 0.01493591908365488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.467959676432656e-06, + "grad_norm": 17.375640869140625, + "learning_rate": 6.947859262399321e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8465467691421509, + "num_tokens": 62668979.0, + "step": 1640 + }, + { + "epoch": 0.2087520671670271, + "ewc_loss": 0.014979889616370201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.489944891858613e-06, + "grad_norm": 17.44226837158203, + "learning_rate": 6.952098346757101e-07, + "loss": 0.4918, + "mean_token_accuracy": 0.8433420658111572, + "num_tokens": 62705867.0, + "step": 1641 + }, + { + "epoch": 0.2088792774456176, + "ewc_loss": 0.014946427196264267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.473213827324798e-06, + 
"grad_norm": 17.33669090270996, + "learning_rate": 6.956337431114879e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8465868830680847, + "num_tokens": 62744559.0, + "step": 1642 + }, + { + "epoch": 0.20900648772420813, + "ewc_loss": 0.014975829049944878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487914444936905e-06, + "grad_norm": 17.320220947265625, + "learning_rate": 6.960576515472658e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8365030288696289, + "num_tokens": 62785221.0, + "step": 1643 + }, + { + "epoch": 0.20913369800279863, + "ewc_loss": 0.014997297897934914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.49864875615458e-06, + "grad_norm": 17.3260440826416, + "learning_rate": 6.964815599830436e-07, + "loss": 0.5085, + "mean_token_accuracy": 0.8381925821304321, + "num_tokens": 62820095.0, + "step": 1644 + }, + { + "epoch": 0.20926090828138913, + "ewc_loss": 0.015023091807961464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.511545845773071e-06, + "grad_norm": 17.385770797729492, + "learning_rate": 6.969054684188215e-07, + "loss": 0.5605, + "mean_token_accuracy": 0.8207981586456299, + "num_tokens": 62863414.0, + "step": 1645 + }, + { + "epoch": 0.20938811855997966, + "ewc_loss": 0.014975406229496002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.487702987418743e-06, + "grad_norm": 17.28390121459961, + "learning_rate": 6.973293768545994e-07, + "loss": 0.4696, + "mean_token_accuracy": 0.850744903087616, + "num_tokens": 62904647.0, + "step": 1646 + }, + { + "epoch": 0.20951532883857016, + "ewc_loss": 0.015032856725156307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.516428468079539e-06, + "grad_norm": 17.369522094726562, + "learning_rate": 6.977532852903773e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8390953540802002, + "num_tokens": 62949476.0, + "step": 1647 + }, + { + "epoch": 0.20964253911716066, + "ewc_loss": 0.015047706663608551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.523853128077462e-06, + "grad_norm": 
17.348670959472656, + "learning_rate": 6.981771937261551e-07, + "loss": 0.5579, + "mean_token_accuracy": 0.8247982859611511, + "num_tokens": 62984784.0, + "step": 1648 + }, + { + "epoch": 0.2097697493957512, + "ewc_loss": 0.015035456977784634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.517728590755723e-06, + "grad_norm": 17.290019989013672, + "learning_rate": 6.986011021619331e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.839195728302002, + "num_tokens": 63024395.0, + "step": 1649 + }, + { + "epoch": 0.2098969596743417, + "ewc_loss": 0.015069478191435337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.534739324910333e-06, + "grad_norm": 17.30343246459961, + "learning_rate": 6.990250105977109e-07, + "loss": 0.5272, + "mean_token_accuracy": 0.8359320163726807, + "num_tokens": 63061886.0, + "step": 1650 + }, + { + "epoch": 0.2100241699529322, + "ewc_loss": 0.015079176984727383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.539588295912836e-06, + "grad_norm": 17.34934425354004, + "learning_rate": 6.994489190334886e-07, + "loss": 0.4956, + "mean_token_accuracy": 0.8411968946456909, + "num_tokens": 63104150.0, + "step": 1651 + }, + { + "epoch": 0.21015138023152272, + "ewc_loss": 0.015131724067032337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.565862233605003e-06, + "grad_norm": 17.296165466308594, + "learning_rate": 6.998728274692666e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.8545820713043213, + "num_tokens": 63144346.0, + "step": 1652 + }, + { + "epoch": 0.21027859051011322, + "ewc_loss": 0.015101986937224865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.550993359473068e-06, + "grad_norm": 17.319772720336914, + "learning_rate": 7.002967359050444e-07, + "loss": 0.5004, + "mean_token_accuracy": 0.8467257022857666, + "num_tokens": 63180042.0, + "step": 1653 + }, + { + "epoch": 0.21040580078870372, + "ewc_loss": 0.015172655694186687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.586327683384297e-06, + "grad_norm": 17.39933204650879, + 
"learning_rate": 7.007206443408224e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8542989492416382, + "num_tokens": 63216570.0, + "step": 1654 + }, + { + "epoch": 0.21053301106729425, + "ewc_loss": 0.015191980637609959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.595990155095933e-06, + "grad_norm": 17.44016456604004, + "learning_rate": 7.011445527766002e-07, + "loss": 0.5012, + "mean_token_accuracy": 0.8415794372558594, + "num_tokens": 63253457.0, + "step": 1655 + }, + { + "epoch": 0.21066022134588475, + "ewc_loss": 0.01516910083591938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.584550530737033e-06, + "grad_norm": 17.410858154296875, + "learning_rate": 7.015684612123781e-07, + "loss": 0.4866, + "mean_token_accuracy": 0.8442777991294861, + "num_tokens": 63295430.0, + "step": 1656 + }, + { + "epoch": 0.21078743162447525, + "ewc_loss": 0.015166640281677246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.583319984405534e-06, + "grad_norm": 17.40546226501465, + "learning_rate": 7.01992369648156e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8414846658706665, + "num_tokens": 63340118.0, + "step": 1657 + }, + { + "epoch": 0.21091464190306577, + "ewc_loss": 0.015180032700300217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.5900161391473375e-06, + "grad_norm": 17.402170181274414, + "learning_rate": 7.024162780839339e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8230464458465576, + "num_tokens": 63372037.0, + "step": 1658 + }, + { + "epoch": 0.21104185218165628, + "ewc_loss": 0.01519988477230072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.599942364322487e-06, + "grad_norm": 17.40353775024414, + "learning_rate": 7.028401865197116e-07, + "loss": 0.5305, + "mean_token_accuracy": 0.8307031393051147, + "num_tokens": 63415508.0, + "step": 1659 + }, + { + "epoch": 0.21116906246024678, + "ewc_loss": 0.015218163840472698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.609081876580603e-06, + "grad_norm": 17.405216217041016, + "learning_rate": 
7.032640949554896e-07, + "loss": 0.4373, + "mean_token_accuracy": 0.85679030418396, + "num_tokens": 63459320.0, + "step": 1660 + }, + { + "epoch": 0.2112962727388373, + "ewc_loss": 0.015229534357786179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.614767127961386e-06, + "grad_norm": 17.47069549560547, + "learning_rate": 7.036880033912674e-07, + "loss": 0.5391, + "mean_token_accuracy": 0.8311471343040466, + "num_tokens": 63499624.0, + "step": 1661 + }, + { + "epoch": 0.2114234830174278, + "ewc_loss": 0.01526650134474039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.633250788785517e-06, + "grad_norm": 17.499406814575195, + "learning_rate": 7.041119118270454e-07, + "loss": 0.4974, + "mean_token_accuracy": 0.8405846953392029, + "num_tokens": 63536507.0, + "step": 1662 + }, + { + "epoch": 0.2115506932960183, + "ewc_loss": 0.015250831842422485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.625415946677094e-06, + "grad_norm": 17.4766845703125, + "learning_rate": 7.045358202628232e-07, + "loss": 0.5368, + "mean_token_accuracy": 0.8324127197265625, + "num_tokens": 63576483.0, + "step": 1663 + }, + { + "epoch": 0.21167790357460883, + "ewc_loss": 0.015236224047839642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.618112249474507e-06, + "grad_norm": 17.475534439086914, + "learning_rate": 7.049597286986011e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.8326698541641235, + "num_tokens": 63615568.0, + "step": 1664 + }, + { + "epoch": 0.21180511385319933, + "ewc_loss": 0.015253123827278614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.626561910001328e-06, + "grad_norm": 17.47675323486328, + "learning_rate": 7.05383637134379e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8448840379714966, + "num_tokens": 63650023.0, + "step": 1665 + }, + { + "epoch": 0.21193232413178986, + "ewc_loss": 0.01529247872531414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.646239282621536e-06, + "grad_norm": 17.482683181762695, + "learning_rate": 7.058075455701568e-07, + 
"loss": 0.5165, + "mean_token_accuracy": 0.8317423462867737, + "num_tokens": 63690229.0, + "step": 1666 + }, + { + "epoch": 0.21205953441038036, + "ewc_loss": 0.015281004831194878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.640502190042753e-06, + "grad_norm": 17.51238441467285, + "learning_rate": 7.062314540059346e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8562771081924438, + "num_tokens": 63728169.0, + "step": 1667 + }, + { + "epoch": 0.21218674468897086, + "ewc_loss": 0.015269150026142597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.634575013071299e-06, + "grad_norm": 17.422956466674805, + "learning_rate": 7.066553624417126e-07, + "loss": 0.4868, + "mean_token_accuracy": 0.8436170816421509, + "num_tokens": 63762816.0, + "step": 1668 + }, + { + "epoch": 0.2123139549675614, + "ewc_loss": 0.015278453938663006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.639227078470867e-06, + "grad_norm": 17.442543029785156, + "learning_rate": 7.070792708774904e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8382577896118164, + "num_tokens": 63804288.0, + "step": 1669 + }, + { + "epoch": 0.2124411652461519, + "ewc_loss": 0.015319393947720528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.65969707572367e-06, + "grad_norm": 17.494544982910156, + "learning_rate": 7.075031793132684e-07, + "loss": 0.5284, + "mean_token_accuracy": 0.8361951112747192, + "num_tokens": 63851365.0, + "step": 1670 + }, + { + "epoch": 0.2125683755247424, + "ewc_loss": 0.01533307321369648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.666536475881003e-06, + "grad_norm": 17.531930923461914, + "learning_rate": 7.079270877490462e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.846738874912262, + "num_tokens": 63886187.0, + "step": 1671 + }, + { + "epoch": 0.21269558580333292, + "ewc_loss": 0.015309727750718594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.654864020878449e-06, + "grad_norm": 17.446229934692383, + "learning_rate": 7.08350996184824e-07, + "loss": 0.5639, + 
"mean_token_accuracy": 0.8238330483436584, + "num_tokens": 63927187.0, + "step": 1672 + }, + { + "epoch": 0.21282279608192342, + "ewc_loss": 0.01534029096364975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.670145350857638e-06, + "grad_norm": 17.474044799804688, + "learning_rate": 7.08774904620602e-07, + "loss": 0.4841, + "mean_token_accuracy": 0.8491066694259644, + "num_tokens": 63964088.0, + "step": 1673 + }, + { + "epoch": 0.21295000636051392, + "ewc_loss": 0.015373479574918747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.686739991186187e-06, + "grad_norm": 17.588388442993164, + "learning_rate": 7.091988130563797e-07, + "loss": 0.5296, + "mean_token_accuracy": 0.8329238891601562, + "num_tokens": 64004931.0, + "step": 1674 + }, + { + "epoch": 0.21307721663910445, + "ewc_loss": 0.015378464013338089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.689232006669044e-06, + "grad_norm": 17.476306915283203, + "learning_rate": 7.096227214921576e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.835330069065094, + "num_tokens": 64043966.0, + "step": 1675 + }, + { + "epoch": 0.21320442691769495, + "ewc_loss": 0.015364986844360828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.682493560423609e-06, + "grad_norm": 17.64870834350586, + "learning_rate": 7.100466299279355e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8413723707199097, + "num_tokens": 64078886.0, + "step": 1676 + }, + { + "epoch": 0.21333163719628545, + "ewc_loss": 0.015401052311062813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.70052611187566e-06, + "grad_norm": 17.5941219329834, + "learning_rate": 7.104705383637134e-07, + "loss": 0.5301, + "mean_token_accuracy": 0.8387600183486938, + "num_tokens": 64117291.0, + "step": 1677 + }, + { + "epoch": 0.21345884747487598, + "ewc_loss": 0.015373055823147297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.686528078920674e-06, + "grad_norm": 17.608163833618164, + "learning_rate": 7.108944467994913e-07, + "loss": 0.4848, + "mean_token_accuracy": 
0.8452866673469543, + "num_tokens": 64156369.0, + "step": 1678 + }, + { + "epoch": 0.21358605775346648, + "ewc_loss": 0.015375511720776558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.687755896768067e-06, + "grad_norm": 17.591266632080078, + "learning_rate": 7.113183552352692e-07, + "loss": 0.5591, + "mean_token_accuracy": 0.8195476531982422, + "num_tokens": 64193646.0, + "step": 1679 + }, + { + "epoch": 0.21371326803205698, + "ewc_loss": 0.015389641746878624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.694820851611439e-06, + "grad_norm": 17.532569885253906, + "learning_rate": 7.11742263671047e-07, + "loss": 0.45, + "mean_token_accuracy": 0.8562209606170654, + "num_tokens": 64231933.0, + "step": 1680 + }, + { + "epoch": 0.2138404783106475, + "ewc_loss": 0.015404786914587021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.7023933044984e-06, + "grad_norm": 17.638893127441406, + "learning_rate": 7.12166172106825e-07, + "loss": 0.5846, + "mean_token_accuracy": 0.8133698105812073, + "num_tokens": 64267418.0, + "step": 1681 + }, + { + "epoch": 0.213967688589238, + "ewc_loss": 0.015459000132977962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.729499884590041e-06, + "grad_norm": 17.646501541137695, + "learning_rate": 7.125900805426027e-07, + "loss": 0.4642, + "mean_token_accuracy": 0.8517873287200928, + "num_tokens": 64302440.0, + "step": 1682 + }, + { + "epoch": 0.2140948988678285, + "ewc_loss": 0.015423484146595001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.711742000537924e-06, + "grad_norm": 17.625835418701172, + "learning_rate": 7.130139889783806e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.840263843536377, + "num_tokens": 64338240.0, + "step": 1683 + }, + { + "epoch": 0.21422210914641904, + "ewc_loss": 0.015447827987372875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.723913768131752e-06, + "grad_norm": 17.571565628051758, + "learning_rate": 7.134378974141585e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.841138482093811, + 
"num_tokens": 64375734.0, + "step": 1684 + }, + { + "epoch": 0.21434931942500954, + "ewc_loss": 0.015433681197464466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.716840627836064e-06, + "grad_norm": 17.649377822875977, + "learning_rate": 7.138618058499364e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.8457057476043701, + "num_tokens": 64411401.0, + "step": 1685 + }, + { + "epoch": 0.21447652970360004, + "ewc_loss": 0.015504641458392143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.752320925646927e-06, + "grad_norm": 17.732141494750977, + "learning_rate": 7.142857142857143e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8533570766448975, + "num_tokens": 64454719.0, + "step": 1686 + }, + { + "epoch": 0.21460373998219057, + "ewc_loss": 0.015464176423847675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.732088306511287e-06, + "grad_norm": 17.585052490234375, + "learning_rate": 7.147096227214922e-07, + "loss": 0.4884, + "mean_token_accuracy": 0.8455693125724792, + "num_tokens": 64495736.0, + "step": 1687 + }, + { + "epoch": 0.21473095026078107, + "ewc_loss": 0.015493524260818958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.746762094029691e-06, + "grad_norm": 17.787338256835938, + "learning_rate": 7.1513353115727e-07, + "loss": 0.4694, + "mean_token_accuracy": 0.8500880002975464, + "num_tokens": 64532843.0, + "step": 1688 + }, + { + "epoch": 0.21485816053937157, + "ewc_loss": 0.0155019611120224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.750980330456514e-06, + "grad_norm": 17.75014877319336, + "learning_rate": 7.155574395930479e-07, + "loss": 0.5163, + "mean_token_accuracy": 0.8328283429145813, + "num_tokens": 64568950.0, + "step": 1689 + }, + { + "epoch": 0.2149853708179621, + "ewc_loss": 0.01545496005564928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.727479896857403e-06, + "grad_norm": 17.766021728515625, + "learning_rate": 7.159813480288257e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8406786322593689, + "num_tokens": 64606878.0, 
+ "step": 1690 + }, + { + "epoch": 0.2151125810965526, + "ewc_loss": 0.015518526546657085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.75926309870556e-06, + "grad_norm": 17.738378524780273, + "learning_rate": 7.164052564646035e-07, + "loss": 0.4573, + "mean_token_accuracy": 0.853635311126709, + "num_tokens": 64643012.0, + "step": 1691 + }, + { + "epoch": 0.21523979137514312, + "ewc_loss": 0.015498274937272072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.749137694190722e-06, + "grad_norm": 17.791248321533203, + "learning_rate": 7.168291649003815e-07, + "loss": 0.4798, + "mean_token_accuracy": 0.8508797883987427, + "num_tokens": 64684444.0, + "step": 1692 + }, + { + "epoch": 0.21536700165373363, + "ewc_loss": 0.01552242785692215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.761213964840863e-06, + "grad_norm": 17.70334243774414, + "learning_rate": 7.172530733361593e-07, + "loss": 0.5313, + "mean_token_accuracy": 0.8327213525772095, + "num_tokens": 64722068.0, + "step": 1693 + }, + { + "epoch": 0.21549421193232413, + "ewc_loss": 0.015497354790568352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.748677489871625e-06, + "grad_norm": 17.682159423828125, + "learning_rate": 7.176769817719373e-07, + "loss": 0.4423, + "mean_token_accuracy": 0.864357054233551, + "num_tokens": 64761829.0, + "step": 1694 + }, + { + "epoch": 0.21562142221091465, + "ewc_loss": 0.015523574315011501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76178694650298e-06, + "grad_norm": 17.70287322998047, + "learning_rate": 7.181008902077151e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8463743925094604, + "num_tokens": 64799185.0, + "step": 1695 + }, + { + "epoch": 0.21574863248950515, + "ewc_loss": 0.015525185503065586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.76259275880875e-06, + "grad_norm": 17.693349838256836, + "learning_rate": 7.18524798643493e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8376146554946899, + "num_tokens": 64839306.0, + "step": 1696 + }, + { + 
"epoch": 0.21587584276809565, + "ewc_loss": 0.015534443780779839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.767222086840775e-06, + "grad_norm": 17.702041625976562, + "learning_rate": 7.189487070792708e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8460215330123901, + "num_tokens": 64876011.0, + "step": 1697 + }, + { + "epoch": 0.21600305304668618, + "ewc_loss": 0.015541428700089455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.770714546495583e-06, + "grad_norm": 17.66830062866211, + "learning_rate": 7.193726155150487e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8464384078979492, + "num_tokens": 64918510.0, + "step": 1698 + }, + { + "epoch": 0.21613026332527668, + "ewc_loss": 0.0155578488484025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.778924555168487e-06, + "grad_norm": 17.682233810424805, + "learning_rate": 7.197965239508265e-07, + "loss": 0.4989, + "mean_token_accuracy": 0.8427059650421143, + "num_tokens": 64957156.0, + "step": 1699 + }, + { + "epoch": 0.21625747360386718, + "ewc_loss": 0.015569654293358326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.784827175782993e-06, + "grad_norm": 17.703609466552734, + "learning_rate": 7.202204323866045e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8445183634757996, + "num_tokens": 64998761.0, + "step": 1700 + }, + { + "epoch": 0.2163846838824577, + "ewc_loss": 0.015583385713398457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.791692951286677e-06, + "grad_norm": 17.753843307495117, + "learning_rate": 7.206443408223823e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.839368462562561, + "num_tokens": 65038849.0, + "step": 1701 + }, + { + "epoch": 0.2165118941610482, + "ewc_loss": 0.015590326860547066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.795163583068643e-06, + "grad_norm": 17.69654655456543, + "learning_rate": 7.210682492581603e-07, + "loss": 0.4812, + "mean_token_accuracy": 0.8448864221572876, + "num_tokens": 65071130.0, + "step": 1702 + }, + { + "epoch": 
0.2166391044396387, + "ewc_loss": 0.01557157002389431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.78578487370396e-06, + "grad_norm": 17.649885177612305, + "learning_rate": 7.214921576939381e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8319618701934814, + "num_tokens": 65111619.0, + "step": 1703 + }, + { + "epoch": 0.21676631471822924, + "ewc_loss": 0.015589891001582146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.794945304340217e-06, + "grad_norm": 17.71210289001465, + "learning_rate": 7.219160661297159e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8497922420501709, + "num_tokens": 65147537.0, + "step": 1704 + }, + { + "epoch": 0.21689352499681974, + "ewc_loss": 0.015658646821975708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.829323294572532e-06, + "grad_norm": 17.726837158203125, + "learning_rate": 7.223399745654938e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8390480279922485, + "num_tokens": 65187254.0, + "step": 1705 + }, + { + "epoch": 0.21702073527541024, + "ewc_loss": 0.015631763264536858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.81588187237503e-06, + "grad_norm": 17.76235008239746, + "learning_rate": 7.227638830012717e-07, + "loss": 0.5171, + "mean_token_accuracy": 0.8330029249191284, + "num_tokens": 65226130.0, + "step": 1706 + }, + { + "epoch": 0.21714794555400077, + "ewc_loss": 0.015657294541597366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.828647540009115e-06, + "grad_norm": 17.755020141601562, + "learning_rate": 7.231877914370495e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8501332402229309, + "num_tokens": 65261282.0, + "step": 1707 + }, + { + "epoch": 0.21727515583259127, + "ewc_loss": 0.01567915640771389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.839577847335022e-06, + "grad_norm": 17.835538864135742, + "learning_rate": 7.236116998728275e-07, + "loss": 0.5321, + "mean_token_accuracy": 0.8314064145088196, + "num_tokens": 65294850.0, + "step": 1708 + }, + { + "epoch": 0.21740236611118177, + 
"ewc_loss": 0.0157157052308321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857852324377745e-06, + "grad_norm": 17.81155014038086, + "learning_rate": 7.240356083086053e-07, + "loss": 0.4234, + "mean_token_accuracy": 0.863484799861908, + "num_tokens": 65332323.0, + "step": 1709 + }, + { + "epoch": 0.2175295763897723, + "ewc_loss": 0.01567418873310089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.83709401730448e-06, + "grad_norm": 17.716890335083008, + "learning_rate": 7.244595167443833e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8371392488479614, + "num_tokens": 65376033.0, + "step": 1710 + }, + { + "epoch": 0.2176567866683628, + "ewc_loss": 0.015725476667284966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.86273812991567e-06, + "grad_norm": 17.826580047607422, + "learning_rate": 7.248834251801611e-07, + "loss": 0.4827, + "mean_token_accuracy": 0.8480850458145142, + "num_tokens": 65417875.0, + "step": 1711 + }, + { + "epoch": 0.2177839969469533, + "ewc_loss": 0.01570003293454647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.85001611802727e-06, + "grad_norm": 17.598281860351562, + "learning_rate": 7.253073336159388e-07, + "loss": 0.459, + "mean_token_accuracy": 0.8525604009628296, + "num_tokens": 65456675.0, + "step": 1712 + }, + { + "epoch": 0.21791120722554383, + "ewc_loss": 0.015715772286057472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.857885975681711e-06, + "grad_norm": 17.757959365844727, + "learning_rate": 7.257312420517168e-07, + "loss": 0.4519, + "mean_token_accuracy": 0.8549657464027405, + "num_tokens": 65487699.0, + "step": 1713 + }, + { + "epoch": 0.21803841750413433, + "ewc_loss": 0.015765439718961716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.88271972851362e-06, + "grad_norm": 17.749074935913086, + "learning_rate": 7.261551504874946e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8447820544242859, + "num_tokens": 65526143.0, + "step": 1714 + }, + { + "epoch": 0.21816562778272486, + "ewc_loss": 0.015739956870675087, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.869978617236484e-06, + "grad_norm": 17.746566772460938, + "learning_rate": 7.265790589232725e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8488014340400696, + "num_tokens": 65562969.0, + "step": 1715 + }, + { + "epoch": 0.21829283806131536, + "ewc_loss": 0.01574467122554779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.87233602750348e-06, + "grad_norm": 17.672029495239258, + "learning_rate": 7.270029673590504e-07, + "loss": 0.5087, + "mean_token_accuracy": 0.8405098915100098, + "num_tokens": 65605999.0, + "step": 1716 + }, + { + "epoch": 0.21842004833990586, + "ewc_loss": 0.015802426263689995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.901213393779472e-06, + "grad_norm": 17.824140548706055, + "learning_rate": 7.274268757948283e-07, + "loss": 0.5541, + "mean_token_accuracy": 0.8269097805023193, + "num_tokens": 65644877.0, + "step": 1717 + }, + { + "epoch": 0.2185472586184964, + "ewc_loss": 0.015830116346478462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.91505863162456e-06, + "grad_norm": 17.77581214904785, + "learning_rate": 7.278507842306062e-07, + "loss": 0.4743, + "mean_token_accuracy": 0.8505038022994995, + "num_tokens": 65678823.0, + "step": 1718 + }, + { + "epoch": 0.2186744688970869, + "ewc_loss": 0.015811987221240997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.90599369793199e-06, + "grad_norm": 17.8508358001709, + "learning_rate": 7.282746926663841e-07, + "loss": 0.5544, + "mean_token_accuracy": 0.8276776075363159, + "num_tokens": 65717185.0, + "step": 1719 + }, + { + "epoch": 0.2188016791756774, + "ewc_loss": 0.015861963853240013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930982064863201e-06, + "grad_norm": 17.78665542602539, + "learning_rate": 7.286986011021618e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8496643304824829, + "num_tokens": 65754422.0, + "step": 1720 + }, + { + "epoch": 0.21892888945426792, + "ewc_loss": 0.01578802615404129, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 7.894012924225535e-06, + "grad_norm": 17.729827880859375, + "learning_rate": 7.291225095379398e-07, + "loss": 0.5107, + "mean_token_accuracy": 0.8400356769561768, + "num_tokens": 65799390.0, + "step": 1721 + }, + { + "epoch": 0.21905609973285842, + "ewc_loss": 0.015861330553889275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930665560706984e-06, + "grad_norm": 17.83632469177246, + "learning_rate": 7.295464179737176e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.8435696363449097, + "num_tokens": 65837566.0, + "step": 1722 + }, + { + "epoch": 0.21918331001144892, + "ewc_loss": 0.015860382467508316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.930190804472659e-06, + "grad_norm": 17.82705307006836, + "learning_rate": 7.299703264094955e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.845302164554596, + "num_tokens": 65874537.0, + "step": 1723 + }, + { + "epoch": 0.21931052029003945, + "ewc_loss": 0.01587177813053131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.935888788779266e-06, + "grad_norm": 17.83974838256836, + "learning_rate": 7.303942348452734e-07, + "loss": 0.5157, + "mean_token_accuracy": 0.8366090059280396, + "num_tokens": 65915616.0, + "step": 1724 + }, + { + "epoch": 0.21943773056862995, + "ewc_loss": 0.015881668776273727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.940834620967507e-06, + "grad_norm": 17.942224502563477, + "learning_rate": 7.308181432810513e-07, + "loss": 0.4633, + "mean_token_accuracy": 0.8507187962532043, + "num_tokens": 65954263.0, + "step": 1725 + }, + { + "epoch": 0.21956494084722045, + "ewc_loss": 0.01590091735124588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.950458893901668e-06, + "grad_norm": 17.852996826171875, + "learning_rate": 7.312420517168292e-07, + "loss": 0.4959, + "mean_token_accuracy": 0.8399662971496582, + "num_tokens": 65992880.0, + "step": 1726 + }, + { + "epoch": 0.21969215112581097, + "ewc_loss": 0.015904204919934273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
7.952102350827772e-06, + "grad_norm": 17.89827537536621, + "learning_rate": 7.31665960152607e-07, + "loss": 0.5145, + "mean_token_accuracy": 0.8337269425392151, + "num_tokens": 66033958.0, + "step": 1727 + }, + { + "epoch": 0.21981936140440148, + "ewc_loss": 0.01592712104320526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.963560165080708e-06, + "grad_norm": 17.838163375854492, + "learning_rate": 7.320898685883848e-07, + "loss": 0.4854, + "mean_token_accuracy": 0.8446138501167297, + "num_tokens": 66072448.0, + "step": 1728 + }, + { + "epoch": 0.21994657168299198, + "ewc_loss": 0.01591384783387184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.95692358224187e-06, + "grad_norm": 17.82878875732422, + "learning_rate": 7.325137770241628e-07, + "loss": 0.5968, + "mean_token_accuracy": 0.8146445751190186, + "num_tokens": 66112648.0, + "step": 1729 + }, + { + "epoch": 0.2200737819615825, + "ewc_loss": 0.015951938927173615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.975969310791697e-06, + "grad_norm": 17.878633499145508, + "learning_rate": 7.329376854599406e-07, + "loss": 0.5078, + "mean_token_accuracy": 0.8398422598838806, + "num_tokens": 66154791.0, + "step": 1730 + }, + { + "epoch": 0.220200992240173, + "ewc_loss": 0.015935253351926804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.967626515892334e-06, + "grad_norm": 17.84385108947754, + "learning_rate": 7.333615938957184e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8419212102890015, + "num_tokens": 66192450.0, + "step": 1731 + }, + { + "epoch": 0.2203282025187635, + "ewc_loss": 0.015977632254362106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.98881592345424e-06, + "grad_norm": 17.845016479492188, + "learning_rate": 7.337855023314964e-07, + "loss": 0.4911, + "mean_token_accuracy": 0.8418936133384705, + "num_tokens": 66231173.0, + "step": 1732 + }, + { + "epoch": 0.22045541279735403, + "ewc_loss": 0.015979520976543427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 7.989760888449382e-06, + 
"grad_norm": 17.868135452270508, + "learning_rate": 7.342094107672742e-07, + "loss": 0.5442, + "mean_token_accuracy": 0.8266631960868835, + "num_tokens": 66272956.0, + "step": 1733 + }, + { + "epoch": 0.22058262307594453, + "ewc_loss": 0.016020430251955986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.010215424292255e-06, + "grad_norm": 17.92588233947754, + "learning_rate": 7.346333192030522e-07, + "loss": 0.4982, + "mean_token_accuracy": 0.84769207239151, + "num_tokens": 66309450.0, + "step": 1734 + }, + { + "epoch": 0.22070983335453503, + "ewc_loss": 0.016007209196686745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.003604307305068e-06, + "grad_norm": 17.854490280151367, + "learning_rate": 7.350572276388299e-07, + "loss": 0.5485, + "mean_token_accuracy": 0.8246605396270752, + "num_tokens": 66350947.0, + "step": 1735 + }, + { + "epoch": 0.22083704363312556, + "ewc_loss": 0.01600864715874195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.00432371761417e-06, + "grad_norm": 18.022048950195312, + "learning_rate": 7.354811360746078e-07, + "loss": 0.506, + "mean_token_accuracy": 0.8378652334213257, + "num_tokens": 66388364.0, + "step": 1736 + }, + { + "epoch": 0.22096425391171606, + "ewc_loss": 0.01607542298734188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.037711268116254e-06, + "grad_norm": 17.965951919555664, + "learning_rate": 7.359050445103857e-07, + "loss": 0.5206, + "mean_token_accuracy": 0.8342862725257874, + "num_tokens": 66425029.0, + "step": 1737 + }, + { + "epoch": 0.22109146419030656, + "ewc_loss": 0.016000783070921898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.000391972018406e-06, + "grad_norm": 17.89434814453125, + "learning_rate": 7.363289529461636e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8439133763313293, + "num_tokens": 66463924.0, + "step": 1738 + }, + { + "epoch": 0.2212186744688971, + "ewc_loss": 0.01605336368083954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.026681825867854e-06, + "grad_norm": 
17.93737030029297, + "learning_rate": 7.367528613819415e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.855903685092926, + "num_tokens": 66504824.0, + "step": 1739 + }, + { + "epoch": 0.2213458847474876, + "ewc_loss": 0.016059938818216324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.029969649214763e-06, + "grad_norm": 17.97309112548828, + "learning_rate": 7.371767698177194e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8508721590042114, + "num_tokens": 66539308.0, + "step": 1740 + }, + { + "epoch": 0.22147309502607812, + "ewc_loss": 0.016050677746534348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.025338502193335e-06, + "grad_norm": 17.985353469848633, + "learning_rate": 7.376006782534972e-07, + "loss": 0.466, + "mean_token_accuracy": 0.8518232107162476, + "num_tokens": 66572785.0, + "step": 1741 + }, + { + "epoch": 0.22160030530466862, + "ewc_loss": 0.016062051057815552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031025572563522e-06, + "grad_norm": 17.96000099182129, + "learning_rate": 7.380245866892751e-07, + "loss": 0.4836, + "mean_token_accuracy": 0.8459149599075317, + "num_tokens": 66607890.0, + "step": 1742 + }, + { + "epoch": 0.22172751558325912, + "ewc_loss": 0.016063697636127472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.031848665268626e-06, + "grad_norm": 18.130998611450195, + "learning_rate": 7.384484951250529e-07, + "loss": 0.486, + "mean_token_accuracy": 0.8482040166854858, + "num_tokens": 66650553.0, + "step": 1743 + }, + { + "epoch": 0.22185472586184965, + "ewc_loss": 0.016118217259645462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059108949964866e-06, + "grad_norm": 17.987564086914062, + "learning_rate": 7.388724035608308e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8487381339073181, + "num_tokens": 66685330.0, + "step": 1744 + }, + { + "epoch": 0.22198193614044015, + "ewc_loss": 0.016049908474087715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.024953785934485e-06, + "grad_norm": 17.973655700683594, + 
"learning_rate": 7.392963119966087e-07, + "loss": 0.463, + "mean_token_accuracy": 0.8519443869590759, + "num_tokens": 66716816.0, + "step": 1745 + }, + { + "epoch": 0.22210914641903065, + "ewc_loss": 0.01611924171447754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.059620995481964e-06, + "grad_norm": 18.061237335205078, + "learning_rate": 7.397202204323866e-07, + "loss": 0.508, + "mean_token_accuracy": 0.8400940895080566, + "num_tokens": 66754030.0, + "step": 1746 + }, + { + "epoch": 0.22223635669762118, + "ewc_loss": 0.016105473041534424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.052736120589543e-06, + "grad_norm": 17.995525360107422, + "learning_rate": 7.401441288681645e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8528035879135132, + "num_tokens": 66793650.0, + "step": 1747 + }, + { + "epoch": 0.22236356697621168, + "ewc_loss": 0.016113784164190292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.056892511376645e-06, + "grad_norm": 17.968042373657227, + "learning_rate": 7.405680373039424e-07, + "loss": 0.449, + "mean_token_accuracy": 0.8602085709571838, + "num_tokens": 66833709.0, + "step": 1748 + }, + { + "epoch": 0.22249077725480218, + "ewc_loss": 0.01611739583313465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.058697858359665e-06, + "grad_norm": 18.006975173950195, + "learning_rate": 7.409919457397202e-07, + "loss": 0.5181, + "mean_token_accuracy": 0.8347947597503662, + "num_tokens": 66866456.0, + "step": 1749 + }, + { + "epoch": 0.2226179875333927, + "ewc_loss": 0.016138732433319092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.069366231211461e-06, + "grad_norm": 18.05686378479004, + "learning_rate": 7.414158541754981e-07, + "loss": 0.544, + "mean_token_accuracy": 0.8269015550613403, + "num_tokens": 66905875.0, + "step": 1750 + }, + { + "epoch": 0.2227451978119832, + "ewc_loss": 0.01613626256585121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068131137406453e-06, + "grad_norm": 17.94675636291504, + "learning_rate": 
7.418397626112759e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8506325483322144, + "num_tokens": 66936009.0, + "step": 1751 + }, + { + "epoch": 0.2228724080905737, + "ewc_loss": 0.01613706536591053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.068533134064637e-06, + "grad_norm": 18.02395248413086, + "learning_rate": 7.422636710470537e-07, + "loss": 0.5036, + "mean_token_accuracy": 0.8409414291381836, + "num_tokens": 66978111.0, + "step": 1752 + }, + { + "epoch": 0.22299961836916424, + "ewc_loss": 0.01618857868015766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.094289114524145e-06, + "grad_norm": 18.005720138549805, + "learning_rate": 7.426875794828317e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8403980731964111, + "num_tokens": 67015938.0, + "step": 1753 + }, + { + "epoch": 0.22312682864775474, + "ewc_loss": 0.016169849783182144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.08492495707469e-06, + "grad_norm": 18.02085304260254, + "learning_rate": 7.431114879186095e-07, + "loss": 0.5364, + "mean_token_accuracy": 0.8313533067703247, + "num_tokens": 67055456.0, + "step": 1754 + }, + { + "epoch": 0.22325403892634524, + "ewc_loss": 0.016217190772294998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.108595466183033e-06, + "grad_norm": 17.962989807128906, + "learning_rate": 7.435353963543875e-07, + "loss": 0.4927, + "mean_token_accuracy": 0.8444960713386536, + "num_tokens": 67096298.0, + "step": 1755 + }, + { + "epoch": 0.22338124920493577, + "ewc_loss": 0.016206230968236923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.103115760604851e-06, + "grad_norm": 18.12497901916504, + "learning_rate": 7.439593047901653e-07, + "loss": 0.448, + "mean_token_accuracy": 0.8563141822814941, + "num_tokens": 67135280.0, + "step": 1756 + }, + { + "epoch": 0.22350845948352627, + "ewc_loss": 0.016231246292591095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.115623131743632e-06, + "grad_norm": 17.99342155456543, + "learning_rate": 7.443832132259431e-07, + 
"loss": 0.5135, + "mean_token_accuracy": 0.8361730575561523, + "num_tokens": 67171936.0, + "step": 1757 + }, + { + "epoch": 0.22363566976211677, + "ewc_loss": 0.016228118911385536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.114059710351285e-06, + "grad_norm": 18.04095458984375, + "learning_rate": 7.44807121661721e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.8408076763153076, + "num_tokens": 67209854.0, + "step": 1758 + }, + { + "epoch": 0.2237628800407073, + "ewc_loss": 0.01628289371728897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.141447324305773e-06, + "grad_norm": 17.93855094909668, + "learning_rate": 7.452310300974989e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8503226041793823, + "num_tokens": 67244369.0, + "step": 1759 + }, + { + "epoch": 0.2238900903192978, + "ewc_loss": 0.016271822154521942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.135911230056081e-06, + "grad_norm": 18.15008544921875, + "learning_rate": 7.456549385332767e-07, + "loss": 0.5355, + "mean_token_accuracy": 0.8310539126396179, + "num_tokens": 67283994.0, + "step": 1760 + }, + { + "epoch": 0.2240173005978883, + "ewc_loss": 0.01633339561522007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.166697625711095e-06, + "grad_norm": 17.983280181884766, + "learning_rate": 7.460788469690547e-07, + "loss": 0.42, + "mean_token_accuracy": 0.8632420897483826, + "num_tokens": 67323230.0, + "step": 1761 + }, + { + "epoch": 0.22414451087647883, + "ewc_loss": 0.016271792352199554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.13589576864615e-06, + "grad_norm": 18.074644088745117, + "learning_rate": 7.465027554048325e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8222783803939819, + "num_tokens": 67366198.0, + "step": 1762 + }, + { + "epoch": 0.22427172115506933, + "ewc_loss": 0.016334569081664085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.167284249793738e-06, + "grad_norm": 18.05044174194336, + "learning_rate": 7.469266638406105e-07, + "loss": 0.5586, + 
"mean_token_accuracy": 0.8273690938949585, + "num_tokens": 67410685.0, + "step": 1763 + }, + { + "epoch": 0.22439893143365983, + "ewc_loss": 0.016297753900289536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.148876986524556e-06, + "grad_norm": 17.974328994750977, + "learning_rate": 7.473505722763883e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.8473402261734009, + "num_tokens": 67445818.0, + "step": 1764 + }, + { + "epoch": 0.22452614171225035, + "ewc_loss": 0.01631857268512249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.159286153386347e-06, + "grad_norm": 18.1527099609375, + "learning_rate": 7.477744807121661e-07, + "loss": 0.5229, + "mean_token_accuracy": 0.8351303935050964, + "num_tokens": 67483873.0, + "step": 1765 + }, + { + "epoch": 0.22465335199084085, + "ewc_loss": 0.016340328380465508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.170164619514253e-06, + "grad_norm": 17.999927520751953, + "learning_rate": 7.48198389147944e-07, + "loss": 0.5013, + "mean_token_accuracy": 0.8448407649993896, + "num_tokens": 67519524.0, + "step": 1766 + }, + { + "epoch": 0.22478056226943138, + "ewc_loss": 0.016328465193510056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.16423289506929e-06, + "grad_norm": 18.080150604248047, + "learning_rate": 7.486222975837219e-07, + "loss": 0.4856, + "mean_token_accuracy": 0.8444196581840515, + "num_tokens": 67560920.0, + "step": 1767 + }, + { + "epoch": 0.22490777254802188, + "ewc_loss": 0.01638057455420494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.190287189790979e-06, + "grad_norm": 18.06298828125, + "learning_rate": 7.490462060194997e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.851385235786438, + "num_tokens": 67594948.0, + "step": 1768 + }, + { + "epoch": 0.22503498282661238, + "ewc_loss": 0.016381552442908287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.190776497940533e-06, + "grad_norm": 18.149805068969727, + "learning_rate": 7.494701144552777e-07, + "loss": 0.5462, + "mean_token_accuracy": 
0.82779860496521, + "num_tokens": 67628867.0, + "step": 1769 + }, + { + "epoch": 0.2251621931052029, + "ewc_loss": 0.016390468925237656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.195234840968624e-06, + "grad_norm": 18.18303108215332, + "learning_rate": 7.498940228910555e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8383010029792786, + "num_tokens": 67669762.0, + "step": 1770 + }, + { + "epoch": 0.2252894033837934, + "ewc_loss": 0.016390573233366013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.195286682166625e-06, + "grad_norm": 18.16000747680664, + "learning_rate": 7.503179313268335e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8409872055053711, + "num_tokens": 67704750.0, + "step": 1771 + }, + { + "epoch": 0.2254166136623839, + "ewc_loss": 0.016400635242462158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.200318006856833e-06, + "grad_norm": 18.07032585144043, + "learning_rate": 7.507418397626113e-07, + "loss": 0.5309, + "mean_token_accuracy": 0.8329913020133972, + "num_tokens": 67745181.0, + "step": 1772 + }, + { + "epoch": 0.22554382394097444, + "ewc_loss": 0.01637677475810051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.188387255358975e-06, + "grad_norm": 18.196796417236328, + "learning_rate": 7.51165748198389e-07, + "loss": 0.5176, + "mean_token_accuracy": 0.8349084854125977, + "num_tokens": 67785929.0, + "step": 1773 + }, + { + "epoch": 0.22567103421956494, + "ewc_loss": 0.01643279939889908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.216399692173582e-06, + "grad_norm": 18.14979362487793, + "learning_rate": 7.51589656634167e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8455973863601685, + "num_tokens": 67821887.0, + "step": 1774 + }, + { + "epoch": 0.22579824449815544, + "ewc_loss": 0.0164019875228405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.20099376142025e-06, + "grad_norm": 18.124439239501953, + "learning_rate": 7.520135650699448e-07, + "loss": 0.5061, + "mean_token_accuracy": 0.8390963077545166, + "num_tokens": 
67859608.0, + "step": 1775 + }, + { + "epoch": 0.22592545477674597, + "ewc_loss": 0.016448408365249634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.224204066209495e-06, + "grad_norm": 18.15768814086914, + "learning_rate": 7.524374735057227e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8375091552734375, + "num_tokens": 67895287.0, + "step": 1776 + }, + { + "epoch": 0.22605266505533647, + "ewc_loss": 0.016453122720122337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.22656147647649e-06, + "grad_norm": 18.118921279907227, + "learning_rate": 7.528613819415006e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8535729646682739, + "num_tokens": 67930358.0, + "step": 1777 + }, + { + "epoch": 0.22617987533392697, + "ewc_loss": 0.01643317937850952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.216589776566252e-06, + "grad_norm": 18.142465591430664, + "learning_rate": 7.532852903772785e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.8357157707214355, + "num_tokens": 67966112.0, + "step": 1778 + }, + { + "epoch": 0.2263070856125175, + "ewc_loss": 0.016466360539197922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.233179869421292e-06, + "grad_norm": 18.19365692138672, + "learning_rate": 7.537091988130564e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8316640853881836, + "num_tokens": 68001297.0, + "step": 1779 + }, + { + "epoch": 0.226434295891108, + "ewc_loss": 0.016467245295643806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.233622793341056e-06, + "grad_norm": 18.10869026184082, + "learning_rate": 7.541331072488342e-07, + "loss": 0.504, + "mean_token_accuracy": 0.8401809930801392, + "num_tokens": 68037346.0, + "step": 1780 + }, + { + "epoch": 0.2265615061696985, + "ewc_loss": 0.01649250090122223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.246250217780471e-06, + "grad_norm": 18.18056297302246, + "learning_rate": 7.54557015684612e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8417091965675354, + "num_tokens": 68076566.0, + "step": 1781 + }, 
+ { + "epoch": 0.22668871644828903, + "ewc_loss": 0.016504082828760147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.252041880041361e-06, + "grad_norm": 18.129487991333008, + "learning_rate": 7.5498092412039e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8506267070770264, + "num_tokens": 68116834.0, + "step": 1782 + }, + { + "epoch": 0.22681592672687953, + "ewc_loss": 0.016520798206329346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.260399226855952e-06, + "grad_norm": 18.173202514648438, + "learning_rate": 7.554048325561678e-07, + "loss": 0.4561, + "mean_token_accuracy": 0.855926513671875, + "num_tokens": 68155573.0, + "step": 1783 + }, + { + "epoch": 0.22694313700547003, + "ewc_loss": 0.016543515026569366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.271757906186394e-06, + "grad_norm": 18.25554084777832, + "learning_rate": 7.558287409919457e-07, + "loss": 0.55, + "mean_token_accuracy": 0.8287477493286133, + "num_tokens": 68193378.0, + "step": 1784 + }, + { + "epoch": 0.22707034728406056, + "ewc_loss": 0.016519634053111076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.259817150246818e-06, + "grad_norm": 18.123441696166992, + "learning_rate": 7.562526494277236e-07, + "loss": 0.5204, + "mean_token_accuracy": 0.8328238725662231, + "num_tokens": 68230183.0, + "step": 1785 + }, + { + "epoch": 0.22719755756265106, + "ewc_loss": 0.016528507694602013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.264253665402066e-06, + "grad_norm": 18.26426887512207, + "learning_rate": 7.566765578635015e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.8448926210403442, + "num_tokens": 68270526.0, + "step": 1786 + }, + { + "epoch": 0.22732476784124156, + "ewc_loss": 0.01657642051577568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288210665341467e-06, + "grad_norm": 18.112627029418945, + "learning_rate": 7.571004662992794e-07, + "loss": 0.5158, + "mean_token_accuracy": 0.8397809267044067, + "num_tokens": 68308690.0, + "step": 1787 + }, + { + "epoch": 
0.2274519781198321, + "ewc_loss": 0.016537148505449295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.268574674730189e-06, + "grad_norm": 18.234859466552734, + "learning_rate": 7.575243747350572e-07, + "loss": 0.5583, + "mean_token_accuracy": 0.8267714977264404, + "num_tokens": 68348660.0, + "step": 1788 + }, + { + "epoch": 0.2275791883984226, + "ewc_loss": 0.016607539728283882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.303770300699398e-06, + "grad_norm": 18.255168914794922, + "learning_rate": 7.57948283170835e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8430567979812622, + "num_tokens": 68389129.0, + "step": 1789 + }, + { + "epoch": 0.2277063986770131, + "ewc_loss": 0.016576072201132774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.288036042358726e-06, + "grad_norm": 18.165632247924805, + "learning_rate": 7.58372191606613e-07, + "loss": 0.4981, + "mean_token_accuracy": 0.8436580896377563, + "num_tokens": 68423234.0, + "step": 1790 + }, + { + "epoch": 0.22783360895560362, + "ewc_loss": 0.016621459275484085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.310729754157364e-06, + "grad_norm": 18.348464965820312, + "learning_rate": 7.587961000423908e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8407289981842041, + "num_tokens": 68460834.0, + "step": 1791 + }, + { + "epoch": 0.22796081923419412, + "ewc_loss": 0.016616828739643097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.30841418064665e-06, + "grad_norm": 18.105012893676758, + "learning_rate": 7.592200084781686e-07, + "loss": 0.5496, + "mean_token_accuracy": 0.8306472301483154, + "num_tokens": 68495605.0, + "step": 1792 + }, + { + "epoch": 0.22808802951278465, + "ewc_loss": 0.016586219891905785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.293110113299917e-06, + "grad_norm": 18.299352645874023, + "learning_rate": 7.596439169139466e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8422958254814148, + "num_tokens": 68530838.0, + "step": 1793 + }, + { + "epoch": 0.22821523979137515, + 
"ewc_loss": 0.01670532114803791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.352660188393202e-06, + "grad_norm": 18.163169860839844, + "learning_rate": 7.600678253497244e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8461387157440186, + "num_tokens": 68571225.0, + "step": 1794 + }, + { + "epoch": 0.22834245006996565, + "ewc_loss": 0.016639437526464462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.319719199789688e-06, + "grad_norm": 18.281023025512695, + "learning_rate": 7.604917337855023e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.8525749444961548, + "num_tokens": 68608284.0, + "step": 1795 + }, + { + "epoch": 0.22846966034855618, + "ewc_loss": 0.016694411635398865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.34720594866667e-06, + "grad_norm": 18.11323356628418, + "learning_rate": 7.609156422212801e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8560138940811157, + "num_tokens": 68645943.0, + "step": 1796 + }, + { + "epoch": 0.22859687062714668, + "ewc_loss": 0.01667664758861065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.338323823409155e-06, + "grad_norm": 18.31329345703125, + "learning_rate": 7.61339550657058e-07, + "loss": 0.4767, + "mean_token_accuracy": 0.8476088643074036, + "num_tokens": 68684245.0, + "step": 1797 + }, + { + "epoch": 0.22872408090573718, + "ewc_loss": 0.01669779047369957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.348894880327862e-06, + "grad_norm": 18.206218719482422, + "learning_rate": 7.617634590928359e-07, + "loss": 0.4909, + "mean_token_accuracy": 0.8475156426429749, + "num_tokens": 68717532.0, + "step": 1798 + }, + { + "epoch": 0.2288512911843277, + "ewc_loss": 0.016712935641407967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.356468242709525e-06, + "grad_norm": 18.279516220092773, + "learning_rate": 7.621873675286138e-07, + "loss": 0.5095, + "mean_token_accuracy": 0.8409324288368225, + "num_tokens": 68759515.0, + "step": 1799 + }, + { + "epoch": 0.2289785014629182, + "ewc_loss": 
0.016761481761932373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38074083731044e-06, + "grad_norm": 18.36707878112793, + "learning_rate": 7.626112759643916e-07, + "loss": 0.5168, + "mean_token_accuracy": 0.8356473445892334, + "num_tokens": 68796969.0, + "step": 1800 + }, + { + "epoch": 0.2291057117415087, + "ewc_loss": 0.016723163425922394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.361581421922892e-06, + "grad_norm": 18.22796058654785, + "learning_rate": 7.630351844001696e-07, + "loss": 0.4978, + "mean_token_accuracy": 0.8408419489860535, + "num_tokens": 68831138.0, + "step": 1801 + }, + { + "epoch": 0.22923292202009923, + "ewc_loss": 0.01668817363679409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.344086381839588e-06, + "grad_norm": 18.25008773803711, + "learning_rate": 7.634590928359474e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8466261029243469, + "num_tokens": 68868741.0, + "step": 1802 + }, + { + "epoch": 0.22936013229868973, + "ewc_loss": 0.016764577478170395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.382288797292858e-06, + "grad_norm": 18.259021759033203, + "learning_rate": 7.638830012717253e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8382358551025391, + "num_tokens": 68909191.0, + "step": 1803 + }, + { + "epoch": 0.22948734257728023, + "ewc_loss": 0.016755174845457077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.377587619179394e-06, + "grad_norm": 18.29311752319336, + "learning_rate": 7.643069097075031e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8455286026000977, + "num_tokens": 68949261.0, + "step": 1804 + }, + { + "epoch": 0.22961455285587076, + "ewc_loss": 0.016714954748749733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.357477781828493e-06, + "grad_norm": 18.175186157226562, + "learning_rate": 7.64730818143281e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8507522344589233, + "num_tokens": 68990434.0, + "step": 1805 + }, + { + "epoch": 0.22974176313446126, + "ewc_loss": 0.01677451841533184, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.387259185838047e-06, + "grad_norm": 18.367149353027344, + "learning_rate": 7.651547265790589e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8306188583374023, + "num_tokens": 69029482.0, + "step": 1806 + }, + { + "epoch": 0.22986897341305176, + "ewc_loss": 0.01679251156747341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.396255907427985e-06, + "grad_norm": 18.315998077392578, + "learning_rate": 7.655786350148368e-07, + "loss": 0.5205, + "mean_token_accuracy": 0.8356186151504517, + "num_tokens": 69069766.0, + "step": 1807 + }, + { + "epoch": 0.2299961836916423, + "ewc_loss": 0.016731003299355507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.365501344087534e-06, + "grad_norm": 18.286251068115234, + "learning_rate": 7.660025434506146e-07, + "loss": 0.5066, + "mean_token_accuracy": 0.8404815793037415, + "num_tokens": 69114592.0, + "step": 1808 + }, + { + "epoch": 0.2301233939702328, + "ewc_loss": 0.01678396761417389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.391984010813758e-06, + "grad_norm": 18.321857452392578, + "learning_rate": 7.664264518863926e-07, + "loss": 0.4947, + "mean_token_accuracy": 0.8442789316177368, + "num_tokens": 69152599.0, + "step": 1809 + }, + { + "epoch": 0.2302506042488233, + "ewc_loss": 0.016791513189673424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.395756594836712e-06, + "grad_norm": 18.366966247558594, + "learning_rate": 7.668503603221704e-07, + "loss": 0.5475, + "mean_token_accuracy": 0.8248158097267151, + "num_tokens": 69193157.0, + "step": 1810 + }, + { + "epoch": 0.23037781452741382, + "ewc_loss": 0.016764886677265167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.38244341139216e-06, + "grad_norm": 18.311965942382812, + "learning_rate": 7.672742687579483e-07, + "loss": 0.4603, + "mean_token_accuracy": 0.851084291934967, + "num_tokens": 69230681.0, + "step": 1811 + }, + { + "epoch": 0.23050502480600432, + "ewc_loss": 0.016797807067632675, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.398903446504846e-06, + "grad_norm": 18.337055206298828, + "learning_rate": 7.676981771937261e-07, + "loss": 0.4374, + "mean_token_accuracy": 0.8562571406364441, + "num_tokens": 69261292.0, + "step": 1812 + }, + { + "epoch": 0.23063223508459482, + "ewc_loss": 0.016814768314361572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.407384484598879e-06, + "grad_norm": 18.300090789794922, + "learning_rate": 7.681220856295039e-07, + "loss": 0.4692, + "mean_token_accuracy": 0.8514863848686218, + "num_tokens": 69301795.0, + "step": 1813 + }, + { + "epoch": 0.23075944536318535, + "ewc_loss": 0.016788678243756294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.394338692596648e-06, + "grad_norm": 18.33057403564453, + "learning_rate": 7.685459940652819e-07, + "loss": 0.4915, + "mean_token_accuracy": 0.8415913581848145, + "num_tokens": 69337038.0, + "step": 1814 + }, + { + "epoch": 0.23088665564177585, + "ewc_loss": 0.01686655730009079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.433278708253056e-06, + "grad_norm": 18.406299591064453, + "learning_rate": 7.689699025010597e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8433068990707397, + "num_tokens": 69379871.0, + "step": 1815 + }, + { + "epoch": 0.23101386592036638, + "ewc_loss": 0.016827188432216644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.413594514422584e-06, + "grad_norm": 18.273637771606445, + "learning_rate": 7.693938109368376e-07, + "loss": 0.511, + "mean_token_accuracy": 0.836944580078125, + "num_tokens": 69418021.0, + "step": 1816 + }, + { + "epoch": 0.23114107619895688, + "ewc_loss": 0.01686047576367855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.430238267465029e-06, + "grad_norm": 18.38813018798828, + "learning_rate": 7.698177193726155e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8435090780258179, + "num_tokens": 69455574.0, + "step": 1817 + }, + { + "epoch": 0.23126828647754738, + "ewc_loss": 0.016872981563210487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.436491043539718e-06, + "grad_norm": 18.342445373535156, + "learning_rate": 7.702416278083933e-07, + "loss": 0.5128, + "mean_token_accuracy": 0.8389707803726196, + "num_tokens": 69489822.0, + "step": 1818 + }, + { + "epoch": 0.2313954967561379, + "ewc_loss": 0.016871377825737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.435688869212754e-06, + "grad_norm": 18.452199935913086, + "learning_rate": 7.706655362441712e-07, + "loss": 0.4845, + "mean_token_accuracy": 0.8448719382286072, + "num_tokens": 69532821.0, + "step": 1819 + }, + { + "epoch": 0.2315227070347284, + "ewc_loss": 0.01687745191156864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.438725672021974e-06, + "grad_norm": 18.298748016357422, + "learning_rate": 7.710894446799491e-07, + "loss": 0.4672, + "mean_token_accuracy": 0.8498246073722839, + "num_tokens": 69565674.0, + "step": 1820 + }, + { + "epoch": 0.2316499173133189, + "ewc_loss": 0.01690005138516426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.450026143691503e-06, + "grad_norm": 18.407602310180664, + "learning_rate": 7.715133531157269e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8543137311935425, + "num_tokens": 69606118.0, + "step": 1821 + }, + { + "epoch": 0.23177712759190944, + "ewc_loss": 0.016927571967244148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463785889034625e-06, + "grad_norm": 18.42066764831543, + "learning_rate": 7.719372615515049e-07, + "loss": 0.5395, + "mean_token_accuracy": 0.8312815427780151, + "num_tokens": 69650102.0, + "step": 1822 + }, + { + "epoch": 0.23190433787049994, + "ewc_loss": 0.01687934622168541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.439673365501221e-06, + "grad_norm": 18.353565216064453, + "learning_rate": 7.723611699872827e-07, + "loss": 0.4846, + "mean_token_accuracy": 0.8447693586349487, + "num_tokens": 69693440.0, + "step": 1823 + }, + { + "epoch": 0.23203154814909044, + "ewc_loss": 0.016927307471632957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.463654012302868e-06, + 
"grad_norm": 18.389265060424805, + "learning_rate": 7.727850784230606e-07, + "loss": 0.5031, + "mean_token_accuracy": 0.8365966081619263, + "num_tokens": 69732651.0, + "step": 1824 + }, + { + "epoch": 0.23215875842768097, + "ewc_loss": 0.016931096091866493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.465548489766661e-06, + "grad_norm": 18.348302841186523, + "learning_rate": 7.732089868588385e-07, + "loss": 0.5166, + "mean_token_accuracy": 0.8404137492179871, + "num_tokens": 69769915.0, + "step": 1825 + }, + { + "epoch": 0.23228596870627147, + "ewc_loss": 0.016934433951973915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.467217412544414e-06, + "grad_norm": 18.422592163085938, + "learning_rate": 7.736328952946163e-07, + "loss": 0.439, + "mean_token_accuracy": 0.8601129055023193, + "num_tokens": 69813943.0, + "step": 1826 + }, + { + "epoch": 0.23241317898486197, + "ewc_loss": 0.016973840072751045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.486919796268921e-06, + "grad_norm": 18.375898361206055, + "learning_rate": 7.740568037303942e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8472250699996948, + "num_tokens": 69847039.0, + "step": 1827 + }, + { + "epoch": 0.2325403892634525, + "ewc_loss": 0.016940228641033173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.470114153169561e-06, + "grad_norm": 18.43060302734375, + "learning_rate": 7.744807121661721e-07, + "loss": 0.4357, + "mean_token_accuracy": 0.8623896241188049, + "num_tokens": 69884588.0, + "step": 1828 + }, + { + "epoch": 0.232667599542043, + "ewc_loss": 0.017007341608405113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.503670869686175e-06, + "grad_norm": 21.15591812133789, + "learning_rate": 7.749046206019499e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8421770334243774, + "num_tokens": 69925552.0, + "step": 1829 + }, + { + "epoch": 0.2327948098206335, + "ewc_loss": 0.017503080889582634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.75154000823386e-06, + "grad_norm": 
18.40085792541504, + "learning_rate": 7.753285290377279e-07, + "loss": 0.4323, + "mean_token_accuracy": 0.8603797554969788, + "num_tokens": 69961776.0, + "step": 1830 + }, + { + "epoch": 0.23292202009922403, + "ewc_loss": 0.01657945290207863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.289726793009322e-06, + "grad_norm": 18.00699234008789, + "learning_rate": 7.757524374735057e-07, + "loss": 0.5377, + "mean_token_accuracy": 0.8292267918586731, + "num_tokens": 69999775.0, + "step": 1831 + }, + { + "epoch": 0.23304923037781453, + "ewc_loss": 0.01734769530594349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67384733282961e-06, + "grad_norm": 18.55348777770996, + "learning_rate": 7.761763459092836e-07, + "loss": 0.4497, + "mean_token_accuracy": 0.8591318130493164, + "num_tokens": 70037294.0, + "step": 1832 + }, + { + "epoch": 0.23317644065640503, + "ewc_loss": 0.01724706031382084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.623530447948724e-06, + "grad_norm": 18.292638778686523, + "learning_rate": 7.766002543450614e-07, + "loss": 0.4864, + "mean_token_accuracy": 0.8455795049667358, + "num_tokens": 70075014.0, + "step": 1833 + }, + { + "epoch": 0.23330365093499555, + "ewc_loss": 0.017287012189626694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.643505680083763e-06, + "grad_norm": 18.441659927368164, + "learning_rate": 7.770241627808392e-07, + "loss": 0.4412, + "mean_token_accuracy": 0.8593851327896118, + "num_tokens": 70112186.0, + "step": 1834 + }, + { + "epoch": 0.23343086121358606, + "ewc_loss": 0.01737682893872261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.688414709467907e-06, + "grad_norm": 18.39143943786621, + "learning_rate": 7.774480712166172e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8347902894020081, + "num_tokens": 70153285.0, + "step": 1835 + }, + { + "epoch": 0.23355807149217656, + "ewc_loss": 0.01736694574356079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.683472515258472e-06, + "grad_norm": 18.5136775970459, + 
"learning_rate": 7.77871979652395e-07, + "loss": 0.503, + "mean_token_accuracy": 0.8412613868713379, + "num_tokens": 70193382.0, + "step": 1836 + }, + { + "epoch": 0.23368528177076708, + "ewc_loss": 0.017438678070902824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.71933934831759e-06, + "grad_norm": 18.508087158203125, + "learning_rate": 7.782958880881729e-07, + "loss": 0.4933, + "mean_token_accuracy": 0.8432924747467041, + "num_tokens": 70227279.0, + "step": 1837 + }, + { + "epoch": 0.23381249204935758, + "ewc_loss": 0.01743067055940628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.715334843145683e-06, + "grad_norm": 18.542150497436523, + "learning_rate": 7.787197965239508e-07, + "loss": 0.4536, + "mean_token_accuracy": 0.8526961803436279, + "num_tokens": 70264395.0, + "step": 1838 + }, + { + "epoch": 0.23393970232794808, + "ewc_loss": 0.01746032014489174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.730160516279284e-06, + "grad_norm": 18.470964431762695, + "learning_rate": 7.791437049597287e-07, + "loss": 0.5715, + "mean_token_accuracy": 0.8208969831466675, + "num_tokens": 70295016.0, + "step": 1839 + }, + { + "epoch": 0.2340669126065386, + "ewc_loss": 0.0174882709980011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.744135811866727e-06, + "grad_norm": 18.60445213317871, + "learning_rate": 7.795676133955065e-07, + "loss": 0.5151, + "mean_token_accuracy": 0.8371069431304932, + "num_tokens": 70332774.0, + "step": 1840 + }, + { + "epoch": 0.2341941228851291, + "ewc_loss": 0.0174872986972332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.743649232201278e-06, + "grad_norm": 18.56361198425293, + "learning_rate": 7.799915218312844e-07, + "loss": 0.4984, + "mean_token_accuracy": 0.8409034609794617, + "num_tokens": 70366880.0, + "step": 1841 + }, + { + "epoch": 0.23432133316371964, + "ewc_loss": 0.017429927363991737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.71496376930736e-06, + "grad_norm": 18.535972595214844, + "learning_rate": 
7.804154302670622e-07, + "loss": 0.4741, + "mean_token_accuracy": 0.8491247296333313, + "num_tokens": 70405235.0, + "step": 1842 + }, + { + "epoch": 0.23444854344231014, + "ewc_loss": 0.01749214343726635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.746072126086801e-06, + "grad_norm": 18.632869720458984, + "learning_rate": 7.808393387028402e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8451881408691406, + "num_tokens": 70446503.0, + "step": 1843 + }, + { + "epoch": 0.23457575372090064, + "ewc_loss": 0.017452644184231758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.726322448637802e-06, + "grad_norm": 18.486082077026367, + "learning_rate": 7.81263247138618e-07, + "loss": 0.5353, + "mean_token_accuracy": 0.8303555846214294, + "num_tokens": 70483514.0, + "step": 1844 + }, + { + "epoch": 0.23470296399949117, + "ewc_loss": 0.017427567392587662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.713783245184459e-06, + "grad_norm": 18.58570671081543, + "learning_rate": 7.816871555743959e-07, + "loss": 0.4964, + "mean_token_accuracy": 0.8414084315299988, + "num_tokens": 70520944.0, + "step": 1845 + }, + { + "epoch": 0.23483017427808167, + "ewc_loss": 0.01747722178697586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.738610631553456e-06, + "grad_norm": 18.56154441833496, + "learning_rate": 7.821110640101738e-07, + "loss": 0.5188, + "mean_token_accuracy": 0.8343108892440796, + "num_tokens": 70564399.0, + "step": 1846 + }, + { + "epoch": 0.23495738455667217, + "ewc_loss": 0.01741989329457283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.70994699653238e-06, + "grad_norm": 18.59070587158203, + "learning_rate": 7.825349724459517e-07, + "loss": 0.492, + "mean_token_accuracy": 0.845401406288147, + "num_tokens": 70604000.0, + "step": 1847 + }, + { + "epoch": 0.2350845948352627, + "ewc_loss": 0.017440984025597572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.720491678104736e-06, + "grad_norm": 18.68209457397461, + "learning_rate": 7.829588808817294e-07, + 
"loss": 0.4842, + "mean_token_accuracy": 0.8459001779556274, + "num_tokens": 70645133.0, + "step": 1848 + }, + { + "epoch": 0.2352118051138532, + "ewc_loss": 0.017423201352357864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.711600457900204e-06, + "grad_norm": 18.51486587524414, + "learning_rate": 7.833827893175074e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8307617902755737, + "num_tokens": 70675824.0, + "step": 1849 + }, + { + "epoch": 0.2353390153924437, + "ewc_loss": 0.017399722710251808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.69986160978442e-06, + "grad_norm": 18.76239585876465, + "learning_rate": 7.838066977532852e-07, + "loss": 0.482, + "mean_token_accuracy": 0.8471715450286865, + "num_tokens": 70714103.0, + "step": 1850 + }, + { + "epoch": 0.23546622567103423, + "ewc_loss": 0.0174417644739151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.720881851331796e-06, + "grad_norm": 18.621549606323242, + "learning_rate": 7.842306061890632e-07, + "loss": 0.5331, + "mean_token_accuracy": 0.8332352638244629, + "num_tokens": 70754265.0, + "step": 1851 + }, + { + "epoch": 0.23559343594962473, + "ewc_loss": 0.0173577219247818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.678861377120484e-06, + "grad_norm": 18.616031646728516, + "learning_rate": 7.84654514624841e-07, + "loss": 0.4979, + "mean_token_accuracy": 0.8386633992195129, + "num_tokens": 70791368.0, + "step": 1852 + }, + { + "epoch": 0.23572064622821523, + "ewc_loss": 0.01742309331893921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.7115467977128e-06, + "grad_norm": 18.737754821777344, + "learning_rate": 7.850784230606188e-07, + "loss": 0.5449, + "mean_token_accuracy": 0.8300309777259827, + "num_tokens": 70827397.0, + "step": 1853 + }, + { + "epoch": 0.23584785650680576, + "ewc_loss": 0.01735367812216282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.67683866090374e-06, + "grad_norm": 18.509552001953125, + "learning_rate": 7.855023314963968e-07, + "loss": 0.5153, + 
"mean_token_accuracy": 0.8402084708213806, + "num_tokens": 70872503.0, + "step": 1854 + }, + { + "epoch": 0.23597506678539626, + "ewc_loss": 0.01737327314913273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68663664732594e-06, + "grad_norm": 18.747974395751953, + "learning_rate": 7.859262399321746e-07, + "loss": 0.5136, + "mean_token_accuracy": 0.83601975440979, + "num_tokens": 70914376.0, + "step": 1855 + }, + { + "epoch": 0.23610227706398676, + "ewc_loss": 0.017429418861865997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.714709110790864e-06, + "grad_norm": 18.690044403076172, + "learning_rate": 7.863501483679524e-07, + "loss": 0.4701, + "mean_token_accuracy": 0.8499177098274231, + "num_tokens": 70954004.0, + "step": 1856 + }, + { + "epoch": 0.2362294873425773, + "ewc_loss": 0.017352577298879623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.676288416609168e-06, + "grad_norm": 18.87339973449707, + "learning_rate": 7.867740568037303e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8472802042961121, + "num_tokens": 70991765.0, + "step": 1857 + }, + { + "epoch": 0.2363566976211678, + "ewc_loss": 0.017378751188516617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.68937604536768e-06, + "grad_norm": 18.607866287231445, + "learning_rate": 7.871979652395082e-07, + "loss": 0.5011, + "mean_token_accuracy": 0.8378680944442749, + "num_tokens": 71025478.0, + "step": 1858 + }, + { + "epoch": 0.2364839078997583, + "ewc_loss": 0.017292151227593422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.646075912110973e-06, + "grad_norm": 18.632539749145508, + "learning_rate": 7.876218736752861e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.8573065996170044, + "num_tokens": 71067287.0, + "step": 1859 + }, + { + "epoch": 0.23661111817834882, + "ewc_loss": 0.017423899844288826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.711949703865685e-06, + "grad_norm": 18.93985939025879, + "learning_rate": 7.88045782111064e-07, + "loss": 0.5254, + "mean_token_accuracy": 
0.8345829248428345, + "num_tokens": 71108095.0, + "step": 1860 + }, + { + "epoch": 0.23673832845693932, + "ewc_loss": 0.017356833443045616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.678416634211317e-06, + "grad_norm": 18.64366912841797, + "learning_rate": 7.884696905468418e-07, + "loss": 0.4581, + "mean_token_accuracy": 0.8554986715316772, + "num_tokens": 71150297.0, + "step": 1861 + }, + { + "epoch": 0.23686553873552982, + "ewc_loss": 0.017273783683776855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.63689183461247e-06, + "grad_norm": 18.617069244384766, + "learning_rate": 7.888935989826198e-07, + "loss": 0.5055, + "mean_token_accuracy": 0.8369036316871643, + "num_tokens": 71189205.0, + "step": 1862 + }, + { + "epoch": 0.23699274901412035, + "ewc_loss": 0.017380302771925926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.690150934853591e-06, + "grad_norm": 18.815279006958008, + "learning_rate": 7.893175074183976e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.8415587544441223, + "num_tokens": 71228141.0, + "step": 1863 + }, + { + "epoch": 0.23711995929271085, + "ewc_loss": 0.017378199845552444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.689099558978342e-06, + "grad_norm": 18.71803092956543, + "learning_rate": 7.897414158541754e-07, + "loss": 0.4962, + "mean_token_accuracy": 0.8436555862426758, + "num_tokens": 71267854.0, + "step": 1864 + }, + { + "epoch": 0.23724716957130135, + "ewc_loss": 0.017295895144343376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.647947652207222e-06, + "grad_norm": 18.707679748535156, + "learning_rate": 7.901653242899533e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.8359222412109375, + "num_tokens": 71305030.0, + "step": 1865 + }, + { + "epoch": 0.23737437984989188, + "ewc_loss": 0.017363090068101883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.681545295985416e-06, + "grad_norm": 18.63304901123047, + "learning_rate": 7.905892327257312e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8458362817764282, + 
"num_tokens": 71342202.0, + "step": 1866 + }, + { + "epoch": 0.23750159012848238, + "ewc_loss": 0.017353007569909096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.676503966853488e-06, + "grad_norm": 18.69871711730957, + "learning_rate": 7.910131411615091e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8477351069450378, + "num_tokens": 71379702.0, + "step": 1867 + }, + { + "epoch": 0.2376288004070729, + "ewc_loss": 0.017415212467312813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.707605957170017e-06, + "grad_norm": 18.744611740112305, + "learning_rate": 7.91437049597287e-07, + "loss": 0.4894, + "mean_token_accuracy": 0.845172107219696, + "num_tokens": 71417785.0, + "step": 1868 + }, + { + "epoch": 0.2377560106856634, + "ewc_loss": 0.01735582947731018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.677914593135938e-06, + "grad_norm": 18.564905166625977, + "learning_rate": 7.918609580330648e-07, + "loss": 0.5325, + "mean_token_accuracy": 0.8312935829162598, + "num_tokens": 71461358.0, + "step": 1869 + }, + { + "epoch": 0.2378832209642539, + "ewc_loss": 0.01738239824771881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.691198672750033e-06, + "grad_norm": 18.668371200561523, + "learning_rate": 7.922848664688428e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8348819613456726, + "num_tokens": 71500059.0, + "step": 1870 + }, + { + "epoch": 0.23801043124284443, + "ewc_loss": 0.01748744584619999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.743722901272122e-06, + "grad_norm": 18.687074661254883, + "learning_rate": 7.927087749046205e-07, + "loss": 0.4401, + "mean_token_accuracy": 0.8604527711868286, + "num_tokens": 71537043.0, + "step": 1871 + }, + { + "epoch": 0.23813764152143493, + "ewc_loss": 0.017418639734387398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.709319445188157e-06, + "grad_norm": 18.692218780517578, + "learning_rate": 7.931326833403983e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8466505408287048, + "num_tokens": 71570671.0, + 
"step": 1872 + }, + { + "epoch": 0.23826485180002543, + "ewc_loss": 0.017463214695453644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.731607522349805e-06, + "grad_norm": 18.67546272277832, + "learning_rate": 7.935565917761763e-07, + "loss": 0.4865, + "mean_token_accuracy": 0.8466918468475342, + "num_tokens": 71614813.0, + "step": 1873 + }, + { + "epoch": 0.23839206207861596, + "ewc_loss": 0.01750318706035614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.751593668421265e-06, + "grad_norm": 18.901561737060547, + "learning_rate": 7.939805002119541e-07, + "loss": 0.5015, + "mean_token_accuracy": 0.8384360074996948, + "num_tokens": 71650441.0, + "step": 1874 + }, + { + "epoch": 0.23851927235720646, + "ewc_loss": 0.017498917877674103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.749459084356204e-06, + "grad_norm": 18.80097198486328, + "learning_rate": 7.944044086477321e-07, + "loss": 0.5069, + "mean_token_accuracy": 0.8400039076805115, + "num_tokens": 71690425.0, + "step": 1875 + }, + { + "epoch": 0.23864648263579696, + "ewc_loss": 0.017487961798906326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.743981197767425e-06, + "grad_norm": 19.016334533691406, + "learning_rate": 7.948283170835099e-07, + "loss": 0.455, + "mean_token_accuracy": 0.8536853790283203, + "num_tokens": 71727349.0, + "step": 1876 + }, + { + "epoch": 0.2387736929143875, + "ewc_loss": 0.017493512481451035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.746756066102535e-06, + "grad_norm": 18.661001205444336, + "learning_rate": 7.952522255192878e-07, + "loss": 0.4192, + "mean_token_accuracy": 0.8664137125015259, + "num_tokens": 71770386.0, + "step": 1877 + }, + { + "epoch": 0.238900903192978, + "ewc_loss": 0.017406007274985313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.703003913979046e-06, + "grad_norm": 18.695051193237305, + "learning_rate": 7.956761339550657e-07, + "loss": 0.4535, + "mean_token_accuracy": 0.8557486534118652, + "num_tokens": 71808946.0, + "step": 1878 + }, + { + 
"epoch": 0.2390281134715685, + "ewc_loss": 0.017545493319630623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.7727466961951e-06, + "grad_norm": 18.926292419433594, + "learning_rate": 7.961000423908435e-07, + "loss": 0.4937, + "mean_token_accuracy": 0.8492375612258911, + "num_tokens": 71848605.0, + "step": 1879 + }, + { + "epoch": 0.23915532375015902, + "ewc_loss": 0.017504263669252396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.752132089284714e-06, + "grad_norm": 18.830326080322266, + "learning_rate": 7.965239508266214e-07, + "loss": 0.4724, + "mean_token_accuracy": 0.8487653136253357, + "num_tokens": 71886771.0, + "step": 1880 + }, + { + "epoch": 0.23928253402874952, + "ewc_loss": 0.01751786656677723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.758933290664572e-06, + "grad_norm": 18.77386474609375, + "learning_rate": 7.969478592623993e-07, + "loss": 0.4498, + "mean_token_accuracy": 0.8553439378738403, + "num_tokens": 71924905.0, + "step": 1881 + }, + { + "epoch": 0.23940974430734002, + "ewc_loss": 0.017535392194986343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.767696272116154e-06, + "grad_norm": 18.90404510498047, + "learning_rate": 7.973717676981771e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8505574464797974, + "num_tokens": 71964513.0, + "step": 1882 + }, + { + "epoch": 0.23953695458593055, + "ewc_loss": 0.01757117547094822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.785587851889431e-06, + "grad_norm": 18.859420776367188, + "learning_rate": 7.977956761339551e-07, + "loss": 0.4448, + "mean_token_accuracy": 0.8582001328468323, + "num_tokens": 72000918.0, + "step": 1883 + }, + { + "epoch": 0.23966416486452105, + "ewc_loss": 0.01749305985867977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.746529601921793e-06, + "grad_norm": 18.688695907592773, + "learning_rate": 7.982195845697329e-07, + "loss": 0.5762, + "mean_token_accuracy": 0.8185489177703857, + "num_tokens": 72040840.0, + "step": 1884 + }, + { + "epoch": 
0.23979137514311155, + "ewc_loss": 0.01756116934120655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.780584721534979e-06, + "grad_norm": 18.841798782348633, + "learning_rate": 7.986434930055108e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8545088171958923, + "num_tokens": 72076757.0, + "step": 1885 + }, + { + "epoch": 0.23991858542170208, + "ewc_loss": 0.01756925880908966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.784629244473763e-06, + "grad_norm": 18.843923568725586, + "learning_rate": 7.990674014412886e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8410725593566895, + "num_tokens": 72120760.0, + "step": 1886 + }, + { + "epoch": 0.24004579570029258, + "ewc_loss": 0.017573397606611252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.786699254414998e-06, + "grad_norm": 18.73711395263672, + "learning_rate": 7.994913098770665e-07, + "loss": 0.4923, + "mean_token_accuracy": 0.8412843346595764, + "num_tokens": 72159985.0, + "step": 1887 + }, + { + "epoch": 0.24017300597888308, + "ewc_loss": 0.01754254288971424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.771271495788824e-06, + "grad_norm": 18.76725959777832, + "learning_rate": 7.999152183128444e-07, + "loss": 0.5263, + "mean_token_accuracy": 0.8330956101417542, + "num_tokens": 72196377.0, + "step": 1888 + }, + { + "epoch": 0.2403002162574736, + "ewc_loss": 0.017633842304348946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.81692085386021e-06, + "grad_norm": 18.705947875976562, + "learning_rate": 8.003391267486223e-07, + "loss": 0.4753, + "mean_token_accuracy": 0.8488385677337646, + "num_tokens": 72236585.0, + "step": 1889 + }, + { + "epoch": 0.2404274265360641, + "ewc_loss": 0.017588669434189796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.794334462436382e-06, + "grad_norm": 18.91399383544922, + "learning_rate": 8.007630351844001e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.843902587890625, + "num_tokens": 72268450.0, + "step": 1890 + }, + { + "epoch": 0.24055463681465464, + 
"ewc_loss": 0.017676835879683495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.838417670631316e-06, + "grad_norm": 18.995487213134766, + "learning_rate": 8.011869436201781e-07, + "loss": 0.4716, + "mean_token_accuracy": 0.8493874669075012, + "num_tokens": 72304092.0, + "step": 1891 + }, + { + "epoch": 0.24068184709324514, + "ewc_loss": 0.017594458535313606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.797229384072125e-06, + "grad_norm": 18.813844680786133, + "learning_rate": 8.016108520559559e-07, + "loss": 0.4985, + "mean_token_accuracy": 0.8401573300361633, + "num_tokens": 72347312.0, + "step": 1892 + }, + { + "epoch": 0.24080905737183564, + "ewc_loss": 0.017638694494962692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.81934738572454e-06, + "grad_norm": 19.104150772094727, + "learning_rate": 8.020347604917338e-07, + "loss": 0.4601, + "mean_token_accuracy": 0.8532813787460327, + "num_tokens": 72382135.0, + "step": 1893 + }, + { + "epoch": 0.24093626765042617, + "ewc_loss": 0.017625585198402405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.812792657408863e-06, + "grad_norm": 18.718387603759766, + "learning_rate": 8.024586689275116e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.8455177545547485, + "num_tokens": 72421526.0, + "step": 1894 + }, + { + "epoch": 0.24106347792901667, + "ewc_loss": 0.01761884242296219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.809421160549391e-06, + "grad_norm": 18.996471405029297, + "learning_rate": 8.028825773632894e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8472065925598145, + "num_tokens": 72458592.0, + "step": 1895 + }, + { + "epoch": 0.24119068820760717, + "ewc_loss": 0.017729200422763824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.864600204105955e-06, + "grad_norm": 18.978605270385742, + "learning_rate": 8.033064857990674e-07, + "loss": 0.4193, + "mean_token_accuracy": 0.8650743961334229, + "num_tokens": 72494734.0, + "step": 1896 + }, + { + "epoch": 0.2413178984861977, + "ewc_loss": 
0.017620939761400223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.810469807940535e-06, + "grad_norm": 18.701858520507812, + "learning_rate": 8.037303942348452e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8514156341552734, + "num_tokens": 72531560.0, + "step": 1897 + }, + { + "epoch": 0.2414451087647882, + "ewc_loss": 0.01766384392976761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.831922059471253e-06, + "grad_norm": 18.937108993530273, + "learning_rate": 8.041543026706231e-07, + "loss": 0.4678, + "mean_token_accuracy": 0.8512715101242065, + "num_tokens": 72570144.0, + "step": 1898 + }, + { + "epoch": 0.2415723190433787, + "ewc_loss": 0.017757929861545563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.878964763425756e-06, + "grad_norm": 18.86185073852539, + "learning_rate": 8.04578211106401e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.845271110534668, + "num_tokens": 72606738.0, + "step": 1899 + }, + { + "epoch": 0.24169952932196923, + "ewc_loss": 0.017699696123600006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.849848200043198e-06, + "grad_norm": 18.874561309814453, + "learning_rate": 8.050021195421789e-07, + "loss": 0.5214, + "mean_token_accuracy": 0.8335627317428589, + "num_tokens": 72646230.0, + "step": 1900 + }, + { + "epoch": 0.24182673960055973, + "ewc_loss": 0.017756262794137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.878131666278932e-06, + "grad_norm": 18.793893814086914, + "learning_rate": 8.054260279779567e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8419616222381592, + "num_tokens": 72681811.0, + "step": 1901 + }, + { + "epoch": 0.24195394987915023, + "ewc_loss": 0.017752110958099365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.876055289874785e-06, + "grad_norm": 18.856483459472656, + "learning_rate": 8.058499364137346e-07, + "loss": 0.5267, + "mean_token_accuracy": 0.832604706287384, + "num_tokens": 72721805.0, + "step": 1902 + }, + { + "epoch": 0.24208116015774075, + "ewc_loss": 0.017828429117798805, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.914214959077071e-06, + "grad_norm": 18.989974975585938, + "learning_rate": 8.062738448495124e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8593698143959045, + "num_tokens": 72763142.0, + "step": 1903 + }, + { + "epoch": 0.24220837043633126, + "ewc_loss": 0.01777513325214386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.887566764315125e-06, + "grad_norm": 18.762027740478516, + "learning_rate": 8.066977532852904e-07, + "loss": 0.4307, + "mean_token_accuracy": 0.8651695847511292, + "num_tokens": 72802047.0, + "step": 1904 + }, + { + "epoch": 0.24233558071492176, + "ewc_loss": 0.017793986946344376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.896993676899001e-06, + "grad_norm": 18.950984954833984, + "learning_rate": 8.071216617210682e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8378572463989258, + "num_tokens": 72834470.0, + "step": 1905 + }, + { + "epoch": 0.24246279099351228, + "ewc_loss": 0.017863979563117027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.931990123528522e-06, + "grad_norm": 18.80038070678711, + "learning_rate": 8.075455701568461e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8486413955688477, + "num_tokens": 72873101.0, + "step": 1906 + }, + { + "epoch": 0.24259000127210278, + "ewc_loss": 0.017843957990407944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.921979315346107e-06, + "grad_norm": 18.93380355834961, + "learning_rate": 8.07969478592624e-07, + "loss": 0.4577, + "mean_token_accuracy": 0.8574927449226379, + "num_tokens": 72915504.0, + "step": 1907 + }, + { + "epoch": 0.24271721155069328, + "ewc_loss": 0.017906948924064636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.953474207373802e-06, + "grad_norm": 18.85910987854004, + "learning_rate": 8.083933870284019e-07, + "loss": 0.4318, + "mean_token_accuracy": 0.8624130487442017, + "num_tokens": 72947179.0, + "step": 1908 + }, + { + "epoch": 0.2428444218292838, + "ewc_loss": 0.017875241115689278, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 8.937620805227198e-06, + "grad_norm": 18.8536434173584, + "learning_rate": 8.088172954641796e-07, + "loss": 0.4731, + "mean_token_accuracy": 0.8505600690841675, + "num_tokens": 72983313.0, + "step": 1909 + }, + { + "epoch": 0.2429716321078743, + "ewc_loss": 0.017908643931150436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.954321856435854e-06, + "grad_norm": 18.859474182128906, + "learning_rate": 8.092412038999576e-07, + "loss": 0.5245, + "mean_token_accuracy": 0.8339400291442871, + "num_tokens": 73014218.0, + "step": 1910 + }, + { + "epoch": 0.24309884238646481, + "ewc_loss": 0.017924290150403976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.962145329860505e-06, + "grad_norm": 18.860809326171875, + "learning_rate": 8.096651123357354e-07, + "loss": 0.5334, + "mean_token_accuracy": 0.8294137716293335, + "num_tokens": 73048282.0, + "step": 1911 + }, + { + "epoch": 0.24322605266505534, + "ewc_loss": 0.017965693026781082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.982846338767558e-06, + "grad_norm": 18.943925857543945, + "learning_rate": 8.100890207715134e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8359287977218628, + "num_tokens": 73084390.0, + "step": 1912 + }, + { + "epoch": 0.24335326294364584, + "ewc_loss": 0.017972653731703758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.986326974991243e-06, + "grad_norm": 18.89064598083496, + "learning_rate": 8.105129292072912e-07, + "loss": 0.4857, + "mean_token_accuracy": 0.8454623818397522, + "num_tokens": 73123296.0, + "step": 1913 + }, + { + "epoch": 0.24348047322223634, + "ewc_loss": 0.017966246232390404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.983122825156897e-06, + "grad_norm": 18.986591339111328, + "learning_rate": 8.10936837643069e-07, + "loss": 0.5326, + "mean_token_accuracy": 0.8337451815605164, + "num_tokens": 73157533.0, + "step": 1914 + }, + { + "epoch": 0.24360768350082687, + "ewc_loss": 0.017958521842956543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
8.97926111065317e-06, + "grad_norm": 18.808429718017578, + "learning_rate": 8.11360746078847e-07, + "loss": 0.464, + "mean_token_accuracy": 0.8515357971191406, + "num_tokens": 73195314.0, + "step": 1915 + }, + { + "epoch": 0.24373489377941737, + "ewc_loss": 0.017941651865839958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.970825547294226e-06, + "grad_norm": 19.010360717773438, + "learning_rate": 8.117846545146248e-07, + "loss": 0.4331, + "mean_token_accuracy": 0.8608340620994568, + "num_tokens": 73236763.0, + "step": 1916 + }, + { + "epoch": 0.2438621040580079, + "ewc_loss": 0.01804789900779724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.023949132824782e-06, + "grad_norm": 18.886117935180664, + "learning_rate": 8.122085629504026e-07, + "loss": 0.5233, + "mean_token_accuracy": 0.8316762447357178, + "num_tokens": 73276719.0, + "step": 1917 + }, + { + "epoch": 0.2439893143365984, + "ewc_loss": 0.017979521304368973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 8.989760317490436e-06, + "grad_norm": 18.94011688232422, + "learning_rate": 8.126324713861805e-07, + "loss": 0.4883, + "mean_token_accuracy": 0.84698086977005, + "num_tokens": 73318594.0, + "step": 1918 + }, + { + "epoch": 0.2441165246151889, + "ewc_loss": 0.018041308969259262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.020654033520259e-06, + "grad_norm": 18.956857681274414, + "learning_rate": 8.130563798219584e-07, + "loss": 0.4618, + "mean_token_accuracy": 0.8553241491317749, + "num_tokens": 73358760.0, + "step": 1919 + }, + { + "epoch": 0.24424373489377943, + "ewc_loss": 0.01803267002105713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.01633484318154e-06, + "grad_norm": 18.958757400512695, + "learning_rate": 8.134802882577363e-07, + "loss": 0.5059, + "mean_token_accuracy": 0.8445242643356323, + "num_tokens": 73398623.0, + "step": 1920 + }, + { + "epoch": 0.24437094517236993, + "ewc_loss": 0.01805669628083706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.028348358697258e-06, + 
"grad_norm": 18.974529266357422, + "learning_rate": 8.139041966935142e-07, + "loss": 0.4869, + "mean_token_accuracy": 0.8465580940246582, + "num_tokens": 73446369.0, + "step": 1921 + }, + { + "epoch": 0.24449815545096043, + "ewc_loss": 0.018024977296590805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.012488590087742e-06, + "grad_norm": 18.864126205444336, + "learning_rate": 8.14328105129292e-07, + "loss": 0.5479, + "mean_token_accuracy": 0.832186222076416, + "num_tokens": 73483152.0, + "step": 1922 + }, + { + "epoch": 0.24462536572955096, + "ewc_loss": 0.018061436712741852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.030718501890078e-06, + "grad_norm": 19.064983367919922, + "learning_rate": 8.1475201356507e-07, + "loss": 0.5026, + "mean_token_accuracy": 0.8422271013259888, + "num_tokens": 73519179.0, + "step": 1923 + }, + { + "epoch": 0.24475257600814146, + "ewc_loss": 0.018144380301237106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.072190550796222e-06, + "grad_norm": 19.076866149902344, + "learning_rate": 8.151759220008477e-07, + "loss": 0.4744, + "mean_token_accuracy": 0.8474761843681335, + "num_tokens": 73561947.0, + "step": 1924 + }, + { + "epoch": 0.24487978628673196, + "ewc_loss": 0.0180324949324131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.016247531690169e-06, + "grad_norm": 18.931020736694336, + "learning_rate": 8.155998304366256e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8491129279136658, + "num_tokens": 73595841.0, + "step": 1925 + }, + { + "epoch": 0.2450069965653225, + "ewc_loss": 0.018066639080643654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.033319656737149e-06, + "grad_norm": 19.122879028320312, + "learning_rate": 8.160237388724035e-07, + "loss": 0.5152, + "mean_token_accuracy": 0.8335633277893066, + "num_tokens": 73629716.0, + "step": 1926 + }, + { + "epoch": 0.245134206843913, + "ewc_loss": 0.01816241629421711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.0812081907643e-06, + "grad_norm": 19.09677505493164, 
+ "learning_rate": 8.164476473081814e-07, + "loss": 0.5023, + "mean_token_accuracy": 0.8412557244300842, + "num_tokens": 73667285.0, + "step": 1927 + }, + { + "epoch": 0.2452614171225035, + "ewc_loss": 0.018057409673929214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.028704880620353e-06, + "grad_norm": 18.974855422973633, + "learning_rate": 8.168715557439593e-07, + "loss": 0.4464, + "mean_token_accuracy": 0.8583306074142456, + "num_tokens": 73708059.0, + "step": 1928 + }, + { + "epoch": 0.24538862740109402, + "ewc_loss": 0.01810644380748272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.053222129296046e-06, + "grad_norm": 19.084280014038086, + "learning_rate": 8.172954641797372e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8526325821876526, + "num_tokens": 73748553.0, + "step": 1929 + }, + { + "epoch": 0.24551583767968452, + "ewc_loss": 0.018113669008016586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.056834642251488e-06, + "grad_norm": 18.971677780151367, + "learning_rate": 8.17719372615515e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8252475261688232, + "num_tokens": 73796295.0, + "step": 1930 + }, + { + "epoch": 0.24564304795827502, + "ewc_loss": 0.01810995489358902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.054977454070468e-06, + "grad_norm": 19.10014533996582, + "learning_rate": 8.18143281051293e-07, + "loss": 0.5401, + "mean_token_accuracy": 0.8321173191070557, + "num_tokens": 73835960.0, + "step": 1931 + }, + { + "epoch": 0.24577025823686555, + "ewc_loss": 0.018139736726880074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.069868610822596e-06, + "grad_norm": 19.127059936523438, + "learning_rate": 8.185671894870707e-07, + "loss": 0.5555, + "mean_token_accuracy": 0.8266491889953613, + "num_tokens": 73875302.0, + "step": 1932 + }, + { + "epoch": 0.24589746851545605, + "ewc_loss": 0.018151823431253433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.075911293621175e-06, + "grad_norm": 18.988622665405273, + "learning_rate": 
8.189910979228485e-07, + "loss": 0.4455, + "mean_token_accuracy": 0.8562504053115845, + "num_tokens": 73907703.0, + "step": 1933 + }, + { + "epoch": 0.24602467879404655, + "ewc_loss": 0.01811724714934826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.058623618329875e-06, + "grad_norm": 19.120548248291016, + "learning_rate": 8.194150063586265e-07, + "loss": 0.5404, + "mean_token_accuracy": 0.8320064544677734, + "num_tokens": 73945449.0, + "step": 1934 + }, + { + "epoch": 0.24615188907263708, + "ewc_loss": 0.01813708432018757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.06854256754741e-06, + "grad_norm": 18.96502113342285, + "learning_rate": 8.198389147944043e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8296166658401489, + "num_tokens": 73982461.0, + "step": 1935 + }, + { + "epoch": 0.24627909935122758, + "ewc_loss": 0.018133647739887238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.06682362256106e-06, + "grad_norm": 19.033267974853516, + "learning_rate": 8.202628232301823e-07, + "loss": 0.5017, + "mean_token_accuracy": 0.8455951809883118, + "num_tokens": 74018252.0, + "step": 1936 + }, + { + "epoch": 0.24640630962981808, + "ewc_loss": 0.018250059336423874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.125029464485124e-06, + "grad_norm": 19.241077423095703, + "learning_rate": 8.206867316659601e-07, + "loss": 0.5217, + "mean_token_accuracy": 0.833279013633728, + "num_tokens": 74065843.0, + "step": 1937 + }, + { + "epoch": 0.2465335199084086, + "ewc_loss": 0.01821274124085903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.10637027118355e-06, + "grad_norm": 19.14508819580078, + "learning_rate": 8.21110640101738e-07, + "loss": 0.4119, + "mean_token_accuracy": 0.8654042482376099, + "num_tokens": 74106389.0, + "step": 1938 + }, + { + "epoch": 0.2466607301869991, + "ewc_loss": 0.018199359998106956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.099680028157309e-06, + "grad_norm": 19.168935775756836, + "learning_rate": 8.215345485375159e-07, + "loss": 
0.5328, + "mean_token_accuracy": 0.8333008885383606, + "num_tokens": 74150786.0, + "step": 1939 + }, + { + "epoch": 0.2467879404655896, + "ewc_loss": 0.018193431198596954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.09671598492423e-06, + "grad_norm": 19.255451202392578, + "learning_rate": 8.219584569732937e-07, + "loss": 0.5021, + "mean_token_accuracy": 0.838340699672699, + "num_tokens": 74190474.0, + "step": 1940 + }, + { + "epoch": 0.24691515074418013, + "ewc_loss": 0.0182020403444767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.10101971385302e-06, + "grad_norm": 19.059030532836914, + "learning_rate": 8.223823654090715e-07, + "loss": 0.4522, + "mean_token_accuracy": 0.8559007048606873, + "num_tokens": 74226437.0, + "step": 1941 + }, + { + "epoch": 0.24704236102277063, + "ewc_loss": 0.018181895837187767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.090947969525587e-06, + "grad_norm": 19.425125122070312, + "learning_rate": 8.228062738448495e-07, + "loss": 0.4826, + "mean_token_accuracy": 0.8439698815345764, + "num_tokens": 74267292.0, + "step": 1942 + }, + { + "epoch": 0.24716957130136116, + "ewc_loss": 0.018241552636027336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.120776667259634e-06, + "grad_norm": 19.195791244506836, + "learning_rate": 8.232301822806273e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8396314382553101, + "num_tokens": 74306738.0, + "step": 1943 + }, + { + "epoch": 0.24729678157995166, + "ewc_loss": 0.018068213015794754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.034106369654182e-06, + "grad_norm": 19.067493438720703, + "learning_rate": 8.236540907164053e-07, + "loss": 0.4968, + "mean_token_accuracy": 0.8454263806343079, + "num_tokens": 74343151.0, + "step": 1944 + }, + { + "epoch": 0.24742399185854216, + "ewc_loss": 0.01823660358786583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.11830193217611e-06, + "grad_norm": 19.25922393798828, + "learning_rate": 8.240779991521831e-07, + "loss": 0.4732, + 
"mean_token_accuracy": 0.8480373620986938, + "num_tokens": 74382755.0, + "step": 1945 + }, + { + "epoch": 0.2475512021371327, + "ewc_loss": 0.01820874586701393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.104373020818457e-06, + "grad_norm": 19.138774871826172, + "learning_rate": 8.24501907587961e-07, + "loss": 0.5102, + "mean_token_accuracy": 0.8388046622276306, + "num_tokens": 74426129.0, + "step": 1946 + }, + { + "epoch": 0.2476784124157232, + "ewc_loss": 0.01823311485350132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.116557521338109e-06, + "grad_norm": 19.340545654296875, + "learning_rate": 8.249258160237388e-07, + "loss": 0.522, + "mean_token_accuracy": 0.8401124477386475, + "num_tokens": 74460904.0, + "step": 1947 + }, + { + "epoch": 0.2478056226943137, + "ewc_loss": 0.01825258694589138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.126293662120588e-06, + "grad_norm": 19.28481101989746, + "learning_rate": 8.253497244595167e-07, + "loss": 0.5106, + "mean_token_accuracy": 0.8397107720375061, + "num_tokens": 74497993.0, + "step": 1948 + }, + { + "epoch": 0.24793283297290422, + "ewc_loss": 0.018183622509241104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.09181107999757e-06, + "grad_norm": 19.255556106567383, + "learning_rate": 8.257736328952945e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.8338937759399414, + "num_tokens": 74541108.0, + "step": 1949 + }, + { + "epoch": 0.24806004325149472, + "ewc_loss": 0.018242307007312775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.121153198066168e-06, + "grad_norm": 19.214176177978516, + "learning_rate": 8.261975413310725e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8436269760131836, + "num_tokens": 74579389.0, + "step": 1950 + }, + { + "epoch": 0.24818725353008522, + "ewc_loss": 0.018209943547844887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.104971468332224e-06, + "grad_norm": 19.222108840942383, + "learning_rate": 8.266214497668503e-07, + "loss": 0.4502, + "mean_token_accuracy": 
0.8598849773406982, + "num_tokens": 74621546.0, + "step": 1951 + }, + { + "epoch": 0.24831446380867575, + "ewc_loss": 0.018293896690011024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.146948286797851e-06, + "grad_norm": 19.59052276611328, + "learning_rate": 8.270453582026283e-07, + "loss": 0.5314, + "mean_token_accuracy": 0.8313251733779907, + "num_tokens": 74660178.0, + "step": 1952 + }, + { + "epoch": 0.24844167408726625, + "ewc_loss": 0.0182271059602499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.113552550843451e-06, + "grad_norm": 19.058935165405273, + "learning_rate": 8.274692666384061e-07, + "loss": 0.4775, + "mean_token_accuracy": 0.8486113548278809, + "num_tokens": 74693867.0, + "step": 1953 + }, + { + "epoch": 0.24856888436585675, + "ewc_loss": 0.018130993470549583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.065496669791173e-06, + "grad_norm": 19.173831939697266, + "learning_rate": 8.27893175074184e-07, + "loss": 0.4916, + "mean_token_accuracy": 0.8431365489959717, + "num_tokens": 74727561.0, + "step": 1954 + }, + { + "epoch": 0.24869609464444728, + "ewc_loss": 0.018343843519687653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.171922101813834e-06, + "grad_norm": 19.437055587768555, + "learning_rate": 8.283170835099618e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8458409905433655, + "num_tokens": 74763719.0, + "step": 1955 + }, + { + "epoch": 0.24882330492303778, + "ewc_loss": 0.018259745091199875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.129872523772065e-06, + "grad_norm": 19.062334060668945, + "learning_rate": 8.287409919457396e-07, + "loss": 0.4579, + "mean_token_accuracy": 0.8530120849609375, + "num_tokens": 74802666.0, + "step": 1956 + }, + { + "epoch": 0.24895051520162828, + "ewc_loss": 0.018249087035655975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.124543794314377e-06, + "grad_norm": 19.273862838745117, + "learning_rate": 8.291649003815175e-07, + "loss": 0.5072, + "mean_token_accuracy": 0.8405565023422241, + 
"num_tokens": 74840307.0, + "step": 1957 + }, + { + "epoch": 0.2490777254802188, + "ewc_loss": 0.018356909975409508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.178455002256669e-06, + "grad_norm": 19.08884620666504, + "learning_rate": 8.295888088172954e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8381832838058472, + "num_tokens": 74880181.0, + "step": 1958 + }, + { + "epoch": 0.2492049357588093, + "ewc_loss": 0.01832500286400318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.16250155569287e-06, + "grad_norm": 19.411333084106445, + "learning_rate": 8.300127172530733e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8608276844024658, + "num_tokens": 74917148.0, + "step": 1959 + }, + { + "epoch": 0.2493321460373998, + "ewc_loss": 0.01841447316110134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.207236871588975e-06, + "grad_norm": 19.285484313964844, + "learning_rate": 8.304366256888512e-07, + "loss": 0.4638, + "mean_token_accuracy": 0.850436270236969, + "num_tokens": 74953124.0, + "step": 1960 + }, + { + "epoch": 0.24945935631599034, + "ewc_loss": 0.018303321674466133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.151660378847737e-06, + "grad_norm": 19.12155532836914, + "learning_rate": 8.308605341246291e-07, + "loss": 0.4782, + "mean_token_accuracy": 0.8470544219017029, + "num_tokens": 74989726.0, + "step": 1961 + }, + { + "epoch": 0.24958656659458084, + "ewc_loss": 0.01846296153962612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.231480362359434e-06, + "grad_norm": 19.468339920043945, + "learning_rate": 8.312844425604068e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8540457487106323, + "num_tokens": 75024849.0, + "step": 1962 + }, + { + "epoch": 0.24971377687317134, + "ewc_loss": 0.01840701885521412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.20350976230111e-06, + "grad_norm": 19.147035598754883, + "learning_rate": 8.317083509961848e-07, + "loss": 0.5076, + "mean_token_accuracy": 0.8426969051361084, + "num_tokens": 75066464.0, + 
"step": 1963 + }, + { + "epoch": 0.24984098715176187, + "ewc_loss": 0.018379226326942444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.189613592752721e-06, + "grad_norm": 19.27499008178711, + "learning_rate": 8.321322594319626e-07, + "loss": 0.4591, + "mean_token_accuracy": 0.8530481457710266, + "num_tokens": 75106829.0, + "step": 1964 + }, + { + "epoch": 0.24996819743035237, + "ewc_loss": 0.01845496892929077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.227484042639844e-06, + "grad_norm": 19.170734405517578, + "learning_rate": 8.325561678677405e-07, + "loss": 0.512, + "mean_token_accuracy": 0.8390622138977051, + "num_tokens": 75148227.0, + "step": 1965 + }, + { + "epoch": 0.2500954077089429, + "ewc_loss": 0.0184489618986845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.22448089113459e-06, + "grad_norm": 19.280086517333984, + "learning_rate": 8.329800763035184e-07, + "loss": 0.5268, + "mean_token_accuracy": 0.8324174880981445, + "num_tokens": 75188719.0, + "step": 1966 + }, + { + "epoch": 0.25022261798753337, + "ewc_loss": 0.01845729723572731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.228648195858113e-06, + "grad_norm": 19.332378387451172, + "learning_rate": 8.334039847392963e-07, + "loss": 0.477, + "mean_token_accuracy": 0.8457542657852173, + "num_tokens": 75222260.0, + "step": 1967 + }, + { + "epoch": 0.2503498282661239, + "ewc_loss": 0.018471740186214447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.235870493284892e-06, + "grad_norm": 19.186241149902344, + "learning_rate": 8.338278931750742e-07, + "loss": 0.4499, + "mean_token_accuracy": 0.8565989136695862, + "num_tokens": 75255489.0, + "step": 1968 + }, + { + "epoch": 0.2504770385447144, + "ewc_loss": 0.018479807302355766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.239903192792553e-06, + "grad_norm": 19.355857849121094, + "learning_rate": 8.342518016108521e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8442335724830627, + "num_tokens": 75290548.0, + "step": 1969 + }, + { + 
"epoch": 0.2506042488233049, + "ewc_loss": 0.018505102023482323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.252550626115408e-06, + "grad_norm": 19.299259185791016, + "learning_rate": 8.346757100466298e-07, + "loss": 0.4785, + "mean_token_accuracy": 0.8487359881401062, + "num_tokens": 75323912.0, + "step": 1970 + }, + { + "epoch": 0.2507314591018954, + "ewc_loss": 0.01847630739212036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.238153324986342e-06, + "grad_norm": 19.30623435974121, + "learning_rate": 8.350996184824078e-07, + "loss": 0.5197, + "mean_token_accuracy": 0.8339766263961792, + "num_tokens": 75361206.0, + "step": 1971 + }, + { + "epoch": 0.25085866938048595, + "ewc_loss": 0.01855219341814518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.276096534449607e-06, + "grad_norm": 19.250102996826172, + "learning_rate": 8.355235269181856e-07, + "loss": 0.5007, + "mean_token_accuracy": 0.8402485847473145, + "num_tokens": 75397292.0, + "step": 1972 + }, + { + "epoch": 0.2509858796590764, + "ewc_loss": 0.01853630691766739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.268153007724322e-06, + "grad_norm": 19.268997192382812, + "learning_rate": 8.359474353539635e-07, + "loss": 0.4568, + "mean_token_accuracy": 0.855311393737793, + "num_tokens": 75437616.0, + "step": 1973 + }, + { + "epoch": 0.25111308993766696, + "ewc_loss": 0.018588246777653694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.294123628933448e-06, + "grad_norm": 19.301780700683594, + "learning_rate": 8.363713437897414e-07, + "loss": 0.5503, + "mean_token_accuracy": 0.8281217217445374, + "num_tokens": 75486172.0, + "step": 1974 + }, + { + "epoch": 0.2512403002162575, + "ewc_loss": 0.018602075055241585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.301037607656326e-06, + "grad_norm": 19.35382843017578, + "learning_rate": 8.367952522255193e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8423548936843872, + "num_tokens": 75526637.0, + "step": 1975 + }, + { + "epoch": 0.25136751049484796, 
+ "ewc_loss": 0.018612688407301903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.306344509241171e-06, + "grad_norm": 19.332101821899414, + "learning_rate": 8.372191606612972e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.8468260765075684, + "num_tokens": 75564632.0, + "step": 1976 + }, + { + "epoch": 0.2514947207734385, + "ewc_loss": 0.01857145130634308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.285725354857277e-06, + "grad_norm": 19.226329803466797, + "learning_rate": 8.376430690970749e-07, + "loss": 0.4527, + "mean_token_accuracy": 0.8563729524612427, + "num_tokens": 75595553.0, + "step": 1977 + }, + { + "epoch": 0.251621931052029, + "ewc_loss": 0.018636232241988182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.318116099166218e-06, + "grad_norm": 19.3773136138916, + "learning_rate": 8.380669775328528e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8588650226593018, + "num_tokens": 75635847.0, + "step": 1978 + }, + { + "epoch": 0.25174914133061954, + "ewc_loss": 0.018658095970749855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.329048225481529e-06, + "grad_norm": 19.274580001831055, + "learning_rate": 8.384908859686307e-07, + "loss": 0.4453, + "mean_token_accuracy": 0.8593671917915344, + "num_tokens": 75677403.0, + "step": 1979 + }, + { + "epoch": 0.25187635160921, + "ewc_loss": 0.01864907518029213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.324537131760735e-06, + "grad_norm": 19.340599060058594, + "learning_rate": 8.389147944044086e-07, + "loss": 0.4877, + "mean_token_accuracy": 0.843625545501709, + "num_tokens": 75720703.0, + "step": 1980 + }, + { + "epoch": 0.25200356188780054, + "ewc_loss": 0.01869393326342106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.346966180601157e-06, + "grad_norm": 19.334468841552734, + "learning_rate": 8.393387028401864e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8388949036598206, + "num_tokens": 75758000.0, + "step": 1981 + }, + { + "epoch": 0.25213077216639107, + "ewc_loss": 
0.018682368099689484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.341183613287285e-06, + "grad_norm": 19.42844009399414, + "learning_rate": 8.397626112759644e-07, + "loss": 0.5396, + "mean_token_accuracy": 0.8316543102264404, + "num_tokens": 75791113.0, + "step": 1982 + }, + { + "epoch": 0.25225798244498154, + "ewc_loss": 0.018720081076025963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.360040166939143e-06, + "grad_norm": 19.33284568786621, + "learning_rate": 8.401865197117422e-07, + "loss": 0.485, + "mean_token_accuracy": 0.8470004200935364, + "num_tokens": 75821733.0, + "step": 1983 + }, + { + "epoch": 0.25238519272357207, + "ewc_loss": 0.01866506226360798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.332530680694617e-06, + "grad_norm": 19.35476303100586, + "learning_rate": 8.406104281475202e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8517491817474365, + "num_tokens": 75861404.0, + "step": 1984 + }, + { + "epoch": 0.2525124030021626, + "ewc_loss": 0.018747346475720406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.37367349251872e-06, + "grad_norm": 19.37535285949707, + "learning_rate": 8.410343365832979e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8520717620849609, + "num_tokens": 75903059.0, + "step": 1985 + }, + { + "epoch": 0.2526396132807531, + "ewc_loss": 0.0186784528195858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.33922638068907e-06, + "grad_norm": 19.35366439819336, + "learning_rate": 8.414582450190758e-07, + "loss": 0.4746, + "mean_token_accuracy": 0.8491967916488647, + "num_tokens": 75947125.0, + "step": 1986 + }, + { + "epoch": 0.2527668235593436, + "ewc_loss": 0.01873820275068283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.369101462652907e-06, + "grad_norm": 19.387426376342773, + "learning_rate": 8.418821534548537e-07, + "loss": 0.56, + "mean_token_accuracy": 0.8248641490936279, + "num_tokens": 75987519.0, + "step": 1987 + }, + { + "epoch": 0.25289403383793413, + "ewc_loss": 0.01872570626437664, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.362853234051727e-06, + "grad_norm": 19.287704467773438, + "learning_rate": 8.423060618906316e-07, + "loss": 0.4873, + "mean_token_accuracy": 0.8456392288208008, + "num_tokens": 76027728.0, + "step": 1988 + }, + { + "epoch": 0.2530212441165246, + "ewc_loss": 0.018772393465042114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.38619632506743e-06, + "grad_norm": 19.378618240356445, + "learning_rate": 8.427299703264095e-07, + "loss": 0.4851, + "mean_token_accuracy": 0.8491264581680298, + "num_tokens": 76070942.0, + "step": 1989 + }, + { + "epoch": 0.25314845439511513, + "ewc_loss": 0.018778981640934944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.389490514877252e-06, + "grad_norm": 19.37618637084961, + "learning_rate": 8.431538787621874e-07, + "loss": 0.5117, + "mean_token_accuracy": 0.8398250937461853, + "num_tokens": 76113330.0, + "step": 1990 + }, + { + "epoch": 0.25327566467370566, + "ewc_loss": 0.018782364204525948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.39118217502255e-06, + "grad_norm": 19.381011962890625, + "learning_rate": 8.435777871979652e-07, + "loss": 0.4627, + "mean_token_accuracy": 0.8542101383209229, + "num_tokens": 76153511.0, + "step": 1991 + }, + { + "epoch": 0.25340287495229613, + "ewc_loss": 0.01879206858575344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.396034329256508e-06, + "grad_norm": 19.359052658081055, + "learning_rate": 8.440016956337432e-07, + "loss": 0.4885, + "mean_token_accuracy": 0.846860945224762, + "num_tokens": 76191505.0, + "step": 1992 + }, + { + "epoch": 0.25353008523088666, + "ewc_loss": 0.018832746893167496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.416373359272256e-06, + "grad_norm": 19.43790054321289, + "learning_rate": 8.444256040695209e-07, + "loss": 0.5043, + "mean_token_accuracy": 0.8426210880279541, + "num_tokens": 76232479.0, + "step": 1993 + }, + { + "epoch": 0.2536572955094772, + "ewc_loss": 0.01880647987127304, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 9.403240255778655e-06, + "grad_norm": 19.300594329833984, + "learning_rate": 8.448495125052988e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8439147472381592, + "num_tokens": 76270192.0, + "step": 1994 + }, + { + "epoch": 0.25378450578806766, + "ewc_loss": 0.0188176017254591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.408800906385295e-06, + "grad_norm": 19.38414764404297, + "learning_rate": 8.452734209410767e-07, + "loss": 0.4384, + "mean_token_accuracy": 0.861055850982666, + "num_tokens": 76303981.0, + "step": 1995 + }, + { + "epoch": 0.2539117160666582, + "ewc_loss": 0.01885608583688736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.428043085790705e-06, + "grad_norm": 19.387954711914062, + "learning_rate": 8.456973293768545e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8507146835327148, + "num_tokens": 76338496.0, + "step": 1996 + }, + { + "epoch": 0.2540389263452487, + "ewc_loss": 0.01886885054409504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.434425010113046e-06, + "grad_norm": 19.384859085083008, + "learning_rate": 8.461212378126325e-07, + "loss": 0.5175, + "mean_token_accuracy": 0.836520254611969, + "num_tokens": 76377904.0, + "step": 1997 + }, + { + "epoch": 0.2541661366238392, + "ewc_loss": 0.01888020522892475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.440102985536214e-06, + "grad_norm": 19.36812973022461, + "learning_rate": 8.465451462484103e-07, + "loss": 0.5405, + "mean_token_accuracy": 0.8311389684677124, + "num_tokens": 76413905.0, + "step": 1998 + }, + { + "epoch": 0.2542933469024297, + "ewc_loss": 0.018908482044935226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.454241080675274e-06, + "grad_norm": 19.505054473876953, + "learning_rate": 8.469690546841882e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8446773886680603, + "num_tokens": 76451591.0, + "step": 1999 + }, + { + "epoch": 0.25442055718102025, + "ewc_loss": 0.018883120268583298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
9.441559996048454e-06, + "grad_norm": 19.401535034179688, + "learning_rate": 8.47392963119966e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8500230312347412, + "num_tokens": 76486191.0, + "step": 2000 + }, + { + "epoch": 0.2545477674596107, + "ewc_loss": 0.018871136009693146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.435568244953174e-06, + "grad_norm": 19.483898162841797, + "learning_rate": 8.478168715557439e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8483562469482422, + "num_tokens": 76523317.0, + "step": 2001 + }, + { + "epoch": 0.25467497773820125, + "ewc_loss": 0.018905414268374443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.452706763113383e-06, + "grad_norm": 19.427288055419922, + "learning_rate": 8.482407799915217e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.842951238155365, + "num_tokens": 76562661.0, + "step": 2002 + }, + { + "epoch": 0.2548021880167918, + "ewc_loss": 0.018914271146059036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.457136002311017e-06, + "grad_norm": 19.52082633972168, + "learning_rate": 8.486646884272997e-07, + "loss": 0.4899, + "mean_token_accuracy": 0.8430756330490112, + "num_tokens": 76600914.0, + "step": 2003 + }, + { + "epoch": 0.25492939829538225, + "ewc_loss": 0.018940797075629234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.47039825405227e-06, + "grad_norm": 19.404705047607422, + "learning_rate": 8.490885968630775e-07, + "loss": 0.51, + "mean_token_accuracy": 0.8374419212341309, + "num_tokens": 76636310.0, + "step": 2004 + }, + { + "epoch": 0.2550566085739728, + "ewc_loss": 0.018920833244919777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.460416549700312e-06, + "grad_norm": 19.39334487915039, + "learning_rate": 8.495125052988555e-07, + "loss": 0.4934, + "mean_token_accuracy": 0.8468724489212036, + "num_tokens": 76680889.0, + "step": 2005 + }, + { + "epoch": 0.2551838188525633, + "ewc_loss": 0.01896640472114086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.48320212046383e-06, + 
"grad_norm": 19.450546264648438, + "learning_rate": 8.499364137346333e-07, + "loss": 0.487, + "mean_token_accuracy": 0.8508857488632202, + "num_tokens": 76722469.0, + "step": 2006 + }, + { + "epoch": 0.2553110291311538, + "ewc_loss": 0.018983233720064163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.491616765444633e-06, + "grad_norm": 19.465530395507812, + "learning_rate": 8.503603221704112e-07, + "loss": 0.5234, + "mean_token_accuracy": 0.8357254862785339, + "num_tokens": 76757668.0, + "step": 2007 + }, + { + "epoch": 0.2554382394097443, + "ewc_loss": 0.018994688987731934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.497344763076399e-06, + "grad_norm": 19.421743392944336, + "learning_rate": 8.50784230606189e-07, + "loss": 0.4834, + "mean_token_accuracy": 0.8476305603981018, + "num_tokens": 76795607.0, + "step": 2008 + }, + { + "epoch": 0.25556544968833483, + "ewc_loss": 0.019007042050361633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.503521141596138e-06, + "grad_norm": 19.40558624267578, + "learning_rate": 8.512081390419669e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8453547358512878, + "num_tokens": 76837161.0, + "step": 2009 + }, + { + "epoch": 0.2556926599669253, + "ewc_loss": 0.019086690619587898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.54334518610267e-06, + "grad_norm": 19.638696670532227, + "learning_rate": 8.516320474777447e-07, + "loss": 0.4902, + "mean_token_accuracy": 0.8452622890472412, + "num_tokens": 76872198.0, + "step": 2010 + }, + { + "epoch": 0.25581987024551583, + "ewc_loss": 0.019064951688051224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.532475814921781e-06, + "grad_norm": 19.516427993774414, + "learning_rate": 8.520559559135227e-07, + "loss": 0.4566, + "mean_token_accuracy": 0.8578503131866455, + "num_tokens": 76908976.0, + "step": 2011 + }, + { + "epoch": 0.25594708052410636, + "ewc_loss": 0.019041739404201508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.520869753032457e-06, + "grad_norm": 
19.655073165893555, + "learning_rate": 8.524798643493005e-07, + "loss": 0.509, + "mean_token_accuracy": 0.8446797728538513, + "num_tokens": 76947298.0, + "step": 2012 + }, + { + "epoch": 0.25607429080269684, + "ewc_loss": 0.019091328606009483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.54566439759219e-06, + "grad_norm": 19.574012756347656, + "learning_rate": 8.529037727850785e-07, + "loss": 0.4636, + "mean_token_accuracy": 0.8516455292701721, + "num_tokens": 76985112.0, + "step": 2013 + }, + { + "epoch": 0.25620150108128736, + "ewc_loss": 0.019025415182113647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.512707947578747e-06, + "grad_norm": 19.454973220825195, + "learning_rate": 8.533276812208563e-07, + "loss": 0.4358, + "mean_token_accuracy": 0.8606240153312683, + "num_tokens": 77020309.0, + "step": 2014 + }, + { + "epoch": 0.2563287113598779, + "ewc_loss": 0.01906755194067955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.533776392345317e-06, + "grad_norm": 19.5355167388916, + "learning_rate": 8.53751589656634e-07, + "loss": 0.4774, + "mean_token_accuracy": 0.8470420837402344, + "num_tokens": 77063245.0, + "step": 2015 + }, + { + "epoch": 0.25645592163846836, + "ewc_loss": 0.01910477876663208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.5523892014171e-06, + "grad_norm": 19.485544204711914, + "learning_rate": 8.54175498092412e-07, + "loss": 0.4747, + "mean_token_accuracy": 0.8493102192878723, + "num_tokens": 77096791.0, + "step": 2016 + }, + { + "epoch": 0.2565831319170589, + "ewc_loss": 0.019120050594210625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.560025318933185e-06, + "grad_norm": 19.49282455444336, + "learning_rate": 8.545994065281898e-07, + "loss": 0.4516, + "mean_token_accuracy": 0.8574315309524536, + "num_tokens": 77129281.0, + "step": 2017 + }, + { + "epoch": 0.2567103421956494, + "ewc_loss": 0.01914506033062935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.572529961587861e-06, + "grad_norm": 19.558788299560547, + 
"learning_rate": 8.550233149639677e-07, + "loss": 0.5411, + "mean_token_accuracy": 0.8283481597900391, + "num_tokens": 77168785.0, + "step": 2018 + }, + { + "epoch": 0.2568375524742399, + "ewc_loss": 0.01918770931661129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.593854883860331e-06, + "grad_norm": 19.477869033813477, + "learning_rate": 8.554472233997456e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8364646434783936, + "num_tokens": 77208360.0, + "step": 2019 + }, + { + "epoch": 0.2569647627528304, + "ewc_loss": 0.019135791808366776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.567896086082328e-06, + "grad_norm": 19.463394165039062, + "learning_rate": 8.558711318355235e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8280388116836548, + "num_tokens": 77245539.0, + "step": 2020 + }, + { + "epoch": 0.25709197303142095, + "ewc_loss": 0.019201869145035744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.600934390618932e-06, + "grad_norm": 19.567617416381836, + "learning_rate": 8.562950402713014e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8642327189445496, + "num_tokens": 77279118.0, + "step": 2021 + }, + { + "epoch": 0.2572191833100114, + "ewc_loss": 0.019194556400179863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.597278221917804e-06, + "grad_norm": 19.56667709350586, + "learning_rate": 8.567189487070793e-07, + "loss": 0.5319, + "mean_token_accuracy": 0.8350644111633301, + "num_tokens": 77316259.0, + "step": 2022 + }, + { + "epoch": 0.25734639358860195, + "ewc_loss": 0.019229959696531296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.61497971729841e-06, + "grad_norm": 19.8333683013916, + "learning_rate": 8.57142857142857e-07, + "loss": 0.5507, + "mean_token_accuracy": 0.8286046385765076, + "num_tokens": 77359712.0, + "step": 2023 + }, + { + "epoch": 0.2574736038671925, + "ewc_loss": 0.019243329763412476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.621664503356442e-06, + "grad_norm": 19.609851837158203, + "learning_rate": 
8.57566765578635e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8597568273544312, + "num_tokens": 77402269.0, + "step": 2024 + }, + { + "epoch": 0.25760081414578295, + "ewc_loss": 0.019242215901613235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.621107892598957e-06, + "grad_norm": 20.282188415527344, + "learning_rate": 8.579906740144128e-07, + "loss": 0.4795, + "mean_token_accuracy": 0.8490778207778931, + "num_tokens": 77445791.0, + "step": 2025 + }, + { + "epoch": 0.2577280244243735, + "ewc_loss": 0.019221581518650055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.610790584702045e-06, + "grad_norm": 19.430509567260742, + "learning_rate": 8.584145824501907e-07, + "loss": 0.5126, + "mean_token_accuracy": 0.8373408317565918, + "num_tokens": 77485376.0, + "step": 2026 + }, + { + "epoch": 0.257855234702964, + "ewc_loss": 0.019020022824406624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.51001129578799e-06, + "grad_norm": 19.587631225585938, + "learning_rate": 8.588384908859686e-07, + "loss": 0.5102, + "mean_token_accuracy": 0.8417130708694458, + "num_tokens": 77524344.0, + "step": 2027 + }, + { + "epoch": 0.25798244498155454, + "ewc_loss": 0.019290262833237648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.645131285651587e-06, + "grad_norm": 19.789037704467773, + "learning_rate": 8.592623993217465e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8436099290847778, + "num_tokens": 77562746.0, + "step": 2028 + }, + { + "epoch": 0.258109655260145, + "ewc_loss": 0.019147304818034172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.573652278049849e-06, + "grad_norm": 19.499332427978516, + "learning_rate": 8.596863077575244e-07, + "loss": 0.5266, + "mean_token_accuracy": 0.8365649580955505, + "num_tokens": 77601261.0, + "step": 2029 + }, + { + "epoch": 0.25823686553873554, + "ewc_loss": 0.019170038402080536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.585019142832607e-06, + "grad_norm": 19.530771255493164, + "learning_rate": 8.601102161933023e-07, + 
"loss": 0.508, + "mean_token_accuracy": 0.8403569459915161, + "num_tokens": 77640054.0, + "step": 2030 + }, + { + "epoch": 0.25836407581732607, + "ewc_loss": 0.019266294315457344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.633146873966325e-06, + "grad_norm": 19.652103424072266, + "learning_rate": 8.6053412462908e-07, + "loss": 0.4818, + "mean_token_accuracy": 0.8482944965362549, + "num_tokens": 77680098.0, + "step": 2031 + }, + { + "epoch": 0.25849128609591654, + "ewc_loss": 0.019223405048251152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.611702807887923e-06, + "grad_norm": 19.547863006591797, + "learning_rate": 8.60958033064858e-07, + "loss": 0.4739, + "mean_token_accuracy": 0.8511661291122437, + "num_tokens": 77712517.0, + "step": 2032 + }, + { + "epoch": 0.25861849637450707, + "ewc_loss": 0.01925054006278515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.62526974035427e-06, + "grad_norm": 19.512296676635742, + "learning_rate": 8.613819415006358e-07, + "loss": 0.5378, + "mean_token_accuracy": 0.8372504115104675, + "num_tokens": 77751335.0, + "step": 2033 + }, + { + "epoch": 0.2587457066530976, + "ewc_loss": 0.019268758594989777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.634379239287227e-06, + "grad_norm": 19.57634735107422, + "learning_rate": 8.618058499364137e-07, + "loss": 0.5231, + "mean_token_accuracy": 0.8356598019599915, + "num_tokens": 77792300.0, + "step": 2034 + }, + { + "epoch": 0.25887291693168807, + "ewc_loss": 0.01934005320072174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.670026884123217e-06, + "grad_norm": 19.677331924438477, + "learning_rate": 8.622297583721916e-07, + "loss": 0.482, + "mean_token_accuracy": 0.8508273363113403, + "num_tokens": 77836391.0, + "step": 2035 + }, + { + "epoch": 0.2590001272102786, + "ewc_loss": 0.019304266199469566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.652133485360537e-06, + "grad_norm": 19.59110450744629, + "learning_rate": 8.626536668079695e-07, + "loss": 0.4935, + 
"mean_token_accuracy": 0.8415316343307495, + "num_tokens": 77871600.0, + "step": 2036 + }, + { + "epoch": 0.2591273374888691, + "ewc_loss": 0.019320517778396606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.660258911026176e-06, + "grad_norm": 19.57696533203125, + "learning_rate": 8.630775752437474e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8482920527458191, + "num_tokens": 77904623.0, + "step": 2037 + }, + { + "epoch": 0.2592545477674596, + "ewc_loss": 0.01930246502161026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.65123217611108e-06, + "grad_norm": 19.508737564086914, + "learning_rate": 8.635014836795251e-07, + "loss": 0.5215, + "mean_token_accuracy": 0.836562991142273, + "num_tokens": 77941145.0, + "step": 2038 + }, + { + "epoch": 0.2593817580460501, + "ewc_loss": 0.019394846633076668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.697423593024723e-06, + "grad_norm": 19.620670318603516, + "learning_rate": 8.63925392115303e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8521183133125305, + "num_tokens": 77975087.0, + "step": 2039 + }, + { + "epoch": 0.25950896832464065, + "ewc_loss": 0.01940269023180008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.701345334178768e-06, + "grad_norm": 19.643871307373047, + "learning_rate": 8.643493005510809e-07, + "loss": 0.4663, + "mean_token_accuracy": 0.8514055013656616, + "num_tokens": 78009526.0, + "step": 2040 + }, + { + "epoch": 0.2596361786032311, + "ewc_loss": 0.01942039281129837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.710196536616422e-06, + "grad_norm": 19.51323890686035, + "learning_rate": 8.647732089868588e-07, + "loss": 0.5437, + "mean_token_accuracy": 0.8295251131057739, + "num_tokens": 78050918.0, + "step": 2041 + }, + { + "epoch": 0.25976338888182166, + "ewc_loss": 0.019423669204115868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.711834536574315e-06, + "grad_norm": 19.710769653320312, + "learning_rate": 8.651971174226366e-07, + "loss": 0.4608, + "mean_token_accuracy": 
0.8544415235519409, + "num_tokens": 78085547.0, + "step": 2042 + }, + { + "epoch": 0.2598905991604122, + "ewc_loss": 0.0194752998650074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.737649634189438e-06, + "grad_norm": 19.55115509033203, + "learning_rate": 8.656210258584146e-07, + "loss": 0.5302, + "mean_token_accuracy": 0.8321516513824463, + "num_tokens": 78122916.0, + "step": 2043 + }, + { + "epoch": 0.26001780943900266, + "ewc_loss": 0.019452642649412155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.726320968184154e-06, + "grad_norm": 19.748899459838867, + "learning_rate": 8.660449342941924e-07, + "loss": 0.5228, + "mean_token_accuracy": 0.8325446844100952, + "num_tokens": 78161800.0, + "step": 2044 + }, + { + "epoch": 0.2601450197175932, + "ewc_loss": 0.019521310925483704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.760655302670784e-06, + "grad_norm": 19.611671447753906, + "learning_rate": 8.664688427299704e-07, + "loss": 0.501, + "mean_token_accuracy": 0.841462254524231, + "num_tokens": 78199961.0, + "step": 2045 + }, + { + "epoch": 0.2602722299961837, + "ewc_loss": 0.019501732662320137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.750866411195602e-06, + "grad_norm": 19.64341926574707, + "learning_rate": 8.668927511657481e-07, + "loss": 0.4586, + "mean_token_accuracy": 0.8529665470123291, + "num_tokens": 78231332.0, + "step": 2046 + }, + { + "epoch": 0.2603994402747742, + "ewc_loss": 0.019532442092895508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.766221410245635e-06, + "grad_norm": 19.595075607299805, + "learning_rate": 8.67316659601526e-07, + "loss": 0.5144, + "mean_token_accuracy": 0.8368527889251709, + "num_tokens": 78265054.0, + "step": 2047 + }, + { + "epoch": 0.2605266505533647, + "ewc_loss": 0.019542111083865166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.771055374585558e-06, + "grad_norm": 19.566757202148438, + "learning_rate": 8.677405680373039e-07, + "loss": 0.4714, + "mean_token_accuracy": 0.8533490300178528, + 
"num_tokens": 78305445.0, + "step": 2048 + }, + { + "epoch": 0.26065386083195524, + "ewc_loss": 0.01955501362681389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.777506420505233e-06, + "grad_norm": 19.734899520874023, + "learning_rate": 8.681644764730818e-07, + "loss": 0.5019, + "mean_token_accuracy": 0.8404309153556824, + "num_tokens": 78339174.0, + "step": 2049 + }, + { + "epoch": 0.2607810711105457, + "ewc_loss": 0.019601253792643547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.800626685319003e-06, + "grad_norm": 19.658960342407227, + "learning_rate": 8.685883849088596e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8594886064529419, + "num_tokens": 78377630.0, + "step": 2050 + }, + { + "epoch": 0.26090828138913624, + "ewc_loss": 0.019590336829423904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.795168807613663e-06, + "grad_norm": 19.750898361206055, + "learning_rate": 8.690122933446376e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8458734154701233, + "num_tokens": 78419908.0, + "step": 2051 + }, + { + "epoch": 0.26103549166772677, + "ewc_loss": 0.019629064947366714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.81453285930911e-06, + "grad_norm": 19.655933380126953, + "learning_rate": 8.694362017804154e-07, + "loss": 0.4655, + "mean_token_accuracy": 0.8538831472396851, + "num_tokens": 78459561.0, + "step": 2052 + }, + { + "epoch": 0.26116270194631724, + "ewc_loss": 0.019581254571676254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.790627700567711e-06, + "grad_norm": 19.66050148010254, + "learning_rate": 8.698601102161933e-07, + "loss": 0.4977, + "mean_token_accuracy": 0.8433328866958618, + "num_tokens": 78502500.0, + "step": 2053 + }, + { + "epoch": 0.26128991222490777, + "ewc_loss": 0.019599292427301407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.799646250030492e-06, + "grad_norm": 19.723772048950195, + "learning_rate": 8.702840186519711e-07, + "loss": 0.4544, + "mean_token_accuracy": 0.8565613627433777, + "num_tokens": 
78545411.0, + "step": 2054 + }, + { + "epoch": 0.2614171225034983, + "ewc_loss": 0.019613003358244896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.806502021092456e-06, + "grad_norm": 19.585859298706055, + "learning_rate": 8.70707927087749e-07, + "loss": 0.4538, + "mean_token_accuracy": 0.8565161228179932, + "num_tokens": 78585878.0, + "step": 2055 + }, + { + "epoch": 0.2615443327820888, + "ewc_loss": 0.019603896886110306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.801948181120679e-06, + "grad_norm": 19.71345329284668, + "learning_rate": 8.711318355235269e-07, + "loss": 0.5432, + "mean_token_accuracy": 0.8276225328445435, + "num_tokens": 78621233.0, + "step": 2056 + }, + { + "epoch": 0.2616715430606793, + "ewc_loss": 0.019705424085259438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.852711627900135e-06, + "grad_norm": 19.768245697021484, + "learning_rate": 8.715557439593047e-07, + "loss": 0.4635, + "mean_token_accuracy": 0.8512505292892456, + "num_tokens": 78654644.0, + "step": 2057 + }, + { + "epoch": 0.26179875333926983, + "ewc_loss": 0.01965784840285778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.828923793975264e-06, + "grad_norm": 19.835702896118164, + "learning_rate": 8.719796523950826e-07, + "loss": 0.5415, + "mean_token_accuracy": 0.8289141654968262, + "num_tokens": 78692851.0, + "step": 2058 + }, + { + "epoch": 0.2619259636178603, + "ewc_loss": 0.01968058943748474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.840294296736829e-06, + "grad_norm": 19.738357543945312, + "learning_rate": 8.724035608308605e-07, + "loss": 0.4356, + "mean_token_accuracy": 0.8620310425758362, + "num_tokens": 78725329.0, + "step": 2059 + }, + { + "epoch": 0.26205317389645083, + "ewc_loss": 0.019686520099639893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.84326015895931e-06, + "grad_norm": 19.75180435180664, + "learning_rate": 8.728274692666384e-07, + "loss": 0.4913, + "mean_token_accuracy": 0.8446713089942932, + "num_tokens": 78766645.0, + "step": 2060 + 
}, + { + "epoch": 0.26218038417504136, + "ewc_loss": 0.019713478162884712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.856738870439585e-06, + "grad_norm": 19.743165969848633, + "learning_rate": 8.732513777024162e-07, + "loss": 0.4843, + "mean_token_accuracy": 0.848565399646759, + "num_tokens": 78801576.0, + "step": 2061 + }, + { + "epoch": 0.26230759445363183, + "ewc_loss": 0.01972837559878826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.864187632047106e-06, + "grad_norm": 19.8785400390625, + "learning_rate": 8.736752861381941e-07, + "loss": 0.5298, + "mean_token_accuracy": 0.8327349424362183, + "num_tokens": 78845577.0, + "step": 2062 + }, + { + "epoch": 0.26243480473222236, + "ewc_loss": 0.019708532840013504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.854266863840166e-06, + "grad_norm": 19.806995391845703, + "learning_rate": 8.740991945739719e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.845032811164856, + "num_tokens": 78884229.0, + "step": 2063 + }, + { + "epoch": 0.2625620150108129, + "ewc_loss": 0.019676456227898598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.838227924774401e-06, + "grad_norm": 19.7076358795166, + "learning_rate": 8.745231030097499e-07, + "loss": 0.4806, + "mean_token_accuracy": 0.8488296270370483, + "num_tokens": 78926948.0, + "step": 2064 + }, + { + "epoch": 0.26268922528940336, + "ewc_loss": 0.019680067896842957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.840034181252122e-06, + "grad_norm": 19.88252830505371, + "learning_rate": 8.749470114455277e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8546031713485718, + "num_tokens": 78963582.0, + "step": 2065 + }, + { + "epoch": 0.2628164355679939, + "ewc_loss": 0.019743872806429863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.871936526906211e-06, + "grad_norm": 19.760704040527344, + "learning_rate": 8.753709198813056e-07, + "loss": 0.5656, + "mean_token_accuracy": 0.8228634595870972, + "num_tokens": 79007345.0, + "step": 2066 + }, + { + "epoch": 
0.2629436458465844, + "ewc_loss": 0.019712679088115692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.856339602265507e-06, + "grad_norm": 20.00754165649414, + "learning_rate": 8.757948283170835e-07, + "loss": 0.4621, + "mean_token_accuracy": 0.8563259840011597, + "num_tokens": 79049505.0, + "step": 2067 + }, + { + "epoch": 0.2630708561251749, + "ewc_loss": 0.01975037343800068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.875187060970347e-06, + "grad_norm": 19.721670150756836, + "learning_rate": 8.762187367528613e-07, + "loss": 0.4565, + "mean_token_accuracy": 0.8537048101425171, + "num_tokens": 79087419.0, + "step": 2068 + }, + { + "epoch": 0.2631980664037654, + "ewc_loss": 0.01967081055045128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.835404853220098e-06, + "grad_norm": 19.906612396240234, + "learning_rate": 8.766426451886392e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8391536474227905, + "num_tokens": 79131444.0, + "step": 2069 + }, + { + "epoch": 0.26332527668235595, + "ewc_loss": 0.019784223288297653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.89211184787564e-06, + "grad_norm": 19.848276138305664, + "learning_rate": 8.770665536244171e-07, + "loss": 0.5316, + "mean_token_accuracy": 0.8366795182228088, + "num_tokens": 79171976.0, + "step": 2070 + }, + { + "epoch": 0.2634524869609464, + "ewc_loss": 0.019695697352290154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.847848559729755e-06, + "grad_norm": 19.780019760131836, + "learning_rate": 8.774904620601949e-07, + "loss": 0.4592, + "mean_token_accuracy": 0.852990448474884, + "num_tokens": 79206273.0, + "step": 2071 + }, + { + "epoch": 0.26357969723953695, + "ewc_loss": 0.01976172812283039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.880864126898814e-06, + "grad_norm": 20.036409378051758, + "learning_rate": 8.779143704959729e-07, + "loss": 0.4632, + "mean_token_accuracy": 0.8531334400177002, + "num_tokens": 79245564.0, + "step": 2072 + }, + { + "epoch": 0.2637069075181275, + 
"ewc_loss": 0.019773215055465698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.88660758594051e-06, + "grad_norm": 19.863079071044922, + "learning_rate": 8.783382789317507e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.8455735445022583, + "num_tokens": 79281857.0, + "step": 2073 + }, + { + "epoch": 0.26383411779671795, + "ewc_loss": 0.019737662747502327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.868831511994358e-06, + "grad_norm": 20.02796745300293, + "learning_rate": 8.787621873675286e-07, + "loss": 0.526, + "mean_token_accuracy": 0.8306432962417603, + "num_tokens": 79312482.0, + "step": 2074 + }, + { + "epoch": 0.2639613280753085, + "ewc_loss": 0.01979760266840458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.89880118140718e-06, + "grad_norm": 19.97087860107422, + "learning_rate": 8.791860958033065e-07, + "loss": 0.4651, + "mean_token_accuracy": 0.8537116050720215, + "num_tokens": 79348157.0, + "step": 2075 + }, + { + "epoch": 0.264088538353899, + "ewc_loss": 0.019729599356651306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.864799721981399e-06, + "grad_norm": 19.997718811035156, + "learning_rate": 8.796100042390842e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.8577635288238525, + "num_tokens": 79387540.0, + "step": 2076 + }, + { + "epoch": 0.2642157486324895, + "ewc_loss": 0.019755996763706207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.877998309093527e-06, + "grad_norm": 19.827795028686523, + "learning_rate": 8.800339126748622e-07, + "loss": 0.4698, + "mean_token_accuracy": 0.8498420119285583, + "num_tokens": 79421424.0, + "step": 2077 + }, + { + "epoch": 0.26434295891108, + "ewc_loss": 0.019746657460927963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.873328963294625e-06, + "grad_norm": 19.956130981445312, + "learning_rate": 8.8045782111064e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8300699591636658, + "num_tokens": 79458733.0, + "step": 2078 + }, + { + "epoch": 0.26447016918967053, + "ewc_loss": 0.0198379959911108, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.918998330249451e-06, + "grad_norm": 19.891752243041992, + "learning_rate": 8.808817295464179e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.84622722864151, + "num_tokens": 79495394.0, + "step": 2079 + }, + { + "epoch": 0.26459737946826106, + "ewc_loss": 0.019792016595602036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.896008123178035e-06, + "grad_norm": 19.875, + "learning_rate": 8.813056379821958e-07, + "loss": 0.4624, + "mean_token_accuracy": 0.855060338973999, + "num_tokens": 79529799.0, + "step": 2080 + }, + { + "epoch": 0.26472458974685154, + "ewc_loss": 0.019856007769703865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.928004146786407e-06, + "grad_norm": 19.96805191040039, + "learning_rate": 8.817295464179737e-07, + "loss": 0.4879, + "mean_token_accuracy": 0.844935953617096, + "num_tokens": 79564664.0, + "step": 2081 + }, + { + "epoch": 0.26485180002544206, + "ewc_loss": 0.019840171560645103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.920086085912772e-06, + "grad_norm": 19.8532772064209, + "learning_rate": 8.821534548537515e-07, + "loss": 0.5667, + "mean_token_accuracy": 0.8200036287307739, + "num_tokens": 79606104.0, + "step": 2082 + }, + { + "epoch": 0.2649790103040326, + "ewc_loss": 0.019893839955329895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.94691981759388e-06, + "grad_norm": 19.993282318115234, + "learning_rate": 8.825773632895295e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8548060059547424, + "num_tokens": 79647688.0, + "step": 2083 + }, + { + "epoch": 0.26510622058262306, + "ewc_loss": 0.01993831992149353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.969159691536333e-06, + "grad_norm": 20.131044387817383, + "learning_rate": 8.830012717253072e-07, + "loss": 0.4786, + "mean_token_accuracy": 0.8510844111442566, + "num_tokens": 79685500.0, + "step": 2084 + }, + { + "epoch": 0.2652334308612136, + "ewc_loss": 0.019892172887921333, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 9.946086720447056e-06, + "grad_norm": 19.91572380065918, + "learning_rate": 8.834251801610852e-07, + "loss": 0.5077, + "mean_token_accuracy": 0.8396071195602417, + "num_tokens": 79729912.0, + "step": 2085 + }, + { + "epoch": 0.2653606411398041, + "ewc_loss": 0.01984529383480549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.922647222992964e-06, + "grad_norm": 19.955238342285156, + "learning_rate": 8.83849088596863e-07, + "loss": 0.4647, + "mean_token_accuracy": 0.8538945913314819, + "num_tokens": 79767487.0, + "step": 2086 + }, + { + "epoch": 0.2654878514183946, + "ewc_loss": 0.01993337832391262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.966689503926318e-06, + "grad_norm": 19.990890502929688, + "learning_rate": 8.842729970326409e-07, + "loss": 0.442, + "mean_token_accuracy": 0.862661600112915, + "num_tokens": 79804802.0, + "step": 2087 + }, + { + "epoch": 0.2656150616969851, + "ewc_loss": 0.019919175654649734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.959588169294875e-06, + "grad_norm": 19.925180435180664, + "learning_rate": 8.846969054684188e-07, + "loss": 0.5609, + "mean_token_accuracy": 0.8272262811660767, + "num_tokens": 79845685.0, + "step": 2088 + }, + { + "epoch": 0.26574227197557565, + "ewc_loss": 0.01991157792508602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.95578920992557e-06, + "grad_norm": 19.946128845214844, + "learning_rate": 8.851208139041967e-07, + "loss": 0.4511, + "mean_token_accuracy": 0.8561201095581055, + "num_tokens": 79884692.0, + "step": 2089 + }, + { + "epoch": 0.2658694822541661, + "ewc_loss": 0.0199382696300745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.969135135179386e-06, + "grad_norm": 19.95004653930664, + "learning_rate": 8.855447223399745e-07, + "loss": 0.5109, + "mean_token_accuracy": 0.8362945318222046, + "num_tokens": 79919780.0, + "step": 2090 + }, + { + "epoch": 0.26599669253275665, + "ewc_loss": 0.019970793277025223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
9.985396900447085e-06, + "grad_norm": 20.006196975708008, + "learning_rate": 8.859686307757524e-07, + "loss": 0.5354, + "mean_token_accuracy": 0.8317075967788696, + "num_tokens": 79958317.0, + "step": 2091 + }, + { + "epoch": 0.2661239028113472, + "ewc_loss": 0.019979383796453476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.989691534428857e-06, + "grad_norm": 19.927135467529297, + "learning_rate": 8.863925392115302e-07, + "loss": 0.468, + "mean_token_accuracy": 0.8512671589851379, + "num_tokens": 79994126.0, + "step": 2092 + }, + { + "epoch": 0.26625111308993765, + "ewc_loss": 0.019982866942882538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.991433216782752e-06, + "grad_norm": 19.941150665283203, + "learning_rate": 8.868164476473082e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8629165887832642, + "num_tokens": 80033157.0, + "step": 2093 + }, + { + "epoch": 0.2663783233685282, + "ewc_loss": 0.020006656646728516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.000332849798724e-05, + "grad_norm": 19.832904815673828, + "learning_rate": 8.87240356083086e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8539936542510986, + "num_tokens": 80073354.0, + "step": 2094 + }, + { + "epoch": 0.2665055336471187, + "ewc_loss": 0.01998964324593544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 9.994821994041558e-06, + "grad_norm": 19.957027435302734, + "learning_rate": 8.876642645188639e-07, + "loss": 0.529, + "mean_token_accuracy": 0.8335568308830261, + "num_tokens": 80111819.0, + "step": 2095 + }, + { + "epoch": 0.2666327439257092, + "ewc_loss": 0.020032448694109917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0016224223363679e-05, + "grad_norm": 19.966663360595703, + "learning_rate": 8.880881729546418e-07, + "loss": 0.4437, + "mean_token_accuracy": 0.8585296869277954, + "num_tokens": 80147951.0, + "step": 2096 + }, + { + "epoch": 0.2667599542042997, + "ewc_loss": 0.0200000312179327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0000015208788682e-05, + 
"grad_norm": 19.902301788330078, + "learning_rate": 8.885120813904197e-07, + "loss": 0.456, + "mean_token_accuracy": 0.8546686768531799, + "num_tokens": 80191319.0, + "step": 2097 + }, + { + "epoch": 0.26688716448289024, + "ewc_loss": 0.020011600106954575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0005799595091958e-05, + "grad_norm": 19.90875244140625, + "learning_rate": 8.889359898261976e-07, + "loss": 0.5442, + "mean_token_accuracy": 0.8326320648193359, + "num_tokens": 80226300.0, + "step": 2098 + }, + { + "epoch": 0.2670143747614807, + "ewc_loss": 0.02008875645697117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0044378541351762e-05, + "grad_norm": 19.999916076660156, + "learning_rate": 8.893598982619753e-07, + "loss": 0.4695, + "mean_token_accuracy": 0.8513463735580444, + "num_tokens": 80265465.0, + "step": 2099 + }, + { + "epoch": 0.26714158504007124, + "ewc_loss": 0.020081983879208565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.004099158308236e-05, + "grad_norm": 19.930936813354492, + "learning_rate": 8.897838066977532e-07, + "loss": 0.4697, + "mean_token_accuracy": 0.8514912128448486, + "num_tokens": 80312696.0, + "step": 2100 + }, + { + "epoch": 0.26726879531866177, + "ewc_loss": 0.02008853852748871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0044269401987549e-05, + "grad_norm": 20.017494201660156, + "learning_rate": 8.902077151335311e-07, + "loss": 0.4932, + "mean_token_accuracy": 0.8432573080062866, + "num_tokens": 80353507.0, + "step": 2101 + }, + { + "epoch": 0.26739600559725224, + "ewc_loss": 0.02010369673371315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0051848221337423e-05, + "grad_norm": 20.026691436767578, + "learning_rate": 8.90631623569309e-07, + "loss": 0.5753, + "mean_token_accuracy": 0.8173366189002991, + "num_tokens": 80390795.0, + "step": 2102 + }, + { + "epoch": 0.26752321587584277, + "ewc_loss": 0.020112233236432076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0056116479972843e-05, + "grad_norm": 
20.050607681274414, + "learning_rate": 8.910555320050868e-07, + "loss": 0.4842, + "mean_token_accuracy": 0.8468566536903381, + "num_tokens": 80427599.0, + "step": 2103 + }, + { + "epoch": 0.2676504261544333, + "ewc_loss": 0.020078474655747414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.003923716780264e-05, + "grad_norm": 19.98621368408203, + "learning_rate": 8.914794404408648e-07, + "loss": 0.4872, + "mean_token_accuracy": 0.8468877077102661, + "num_tokens": 80468084.0, + "step": 2104 + }, + { + "epoch": 0.26777763643302377, + "ewc_loss": 0.020094353705644608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0047177056549117e-05, + "grad_norm": 20.029245376586914, + "learning_rate": 8.919033488766426e-07, + "loss": 0.4319, + "mean_token_accuracy": 0.8634228706359863, + "num_tokens": 80507218.0, + "step": 2105 + }, + { + "epoch": 0.2679048467116143, + "ewc_loss": 0.020141219720244408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0070610187540296e-05, + "grad_norm": 20.043684005737305, + "learning_rate": 8.923272573124204e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8389217853546143, + "num_tokens": 80541730.0, + "step": 2106 + }, + { + "epoch": 0.2680320569902048, + "ewc_loss": 0.02010650932788849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0053254300146364e-05, + "grad_norm": 19.913190841674805, + "learning_rate": 8.927511657481983e-07, + "loss": 0.5259, + "mean_token_accuracy": 0.8336887359619141, + "num_tokens": 80584677.0, + "step": 2107 + }, + { + "epoch": 0.2681592672687953, + "ewc_loss": 0.02015657164156437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0078285413328558e-05, + "grad_norm": 20.029260635375977, + "learning_rate": 8.931750741839762e-07, + "loss": 0.4614, + "mean_token_accuracy": 0.8560027480125427, + "num_tokens": 80616332.0, + "step": 2108 + }, + { + "epoch": 0.2682864775473858, + "ewc_loss": 0.020162440836429596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.008122035273118e-05, + "grad_norm": 19.879240036010742, + 
"learning_rate": 8.935989826197541e-07, + "loss": 0.469, + "mean_token_accuracy": 0.8534594774246216, + "num_tokens": 80654435.0, + "step": 2109 + }, + { + "epoch": 0.26841368782597635, + "ewc_loss": 0.02018011175096035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0090056093758903e-05, + "grad_norm": 20.096508026123047, + "learning_rate": 8.94022891055532e-07, + "loss": 0.4461, + "mean_token_accuracy": 0.863080620765686, + "num_tokens": 80691246.0, + "step": 2110 + }, + { + "epoch": 0.2685408981045668, + "ewc_loss": 0.02021787501871586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.010893720376771e-05, + "grad_norm": 20.00901985168457, + "learning_rate": 8.944467994913098e-07, + "loss": 0.4971, + "mean_token_accuracy": 0.8394763469696045, + "num_tokens": 80730510.0, + "step": 2111 + }, + { + "epoch": 0.26866810838315736, + "ewc_loss": 0.0202139001339674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0106949957844336e-05, + "grad_norm": 20.038267135620117, + "learning_rate": 8.948707079270878e-07, + "loss": 0.4151, + "mean_token_accuracy": 0.8676477670669556, + "num_tokens": 80769369.0, + "step": 2112 + }, + { + "epoch": 0.2687953186617479, + "ewc_loss": 0.02019430510699749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0097152880916838e-05, + "grad_norm": 20.052148818969727, + "learning_rate": 8.952946163628656e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8492206335067749, + "num_tokens": 80802168.0, + "step": 2113 + }, + { + "epoch": 0.26892252894033836, + "ewc_loss": 0.020218687132000923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0109343747899402e-05, + "grad_norm": 19.95448112487793, + "learning_rate": 8.957185247986434e-07, + "loss": 0.4198, + "mean_token_accuracy": 0.8673241138458252, + "num_tokens": 80839389.0, + "step": 2114 + }, + { + "epoch": 0.2690497392189289, + "ewc_loss": 0.020264778286218643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0132389434147626e-05, + "grad_norm": 20.133771896362305, + "learning_rate": 
8.961424332344213e-07, + "loss": 0.4751, + "mean_token_accuracy": 0.8514634370803833, + "num_tokens": 80881139.0, + "step": 2115 + }, + { + "epoch": 0.2691769494975194, + "ewc_loss": 0.020279955118894577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0139977348444518e-05, + "grad_norm": 20.083847045898438, + "learning_rate": 8.965663416701992e-07, + "loss": 0.4783, + "mean_token_accuracy": 0.8474996089935303, + "num_tokens": 80919809.0, + "step": 2116 + }, + { + "epoch": 0.2693041597761099, + "ewc_loss": 0.020220216363668442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0110107723448891e-05, + "grad_norm": 20.054344177246094, + "learning_rate": 8.969902501059771e-07, + "loss": 0.4534, + "mean_token_accuracy": 0.8585268259048462, + "num_tokens": 80956860.0, + "step": 2117 + }, + { + "epoch": 0.2694313700547004, + "ewc_loss": 0.020305437967181206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0152719369216356e-05, + "grad_norm": 20.051841735839844, + "learning_rate": 8.97414158541755e-07, + "loss": 0.4206, + "mean_token_accuracy": 0.8665264844894409, + "num_tokens": 80994322.0, + "step": 2118 + }, + { + "epoch": 0.26955858033329094, + "ewc_loss": 0.020266523584723473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0133261639566626e-05, + "grad_norm": 20.12176513671875, + "learning_rate": 8.978380669775328e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.840469479560852, + "num_tokens": 81032792.0, + "step": 2119 + }, + { + "epoch": 0.2696857906118814, + "ewc_loss": 0.020294256508350372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0147127795789856e-05, + "grad_norm": 20.023942947387695, + "learning_rate": 8.982619754133107e-07, + "loss": 0.5056, + "mean_token_accuracy": 0.8412290215492249, + "num_tokens": 81070011.0, + "step": 2120 + }, + { + "epoch": 0.26981300089047194, + "ewc_loss": 0.02028823085129261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0144115549337585e-05, + "grad_norm": 20.12999153137207, + "learning_rate": 8.986858838490886e-07, 
+ "loss": 0.5066, + "mean_token_accuracy": 0.8411718606948853, + "num_tokens": 81113145.0, + "step": 2121 + }, + { + "epoch": 0.26994021116906247, + "ewc_loss": 0.02031737007200718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0158684744965285e-05, + "grad_norm": 20.139869689941406, + "learning_rate": 8.991097922848663e-07, + "loss": 0.4761, + "mean_token_accuracy": 0.8510090112686157, + "num_tokens": 81154175.0, + "step": 2122 + }, + { + "epoch": 0.27006742144765294, + "ewc_loss": 0.02025451883673668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0127259884029627e-05, + "grad_norm": 20.124109268188477, + "learning_rate": 8.995337007206443e-07, + "loss": 0.499, + "mean_token_accuracy": 0.8448071479797363, + "num_tokens": 81193319.0, + "step": 2123 + }, + { + "epoch": 0.2701946317262435, + "ewc_loss": 0.020276334136724472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.013816745398799e-05, + "grad_norm": 20.139732360839844, + "learning_rate": 8.999576091564221e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8524671792984009, + "num_tokens": 81229057.0, + "step": 2124 + }, + { + "epoch": 0.270321842004834, + "ewc_loss": 0.020309455692768097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0154727533517871e-05, + "grad_norm": 20.09377670288086, + "learning_rate": 9.003815175922001e-07, + "loss": 0.4749, + "mean_token_accuracy": 0.8496049642562866, + "num_tokens": 81262380.0, + "step": 2125 + }, + { + "epoch": 0.2704490522834245, + "ewc_loss": 0.02033483237028122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0167415894102305e-05, + "grad_norm": 20.160297393798828, + "learning_rate": 9.008054260279779e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8500810861587524, + "num_tokens": 81304080.0, + "step": 2126 + }, + { + "epoch": 0.270576262562015, + "ewc_loss": 0.020339712500572205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0169856068387162e-05, + "grad_norm": 20.126245498657227, + "learning_rate": 9.012293344637558e-07, + "loss": 0.51, + 
"mean_token_accuracy": 0.8421326279640198, + "num_tokens": 81340226.0, + "step": 2127 + }, + { + "epoch": 0.27070347284060553, + "ewc_loss": 0.020373499020934105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0186749932472594e-05, + "grad_norm": 20.267295837402344, + "learning_rate": 9.016532428995337e-07, + "loss": 0.4559, + "mean_token_accuracy": 0.8548438549041748, + "num_tokens": 81378046.0, + "step": 2128 + }, + { + "epoch": 0.27083068311919606, + "ewc_loss": 0.020328225567936897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0164112609345466e-05, + "grad_norm": 20.054553985595703, + "learning_rate": 9.020771513353115e-07, + "loss": 0.4993, + "mean_token_accuracy": 0.8432579040527344, + "num_tokens": 81419486.0, + "step": 2129 + }, + { + "epoch": 0.27095789339778653, + "ewc_loss": 0.020313965156674385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0156982170883566e-05, + "grad_norm": 20.203445434570312, + "learning_rate": 9.025010597710894e-07, + "loss": 0.4997, + "mean_token_accuracy": 0.8417232632637024, + "num_tokens": 81457488.0, + "step": 2130 + }, + { + "epoch": 0.27108510367637706, + "ewc_loss": 0.020439479500055313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0219740033790004e-05, + "grad_norm": 20.16486358642578, + "learning_rate": 9.029249682068673e-07, + "loss": 0.4686, + "mean_token_accuracy": 0.8521747589111328, + "num_tokens": 81494812.0, + "step": 2131 + }, + { + "epoch": 0.2712123139549676, + "ewc_loss": 0.020332179963588715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.016608985082712e-05, + "grad_norm": 20.14605140686035, + "learning_rate": 9.033488766426451e-07, + "loss": 0.4862, + "mean_token_accuracy": 0.8444262146949768, + "num_tokens": 81531421.0, + "step": 2132 + }, + { + "epoch": 0.27133952423355806, + "ewc_loss": 0.020398952066898346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0199476491834503e-05, + "grad_norm": 20.171483993530273, + "learning_rate": 9.037727850784231e-07, + "loss": 0.4506, + 
"mean_token_accuracy": 0.855014979839325, + "num_tokens": 81567221.0, + "step": 2133 + }, + { + "epoch": 0.2714667345121486, + "ewc_loss": 0.020399846136569977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0199923053733073e-05, + "grad_norm": 20.175554275512695, + "learning_rate": 9.041966935142009e-07, + "loss": 0.484, + "mean_token_accuracy": 0.8471556901931763, + "num_tokens": 81605832.0, + "step": 2134 + }, + { + "epoch": 0.2715939447907391, + "ewc_loss": 0.020446058362722397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0223028766631614e-05, + "grad_norm": 20.133197784423828, + "learning_rate": 9.046206019499788e-07, + "loss": 0.4389, + "mean_token_accuracy": 0.8629190921783447, + "num_tokens": 81646919.0, + "step": 2135 + }, + { + "epoch": 0.2717211550693296, + "ewc_loss": 0.0204352717846632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0217635463050101e-05, + "grad_norm": 20.196456909179688, + "learning_rate": 9.050445103857567e-07, + "loss": 0.5193, + "mean_token_accuracy": 0.8418407440185547, + "num_tokens": 81687214.0, + "step": 2136 + }, + { + "epoch": 0.2718483653479201, + "ewc_loss": 0.020476898178458214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0238449249300174e-05, + "grad_norm": 20.19397735595703, + "learning_rate": 9.054684188215344e-07, + "loss": 0.4235, + "mean_token_accuracy": 0.8661314845085144, + "num_tokens": 81721437.0, + "step": 2137 + }, + { + "epoch": 0.27197557562651065, + "ewc_loss": 0.02049652487039566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0248262697132304e-05, + "grad_norm": 20.15416145324707, + "learning_rate": 9.058923272573124e-07, + "loss": 0.4966, + "mean_token_accuracy": 0.8437647223472595, + "num_tokens": 81756595.0, + "step": 2138 + }, + { + "epoch": 0.2721027859051011, + "ewc_loss": 0.020481299608945847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0240650226478465e-05, + "grad_norm": 20.16661834716797, + "learning_rate": 9.063162356930902e-07, + "loss": 0.4617, + "mean_token_accuracy": 
0.8557026386260986, + "num_tokens": 81796198.0, + "step": 2139 + }, + { + "epoch": 0.27222999618369165, + "ewc_loss": 0.020495334640145302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0247666978102643e-05, + "grad_norm": 20.226024627685547, + "learning_rate": 9.067401441288681e-07, + "loss": 0.4926, + "mean_token_accuracy": 0.8473474383354187, + "num_tokens": 81833002.0, + "step": 2140 + }, + { + "epoch": 0.2723572064622822, + "ewc_loss": 0.02055760845541954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0278804438712541e-05, + "grad_norm": 20.22521209716797, + "learning_rate": 9.07164052564646e-07, + "loss": 0.4639, + "mean_token_accuracy": 0.8491322994232178, + "num_tokens": 81872089.0, + "step": 2141 + }, + { + "epoch": 0.27248441674087265, + "ewc_loss": 0.020519187673926353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0259594091621693e-05, + "grad_norm": 20.23672103881836, + "learning_rate": 9.075879610004239e-07, + "loss": 0.499, + "mean_token_accuracy": 0.841457724571228, + "num_tokens": 81908356.0, + "step": 2142 + }, + { + "epoch": 0.2726116270194632, + "ewc_loss": 0.020537955686450005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0268978257954586e-05, + "grad_norm": 20.23056983947754, + "learning_rate": 9.080118694362017e-07, + "loss": 0.5047, + "mean_token_accuracy": 0.8388416767120361, + "num_tokens": 81950251.0, + "step": 2143 + }, + { + "epoch": 0.2727388372980537, + "ewc_loss": 0.020568450912833214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0284225027135108e-05, + "grad_norm": 20.21514892578125, + "learning_rate": 9.084357778719796e-07, + "loss": 0.4975, + "mean_token_accuracy": 0.8408372402191162, + "num_tokens": 81987157.0, + "step": 2144 + }, + { + "epoch": 0.2728660475766442, + "ewc_loss": 0.02061222679913044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0306113836122677e-05, + "grad_norm": 20.314531326293945, + "learning_rate": 9.088596863077574e-07, + "loss": 0.4811, + "mean_token_accuracy": 0.8526605367660522, + 
"num_tokens": 82023633.0, + "step": 2145 + }, + { + "epoch": 0.2729932578552347, + "ewc_loss": 0.020610980689525604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.030548992275726e-05, + "grad_norm": 20.203012466430664, + "learning_rate": 9.092835947435354e-07, + "loss": 0.4853, + "mean_token_accuracy": 0.843556821346283, + "num_tokens": 82061311.0, + "step": 2146 + }, + { + "epoch": 0.27312046813382523, + "ewc_loss": 0.020627055317163467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.031352803693153e-05, + "grad_norm": 20.316757202148438, + "learning_rate": 9.097075031793132e-07, + "loss": 0.5183, + "mean_token_accuracy": 0.8381121754646301, + "num_tokens": 82098370.0, + "step": 2147 + }, + { + "epoch": 0.2732476784124157, + "ewc_loss": 0.020669113844633102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0334557373425923e-05, + "grad_norm": 20.289655685424805, + "learning_rate": 9.101314116150911e-07, + "loss": 0.4463, + "mean_token_accuracy": 0.8625993728637695, + "num_tokens": 82134931.0, + "step": 2148 + }, + { + "epoch": 0.27337488869100623, + "ewc_loss": 0.02059798501431942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0298992492607795e-05, + "grad_norm": 20.2137393951416, + "learning_rate": 9.10555320050869e-07, + "loss": 0.4548, + "mean_token_accuracy": 0.8566156029701233, + "num_tokens": 82171391.0, + "step": 2149 + }, + { + "epoch": 0.27350209896959676, + "ewc_loss": 0.0206792913377285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0339645996282343e-05, + "grad_norm": 20.514345169067383, + "learning_rate": 9.109792284866469e-07, + "loss": 0.4734, + "mean_token_accuracy": 0.8541526794433594, + "num_tokens": 82214064.0, + "step": 2150 + }, + { + "epoch": 0.27362930924818724, + "ewc_loss": 0.020645365118980408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0322682101104874e-05, + "grad_norm": 20.495933532714844, + "learning_rate": 9.114031369224247e-07, + "loss": 0.4908, + "mean_token_accuracy": 0.845565915107727, + "num_tokens": 82252774.0, 
+ "step": 2151 + }, + { + "epoch": 0.27375651952677776, + "ewc_loss": 0.020574934780597687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0287467375746928e-05, + "grad_norm": 20.192317962646484, + "learning_rate": 9.118270453582026e-07, + "loss": 0.5449, + "mean_token_accuracy": 0.8329070806503296, + "num_tokens": 82291645.0, + "step": 2152 + }, + { + "epoch": 0.2738837298053683, + "ewc_loss": 0.02057521976530552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0287610166415107e-05, + "grad_norm": 20.2440185546875, + "learning_rate": 9.122509537939804e-07, + "loss": 0.4757, + "mean_token_accuracy": 0.8449642658233643, + "num_tokens": 82333643.0, + "step": 2153 + }, + { + "epoch": 0.27401094008395877, + "ewc_loss": 0.020705342292785645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0352670869906433e-05, + "grad_norm": 20.452600479125977, + "learning_rate": 9.126748622297584e-07, + "loss": 0.5153, + "mean_token_accuracy": 0.8345049619674683, + "num_tokens": 82370792.0, + "step": 2154 + }, + { + "epoch": 0.2741381503625493, + "ewc_loss": 0.02066683955490589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0333419595554005e-05, + "grad_norm": 20.23042106628418, + "learning_rate": 9.130987706655362e-07, + "loss": 0.5067, + "mean_token_accuracy": 0.8401433229446411, + "num_tokens": 82408626.0, + "step": 2155 + }, + { + "epoch": 0.2742653606411398, + "ewc_loss": 0.020653456449508667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0326728443033062e-05, + "grad_norm": 20.239511489868164, + "learning_rate": 9.135226791013141e-07, + "loss": 0.5249, + "mean_token_accuracy": 0.8398538827896118, + "num_tokens": 82448401.0, + "step": 2156 + }, + { + "epoch": 0.2743925709197303, + "ewc_loss": 0.02073582075536251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.036791036312934e-05, + "grad_norm": 20.517000198364258, + "learning_rate": 9.13946587537092e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8446402549743652, + "num_tokens": 82476644.0, + "step": 2157 + }, + { + 
"epoch": 0.2745197811983208, + "ewc_loss": 0.02069789171218872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0348945579607971e-05, + "grad_norm": 20.202951431274414, + "learning_rate": 9.143704959728699e-07, + "loss": 0.4983, + "mean_token_accuracy": 0.8379902839660645, + "num_tokens": 82512722.0, + "step": 2158 + }, + { + "epoch": 0.27464699147691135, + "ewc_loss": 0.020734146237373352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0367072718509007e-05, + "grad_norm": 20.403079986572266, + "learning_rate": 9.147944044086476e-07, + "loss": 0.5194, + "mean_token_accuracy": 0.8362939953804016, + "num_tokens": 82545393.0, + "step": 2159 + }, + { + "epoch": 0.2747742017555018, + "ewc_loss": 0.020743679255247116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0371839380241e-05, + "grad_norm": 20.3182315826416, + "learning_rate": 9.152183128444255e-07, + "loss": 0.5154, + "mean_token_accuracy": 0.8432485461235046, + "num_tokens": 82586329.0, + "step": 2160 + }, + { + "epoch": 0.27490141203409235, + "ewc_loss": 0.02076103165745735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0380515959695913e-05, + "grad_norm": 20.327232360839844, + "learning_rate": 9.156422212802034e-07, + "loss": 0.5222, + "mean_token_accuracy": 0.8326183557510376, + "num_tokens": 82625563.0, + "step": 2161 + }, + { + "epoch": 0.2750286223126829, + "ewc_loss": 0.020786510780453682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0393255251983646e-05, + "grad_norm": 20.349628448486328, + "learning_rate": 9.160661297159813e-07, + "loss": 0.4602, + "mean_token_accuracy": 0.853864848613739, + "num_tokens": 82660030.0, + "step": 2162 + }, + { + "epoch": 0.27515583259127335, + "ewc_loss": 0.020839236676692963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0419618774903938e-05, + "grad_norm": 20.636171340942383, + "learning_rate": 9.164900381517592e-07, + "loss": 0.483, + "mean_token_accuracy": 0.8471206426620483, + "num_tokens": 82698794.0, + "step": 2163 + }, + { + "epoch": 
0.2752830428698639, + "ewc_loss": 0.020765405148267746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0382702384958975e-05, + "grad_norm": 20.21588134765625, + "learning_rate": 9.16913946587537e-07, + "loss": 0.4792, + "mean_token_accuracy": 0.8470509052276611, + "num_tokens": 82740015.0, + "step": 2164 + }, + { + "epoch": 0.2754102531484544, + "ewc_loss": 0.020763250067830086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0381624633737374e-05, + "grad_norm": 20.471193313598633, + "learning_rate": 9.17337855023315e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8425614833831787, + "num_tokens": 82775765.0, + "step": 2165 + }, + { + "epoch": 0.2755374634270449, + "ewc_loss": 0.020858434960246086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.042921758198645e-05, + "grad_norm": 20.398204803466797, + "learning_rate": 9.177617634590928e-07, + "loss": 0.4433, + "mean_token_accuracy": 0.8594381809234619, + "num_tokens": 82814324.0, + "step": 2166 + }, + { + "epoch": 0.2756646737056354, + "ewc_loss": 0.020808786153793335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0404392924101558e-05, + "grad_norm": 20.26655387878418, + "learning_rate": 9.181856718948706e-07, + "loss": 0.4313, + "mean_token_accuracy": 0.8615138530731201, + "num_tokens": 82851007.0, + "step": 2167 + }, + { + "epoch": 0.27579188398422594, + "ewc_loss": 0.020853033289313316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0426516382722184e-05, + "grad_norm": 20.299942016601562, + "learning_rate": 9.186095803306485e-07, + "loss": 0.475, + "mean_token_accuracy": 0.8480150699615479, + "num_tokens": 82886846.0, + "step": 2168 + }, + { + "epoch": 0.2759190942628164, + "ewc_loss": 0.020900782197713852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0450391528138425e-05, + "grad_norm": 20.421262741088867, + "learning_rate": 9.190334887664264e-07, + "loss": 0.5132, + "mean_token_accuracy": 0.8385841846466064, + "num_tokens": 82929607.0, + "step": 2169 + }, + { + "epoch": 0.27604630454140694, + 
"ewc_loss": 0.02089453861117363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0447269232827239e-05, + "grad_norm": 20.37260627746582, + "learning_rate": 9.194573972022043e-07, + "loss": 0.5727, + "mean_token_accuracy": 0.8268693685531616, + "num_tokens": 82965089.0, + "step": 2170 + }, + { + "epoch": 0.27617351481999747, + "ewc_loss": 0.020889580249786377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0444789950270206e-05, + "grad_norm": 20.324697494506836, + "learning_rate": 9.198813056379822e-07, + "loss": 0.5454, + "mean_token_accuracy": 0.8278478980064392, + "num_tokens": 83008345.0, + "step": 2171 + }, + { + "epoch": 0.27630072509858794, + "ewc_loss": 0.020927321165800095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.046366014634259e-05, + "grad_norm": 20.454608917236328, + "learning_rate": 9.2030521407376e-07, + "loss": 0.4988, + "mean_token_accuracy": 0.8406215906143188, + "num_tokens": 83043763.0, + "step": 2172 + }, + { + "epoch": 0.27642793537717847, + "ewc_loss": 0.020925769582390785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0462884347361978e-05, + "grad_norm": 20.323307037353516, + "learning_rate": 9.20729122509538e-07, + "loss": 0.4434, + "mean_token_accuracy": 0.860788881778717, + "num_tokens": 83081389.0, + "step": 2173 + }, + { + "epoch": 0.276555145655769, + "ewc_loss": 0.020916536450386047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.045826866175048e-05, + "grad_norm": 20.38777732849121, + "learning_rate": 9.211530309453158e-07, + "loss": 0.4821, + "mean_token_accuracy": 0.8461881875991821, + "num_tokens": 83120547.0, + "step": 2174 + }, + { + "epoch": 0.27668235593435947, + "ewc_loss": 0.02096095308661461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0480476703378372e-05, + "grad_norm": 20.356637954711914, + "learning_rate": 9.215769393810936e-07, + "loss": 0.489, + "mean_token_accuracy": 0.8485578298568726, + "num_tokens": 83157229.0, + "step": 2175 + }, + { + "epoch": 0.27680956621295, + "ewc_loss": 
0.020969675853848457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0484837730473373e-05, + "grad_norm": 20.37946128845215, + "learning_rate": 9.220008478168715e-07, + "loss": 0.5682, + "mean_token_accuracy": 0.8215847015380859, + "num_tokens": 83198217.0, + "step": 2176 + }, + { + "epoch": 0.2769367764915405, + "ewc_loss": 0.02097141370177269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0485707207408268e-05, + "grad_norm": 20.337467193603516, + "learning_rate": 9.224247562526494e-07, + "loss": 0.4887, + "mean_token_accuracy": 0.8438809514045715, + "num_tokens": 83238750.0, + "step": 2177 + }, + { + "epoch": 0.277063986770131, + "ewc_loss": 0.020998220890760422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0499110430828296e-05, + "grad_norm": 20.388011932373047, + "learning_rate": 9.228486646884273e-07, + "loss": 0.4754, + "mean_token_accuracy": 0.8484680652618408, + "num_tokens": 83280138.0, + "step": 2178 + }, + { + "epoch": 0.2771911970487215, + "ewc_loss": 0.021022457629442215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0511228538234718e-05, + "grad_norm": 20.39236831665039, + "learning_rate": 9.232725731242052e-07, + "loss": 0.531, + "mean_token_accuracy": 0.8320492506027222, + "num_tokens": 83320633.0, + "step": 2179 + }, + { + "epoch": 0.27731840732731206, + "ewc_loss": 0.021004851907491684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0502425539016258e-05, + "grad_norm": 20.360950469970703, + "learning_rate": 9.23696481559983e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8365592956542969, + "num_tokens": 83358787.0, + "step": 2180 + }, + { + "epoch": 0.2774456176059026, + "ewc_loss": 0.021040957421064377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.052047900884645e-05, + "grad_norm": 20.387569427490234, + "learning_rate": 9.24120389995761e-07, + "loss": 0.4552, + "mean_token_accuracy": 0.8555948734283447, + "num_tokens": 83400609.0, + "step": 2181 + }, + { + "epoch": 0.27757282788449306, + "ewc_loss": 0.021057816222310066, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0528908205742482e-05, + "grad_norm": 20.482698440551758, + "learning_rate": 9.245442984315387e-07, + "loss": 0.4711, + "mean_token_accuracy": 0.8501924276351929, + "num_tokens": 83436058.0, + "step": 2182 + }, + { + "epoch": 0.2777000381630836, + "ewc_loss": 0.02102922834455967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0514614587009419e-05, + "grad_norm": 20.371767044067383, + "learning_rate": 9.249682068673165e-07, + "loss": 0.5063, + "mean_token_accuracy": 0.8386629819869995, + "num_tokens": 83474542.0, + "step": 2183 + }, + { + "epoch": 0.2778272484416741, + "ewc_loss": 0.021037351340055466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0518675480852835e-05, + "grad_norm": 20.42672348022461, + "learning_rate": 9.253921153030945e-07, + "loss": 0.5444, + "mean_token_accuracy": 0.8294022679328918, + "num_tokens": 83515551.0, + "step": 2184 + }, + { + "epoch": 0.2779544587202646, + "ewc_loss": 0.021093957126140594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0546978955972008e-05, + "grad_norm": 20.538976669311523, + "learning_rate": 9.258160237388723e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8580552339553833, + "num_tokens": 83551826.0, + "step": 2185 + }, + { + "epoch": 0.2780816689988551, + "ewc_loss": 0.021082717925310135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0541359188209753e-05, + "grad_norm": 20.414390563964844, + "learning_rate": 9.262399321746503e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8495658040046692, + "num_tokens": 83586598.0, + "step": 2186 + }, + { + "epoch": 0.27820887927744564, + "ewc_loss": 0.021021192893385887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0510596439416986e-05, + "grad_norm": 20.406736373901367, + "learning_rate": 9.266638406104281e-07, + "loss": 0.5257, + "mean_token_accuracy": 0.8338297009468079, + "num_tokens": 83625996.0, + "step": 2187 + }, + { + "epoch": 0.2783360895560361, + "ewc_loss": 0.02112678997218609, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 1.056339533533901e-05, + "grad_norm": 20.44051742553711, + "learning_rate": 9.27087749046206e-07, + "loss": 0.4948, + "mean_token_accuracy": 0.8447253704071045, + "num_tokens": 83657322.0, + "step": 2188 + }, + { + "epoch": 0.27846329983462664, + "ewc_loss": 0.02112765982747078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0563830073806457e-05, + "grad_norm": 20.582740783691406, + "learning_rate": 9.275116574819839e-07, + "loss": 0.4892, + "mean_token_accuracy": 0.8450698852539062, + "num_tokens": 83693578.0, + "step": 2189 + }, + { + "epoch": 0.27859051011321717, + "ewc_loss": 0.021097183227539062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0548591490078252e-05, + "grad_norm": 20.410720825195312, + "learning_rate": 9.279355659177617e-07, + "loss": 0.4941, + "mean_token_accuracy": 0.8456887006759644, + "num_tokens": 83727672.0, + "step": 2190 + }, + { + "epoch": 0.27871772039180764, + "ewc_loss": 0.021122286096215248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.056114342645742e-05, + "grad_norm": 20.448673248291016, + "learning_rate": 9.283594743535395e-07, + "loss": 0.5379, + "mean_token_accuracy": 0.8335220217704773, + "num_tokens": 83761405.0, + "step": 2191 + }, + { + "epoch": 0.2788449306703982, + "ewc_loss": 0.021147191524505615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0573596227914095e-05, + "grad_norm": 20.420949935913086, + "learning_rate": 9.287833827893175e-07, + "loss": 0.4763, + "mean_token_accuracy": 0.8461787104606628, + "num_tokens": 83801564.0, + "step": 2192 + }, + { + "epoch": 0.2789721409489887, + "ewc_loss": 0.021150166168808937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0575083251751494e-05, + "grad_norm": 20.461328506469727, + "learning_rate": 9.292072912250953e-07, + "loss": 0.507, + "mean_token_accuracy": 0.8394747376441956, + "num_tokens": 83834260.0, + "step": 2193 + }, + { + "epoch": 0.2790993512275792, + "ewc_loss": 0.02118450030684471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.0592249964247458e-05, + "grad_norm": 20.489940643310547, + "learning_rate": 9.296311996608733e-07, + "loss": 0.4342, + "mean_token_accuracy": 0.8615466356277466, + "num_tokens": 83873518.0, + "step": 2194 + }, + { + "epoch": 0.2792265615061697, + "ewc_loss": 0.021222760900855064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.061138027580455e-05, + "grad_norm": 20.479400634765625, + "learning_rate": 9.300551080966511e-07, + "loss": 0.5491, + "mean_token_accuracy": 0.8271375894546509, + "num_tokens": 83913902.0, + "step": 2195 + }, + { + "epoch": 0.27935377178476023, + "ewc_loss": 0.021225912496447563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0612956430122722e-05, + "grad_norm": 20.493667602539062, + "learning_rate": 9.30479016532429e-07, + "loss": 0.4759, + "mean_token_accuracy": 0.8453124761581421, + "num_tokens": 83947952.0, + "step": 2196 + }, + { + "epoch": 0.2794809820633507, + "ewc_loss": 0.02124243602156639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0621218279993627e-05, + "grad_norm": 20.45447540283203, + "learning_rate": 9.309029249682068e-07, + "loss": 0.4849, + "mean_token_accuracy": 0.8475451469421387, + "num_tokens": 83986057.0, + "step": 2197 + }, + { + "epoch": 0.27960819234194123, + "ewc_loss": 0.02123190462589264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0615952305670362e-05, + "grad_norm": 20.42250633239746, + "learning_rate": 9.313268334039847e-07, + "loss": 0.4949, + "mean_token_accuracy": 0.8432261943817139, + "num_tokens": 84023881.0, + "step": 2198 + }, + { + "epoch": 0.27973540262053176, + "ewc_loss": 0.021289313212037086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0644656867953017e-05, + "grad_norm": 20.544857025146484, + "learning_rate": 9.317507418397625e-07, + "loss": 0.4278, + "mean_token_accuracy": 0.8653059601783752, + "num_tokens": 84061480.0, + "step": 2199 + }, + { + "epoch": 0.27986261289912223, + "ewc_loss": 0.02130088210105896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.0650441254256293e-05, + "grad_norm": 20.45301628112793, + "learning_rate": 9.321746502755404e-07, + "loss": 0.5312, + "mean_token_accuracy": 0.8346137404441833, + "num_tokens": 84101782.0, + "step": 2200 + }, + { + "epoch": 0.27998982317771276, + "ewc_loss": 0.02130937948822975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0654689504008275e-05, + "grad_norm": 20.537546157836914, + "learning_rate": 9.325985587113183e-07, + "loss": 0.4889, + "mean_token_accuracy": 0.8453060984611511, + "num_tokens": 84136125.0, + "step": 2201 + }, + { + "epoch": 0.2801170334563033, + "ewc_loss": 0.02129310555756092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0646553164406214e-05, + "grad_norm": 20.4080810546875, + "learning_rate": 9.330224671470962e-07, + "loss": 0.5033, + "mean_token_accuracy": 0.8459835052490234, + "num_tokens": 84169748.0, + "step": 2202 + }, + { + "epoch": 0.28024424373489376, + "ewc_loss": 0.02134879119694233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0674395525711589e-05, + "grad_norm": 20.45820426940918, + "learning_rate": 9.334463755828741e-07, + "loss": 0.5186, + "mean_token_accuracy": 0.8336297869682312, + "num_tokens": 84201292.0, + "step": 2203 + }, + { + "epoch": 0.2803714540134843, + "ewc_loss": 0.021394925191998482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0697463039832655e-05, + "grad_norm": 20.54776382446289, + "learning_rate": 9.338702840186519e-07, + "loss": 0.5184, + "mean_token_accuracy": 0.8348649740219116, + "num_tokens": 84240652.0, + "step": 2204 + }, + { + "epoch": 0.2804986642920748, + "ewc_loss": 0.02139868587255478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.069934296538122e-05, + "grad_norm": 20.551496505737305, + "learning_rate": 9.342941924544298e-07, + "loss": 0.5099, + "mean_token_accuracy": 0.8401731252670288, + "num_tokens": 84280252.0, + "step": 2205 + }, + { + "epoch": 0.2806258745706653, + "ewc_loss": 0.021428152918815613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0714076779549941e-05, + 
"grad_norm": 20.503124237060547, + "learning_rate": 9.347181008902076e-07, + "loss": 0.4469, + "mean_token_accuracy": 0.8615530729293823, + "num_tokens": 84316505.0, + "step": 2206 + }, + { + "epoch": 0.2807530848492558, + "ewc_loss": 0.021425114944577217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0712557013903279e-05, + "grad_norm": 20.590015411376953, + "learning_rate": 9.351420093259855e-07, + "loss": 0.4504, + "mean_token_accuracy": 0.8565901517868042, + "num_tokens": 84349990.0, + "step": 2207 + }, + { + "epoch": 0.28088029512784635, + "ewc_loss": 0.02140691690146923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0703458428906742e-05, + "grad_norm": 20.48016357421875, + "learning_rate": 9.355659177617634e-07, + "loss": 0.4399, + "mean_token_accuracy": 0.8582042455673218, + "num_tokens": 84381066.0, + "step": 2208 + }, + { + "epoch": 0.2810075054064368, + "ewc_loss": 0.021460775285959244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0730387657531537e-05, + "grad_norm": 20.530981063842773, + "learning_rate": 9.359898261975413e-07, + "loss": 0.4305, + "mean_token_accuracy": 0.8637688159942627, + "num_tokens": 84422540.0, + "step": 2209 + }, + { + "epoch": 0.28113471568502735, + "ewc_loss": 0.021442459896206856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0721229955379386e-05, + "grad_norm": 20.50196075439453, + "learning_rate": 9.364137346333192e-07, + "loss": 0.5592, + "mean_token_accuracy": 0.8254223465919495, + "num_tokens": 84462158.0, + "step": 2210 + }, + { + "epoch": 0.2812619259636179, + "ewc_loss": 0.021502507850527763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0751254194474313e-05, + "grad_norm": 20.58467674255371, + "learning_rate": 9.368376430690971e-07, + "loss": 0.4482, + "mean_token_accuracy": 0.8573523759841919, + "num_tokens": 84495887.0, + "step": 2211 + }, + { + "epoch": 0.28138913624220835, + "ewc_loss": 0.02150207944214344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0751039553724695e-05, + "grad_norm": 
20.604284286499023, + "learning_rate": 9.372615515048749e-07, + "loss": 0.5219, + "mean_token_accuracy": 0.8366892337799072, + "num_tokens": 84530176.0, + "step": 2212 + }, + { + "epoch": 0.2815163465207989, + "ewc_loss": 0.021491030231118202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0745515282906126e-05, + "grad_norm": 20.534626007080078, + "learning_rate": 9.376854599406528e-07, + "loss": 0.5227, + "mean_token_accuracy": 0.8374467492103577, + "num_tokens": 84574450.0, + "step": 2213 + }, + { + "epoch": 0.2816435567993894, + "ewc_loss": 0.02148190513253212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0740952347987331e-05, + "grad_norm": 20.589313507080078, + "learning_rate": 9.381093683764306e-07, + "loss": 0.4903, + "mean_token_accuracy": 0.8461105227470398, + "num_tokens": 84610661.0, + "step": 2214 + }, + { + "epoch": 0.2817707670779799, + "ewc_loss": 0.021539602428674698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0769801519927569e-05, + "grad_norm": 20.526077270507812, + "learning_rate": 9.385332768122085e-07, + "loss": 0.472, + "mean_token_accuracy": 0.8496163487434387, + "num_tokens": 84651259.0, + "step": 2215 + }, + { + "epoch": 0.2818979773565704, + "ewc_loss": 0.021520715206861496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0760357326944359e-05, + "grad_norm": 20.641202926635742, + "learning_rate": 9.389571852479864e-07, + "loss": 0.477, + "mean_token_accuracy": 0.8501003980636597, + "num_tokens": 84692389.0, + "step": 2216 + }, + { + "epoch": 0.28202518763516093, + "ewc_loss": 0.0215182825922966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0759141332528088e-05, + "grad_norm": 20.69756507873535, + "learning_rate": 9.393810936837643e-07, + "loss": 0.4799, + "mean_token_accuracy": 0.848967432975769, + "num_tokens": 84731905.0, + "step": 2217 + }, + { + "epoch": 0.2821523979137514, + "ewc_loss": 0.02147342637181282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0736713193182368e-05, + "grad_norm": 20.609272003173828, + 
"learning_rate": 9.398050021195422e-07, + "loss": 0.5082, + "mean_token_accuracy": 0.8441148996353149, + "num_tokens": 84773727.0, + "step": 2218 + }, + { + "epoch": 0.28227960819234194, + "ewc_loss": 0.021536000072956085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0767999810923357e-05, + "grad_norm": 20.658700942993164, + "learning_rate": 9.402289105553201e-07, + "loss": 0.4529, + "mean_token_accuracy": 0.8556951880455017, + "num_tokens": 84816771.0, + "step": 2219 + }, + { + "epoch": 0.28240681847093246, + "ewc_loss": 0.021543370559811592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.077168508345494e-05, + "grad_norm": 20.60053253173828, + "learning_rate": 9.406528189910978e-07, + "loss": 0.4784, + "mean_token_accuracy": 0.8495950698852539, + "num_tokens": 84853172.0, + "step": 2220 + }, + { + "epoch": 0.28253402874952294, + "ewc_loss": 0.02153213880956173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0766068953671493e-05, + "grad_norm": 20.697139739990234, + "learning_rate": 9.410767274268757e-07, + "loss": 0.4594, + "mean_token_accuracy": 0.8515900373458862, + "num_tokens": 84891508.0, + "step": 2221 + }, + { + "epoch": 0.28266123902811346, + "ewc_loss": 0.021544797345995903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0772399036795832e-05, + "grad_norm": 20.61208152770996, + "learning_rate": 9.415006358626536e-07, + "loss": 0.5564, + "mean_token_accuracy": 0.8248635530471802, + "num_tokens": 84931204.0, + "step": 2222 + }, + { + "epoch": 0.282788449306704, + "ewc_loss": 0.021577823907136917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0788911822601222e-05, + "grad_norm": 20.693187713623047, + "learning_rate": 9.419245442984314e-07, + "loss": 0.5391, + "mean_token_accuracy": 0.8309968709945679, + "num_tokens": 84966782.0, + "step": 2223 + }, + { + "epoch": 0.28291565958529447, + "ewc_loss": 0.02156822569668293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0784112419059966e-05, + "grad_norm": 20.645814895629883, + "learning_rate": 
9.423484527342094e-07, + "loss": 0.4838, + "mean_token_accuracy": 0.8442562818527222, + "num_tokens": 85001512.0, + "step": 2224 + }, + { + "epoch": 0.283042869863885, + "ewc_loss": 0.02158052660524845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0790263331728056e-05, + "grad_norm": 20.720125198364258, + "learning_rate": 9.427723611699872e-07, + "loss": 0.4925, + "mean_token_accuracy": 0.8460673689842224, + "num_tokens": 85036082.0, + "step": 2225 + }, + { + "epoch": 0.2831700801424755, + "ewc_loss": 0.02158968150615692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.079484081856208e-05, + "grad_norm": 20.652589797973633, + "learning_rate": 9.431962696057652e-07, + "loss": 0.4976, + "mean_token_accuracy": 0.8440492749214172, + "num_tokens": 85079433.0, + "step": 2226 + }, + { + "epoch": 0.283297290421066, + "ewc_loss": 0.021614328026771545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.08071635622764e-05, + "grad_norm": 20.70400047302246, + "learning_rate": 9.43620178041543e-07, + "loss": 0.4489, + "mean_token_accuracy": 0.8598262667655945, + "num_tokens": 85116940.0, + "step": 2227 + }, + { + "epoch": 0.2834245006996565, + "ewc_loss": 0.021609319373965263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.080465972336242e-05, + "grad_norm": 20.644474029541016, + "learning_rate": 9.440440864773208e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.8432896137237549, + "num_tokens": 85153328.0, + "step": 2228 + }, + { + "epoch": 0.28355171097824705, + "ewc_loss": 0.021554511040449142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.07772557385033e-05, + "grad_norm": 20.612825393676758, + "learning_rate": 9.444679949130987e-07, + "loss": 0.5038, + "mean_token_accuracy": 0.8401942253112793, + "num_tokens": 85194059.0, + "step": 2229 + }, + { + "epoch": 0.2836789212568376, + "ewc_loss": 0.02162596769630909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0812983418873046e-05, + "grad_norm": 20.69355010986328, + "learning_rate": 9.448919033488766e-07, + "loss": 
0.5259, + "mean_token_accuracy": 0.8370428085327148, + "num_tokens": 85230820.0, + "step": 2230 + }, + { + "epoch": 0.28380613153542805, + "ewc_loss": 0.021637190133333206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0818595001182985e-05, + "grad_norm": 20.635080337524414, + "learning_rate": 9.453158117846544e-07, + "loss": 0.5064, + "mean_token_accuracy": 0.8421659469604492, + "num_tokens": 85268864.0, + "step": 2231 + }, + { + "epoch": 0.2839333418140186, + "ewc_loss": 0.021673399955034256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0836700312211178e-05, + "grad_norm": 20.734081268310547, + "learning_rate": 9.457397202204324e-07, + "loss": 0.5342, + "mean_token_accuracy": 0.832905113697052, + "num_tokens": 85311377.0, + "step": 2232 + }, + { + "epoch": 0.2840605520926091, + "ewc_loss": 0.02167666330933571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0838331945706159e-05, + "grad_norm": 20.635635375976562, + "learning_rate": 9.461636286562102e-07, + "loss": 0.4467, + "mean_token_accuracy": 0.8609998226165771, + "num_tokens": 85347082.0, + "step": 2233 + }, + { + "epoch": 0.2841877623711996, + "ewc_loss": 0.021734436973929405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0867218406929169e-05, + "grad_norm": 20.765087127685547, + "learning_rate": 9.465875370919882e-07, + "loss": 0.5366, + "mean_token_accuracy": 0.8343798518180847, + "num_tokens": 85384696.0, + "step": 2234 + }, + { + "epoch": 0.2843149726497901, + "ewc_loss": 0.021706493571400642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0853246749320533e-05, + "grad_norm": 20.726957321166992, + "learning_rate": 9.470114455277659e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8355691432952881, + "num_tokens": 85420192.0, + "step": 2235 + }, + { + "epoch": 0.28444218292838064, + "ewc_loss": 0.02172119915485382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0860600013984367e-05, + "grad_norm": 20.754785537719727, + "learning_rate": 9.474353539635438e-07, + "loss": 0.4711, + 
"mean_token_accuracy": 0.849457859992981, + "num_tokens": 85462281.0, + "step": 2236 + }, + { + "epoch": 0.2845693932069711, + "ewc_loss": 0.02173040062189102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0865200238185935e-05, + "grad_norm": 20.7817325592041, + "learning_rate": 9.478592623993217e-07, + "loss": 0.4781, + "mean_token_accuracy": 0.8476302623748779, + "num_tokens": 85495191.0, + "step": 2237 + }, + { + "epoch": 0.28469660348556164, + "ewc_loss": 0.021687133237719536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0843566997209564e-05, + "grad_norm": 20.758005142211914, + "learning_rate": 9.482831708350996e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8516977429389954, + "num_tokens": 85533803.0, + "step": 2238 + }, + { + "epoch": 0.28482381376415217, + "ewc_loss": 0.021705225110054016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0852612831513397e-05, + "grad_norm": 20.746004104614258, + "learning_rate": 9.487070792708775e-07, + "loss": 0.5223, + "mean_token_accuracy": 0.8380334377288818, + "num_tokens": 85573424.0, + "step": 2239 + }, + { + "epoch": 0.28495102404274264, + "ewc_loss": 0.021741362288594246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0870680853258818e-05, + "grad_norm": 20.77446746826172, + "learning_rate": 9.491309877066554e-07, + "loss": 0.5052, + "mean_token_accuracy": 0.8415855765342712, + "num_tokens": 85616423.0, + "step": 2240 + }, + { + "epoch": 0.28507823432133317, + "ewc_loss": 0.021749600768089294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0874799954763148e-05, + "grad_norm": 20.783796310424805, + "learning_rate": 9.495548961424332e-07, + "loss": 0.5321, + "mean_token_accuracy": 0.8331067562103271, + "num_tokens": 85658410.0, + "step": 2241 + }, + { + "epoch": 0.2852054445999237, + "ewc_loss": 0.02173043228685856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0865215699595865e-05, + "grad_norm": 20.705768585205078, + "learning_rate": 9.499788045782111e-07, + "loss": 0.4894, + 
"mean_token_accuracy": 0.84552001953125, + "num_tokens": 85695767.0, + "step": 2242 + }, + { + "epoch": 0.28533265487851417, + "ewc_loss": 0.021729258820414543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0864629075513221e-05, + "grad_norm": 20.70793914794922, + "learning_rate": 9.504027130139889e-07, + "loss": 0.505, + "mean_token_accuracy": 0.8410608768463135, + "num_tokens": 85737771.0, + "step": 2243 + }, + { + "epoch": 0.2854598651571047, + "ewc_loss": 0.021802810952067375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0901405403274111e-05, + "grad_norm": 20.794965744018555, + "learning_rate": 9.508266214497667e-07, + "loss": 0.4509, + "mean_token_accuracy": 0.8598580360412598, + "num_tokens": 85774789.0, + "step": 2244 + }, + { + "epoch": 0.2855870754356952, + "ewc_loss": 0.021733609959483147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0866804586839862e-05, + "grad_norm": 20.754865646362305, + "learning_rate": 9.512505298855447e-07, + "loss": 0.4901, + "mean_token_accuracy": 0.8479669690132141, + "num_tokens": 85817833.0, + "step": 2245 + }, + { + "epoch": 0.2857142857142857, + "ewc_loss": 0.021812008693814278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0906004717980977e-05, + "grad_norm": 20.851106643676758, + "learning_rate": 9.516744383213225e-07, + "loss": 0.434, + "mean_token_accuracy": 0.8615931868553162, + "num_tokens": 85854488.0, + "step": 2246 + }, + { + "epoch": 0.2858414959928762, + "ewc_loss": 0.02179539203643799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.089769557438558e-05, + "grad_norm": 20.77151870727539, + "learning_rate": 9.520983467571005e-07, + "loss": 0.5123, + "mean_token_accuracy": 0.8394989967346191, + "num_tokens": 85893746.0, + "step": 2247 + }, + { + "epoch": 0.28596870627146675, + "ewc_loss": 0.02175919897854328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0879599358304404e-05, + "grad_norm": 20.898908615112305, + "learning_rate": 9.525222551928783e-07, + "loss": 0.555, + "mean_token_accuracy": 
0.8329229354858398, + "num_tokens": 85932798.0, + "step": 2248 + }, + { + "epoch": 0.2860959165500572, + "ewc_loss": 0.021799258887767792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0899629160121549e-05, + "grad_norm": 20.791019439697266, + "learning_rate": 9.529461636286562e-07, + "loss": 0.4666, + "mean_token_accuracy": 0.8531416654586792, + "num_tokens": 85964446.0, + "step": 2249 + }, + { + "epoch": 0.28622312682864776, + "ewc_loss": 0.02176228165626526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0881140951823909e-05, + "grad_norm": 20.808635711669922, + "learning_rate": 9.533700720644341e-07, + "loss": 0.4432, + "mean_token_accuracy": 0.8600310683250427, + "num_tokens": 86000587.0, + "step": 2250 + }, + { + "epoch": 0.2863503371072383, + "ewc_loss": 0.021760523319244385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0880261470447294e-05, + "grad_norm": 20.713300704956055, + "learning_rate": 9.537939805002118e-07, + "loss": 0.4953, + "mean_token_accuracy": 0.84482741355896, + "num_tokens": 86042891.0, + "step": 2251 + }, + { + "epoch": 0.28647754738582876, + "ewc_loss": 0.021860577166080475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0930288226518314e-05, + "grad_norm": 20.870943069458008, + "learning_rate": 9.542178889359898e-07, + "loss": 0.4726, + "mean_token_accuracy": 0.8516620397567749, + "num_tokens": 86079973.0, + "step": 2252 + }, + { + "epoch": 0.2866047576644193, + "ewc_loss": 0.021864216774702072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0932108125416562e-05, + "grad_norm": 20.763641357421875, + "learning_rate": 9.546417973717677e-07, + "loss": 0.496, + "mean_token_accuracy": 0.8450913429260254, + "num_tokens": 86119767.0, + "step": 2253 + }, + { + "epoch": 0.2867319679430098, + "ewc_loss": 0.02185967192053795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0929836207651533e-05, + "grad_norm": 20.857685089111328, + "learning_rate": 9.550657058075455e-07, + "loss": 0.517, + "mean_token_accuracy": 0.8402068614959717, + 
"num_tokens": 86163176.0, + "step": 2254 + }, + { + "epoch": 0.2868591782216003, + "ewc_loss": 0.02188650704920292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.094325398298679e-05, + "grad_norm": 20.799741744995117, + "learning_rate": 9.554896142433234e-07, + "loss": 0.5131, + "mean_token_accuracy": 0.8369611501693726, + "num_tokens": 86201523.0, + "step": 2255 + }, + { + "epoch": 0.2869863885001908, + "ewc_loss": 0.021876508370041847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0938254490611143e-05, + "grad_norm": 20.788711547851562, + "learning_rate": 9.559135226791012e-07, + "loss": 0.5305, + "mean_token_accuracy": 0.8295166492462158, + "num_tokens": 86240277.0, + "step": 2256 + }, + { + "epoch": 0.28711359877878134, + "ewc_loss": 0.021928327158093452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0964163266180549e-05, + "grad_norm": 20.83926010131836, + "learning_rate": 9.563374311148793e-07, + "loss": 0.5207, + "mean_token_accuracy": 0.8383408784866333, + "num_tokens": 86289714.0, + "step": 2257 + }, + { + "epoch": 0.2872408090573718, + "ewc_loss": 0.021942326799035072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0971163646900095e-05, + "grad_norm": 20.847396850585938, + "learning_rate": 9.56761339550657e-07, + "loss": 0.4712, + "mean_token_accuracy": 0.8530787229537964, + "num_tokens": 86320786.0, + "step": 2258 + }, + { + "epoch": 0.28736801933596234, + "ewc_loss": 0.021933380514383316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0966689842462074e-05, + "grad_norm": 20.83800506591797, + "learning_rate": 9.57185247986435e-07, + "loss": 0.4615, + "mean_token_accuracy": 0.8548309803009033, + "num_tokens": 86358002.0, + "step": 2259 + }, + { + "epoch": 0.28749522961455287, + "ewc_loss": 0.02192927710711956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0964638931909576e-05, + "grad_norm": 20.85711097717285, + "learning_rate": 9.576091564222128e-07, + "loss": 0.4294, + "mean_token_accuracy": 0.8621869683265686, + "num_tokens": 
86395334.0, + "step": 2260 + }, + { + "epoch": 0.28762243989314334, + "ewc_loss": 0.02197921834886074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0989609108946752e-05, + "grad_norm": 20.874616622924805, + "learning_rate": 9.580330648579906e-07, + "loss": 0.5071, + "mean_token_accuracy": 0.8374916315078735, + "num_tokens": 86432616.0, + "step": 2261 + }, + { + "epoch": 0.2877496501717339, + "ewc_loss": 0.02193816937506199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0969084542011842e-05, + "grad_norm": 20.846017837524414, + "learning_rate": 9.584569732937685e-07, + "loss": 0.5003, + "mean_token_accuracy": 0.8410059213638306, + "num_tokens": 86472293.0, + "step": 2262 + }, + { + "epoch": 0.2878768604503244, + "ewc_loss": 0.021991079673171043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0995539923897013e-05, + "grad_norm": 20.913305282592773, + "learning_rate": 9.588808817295463e-07, + "loss": 0.4942, + "mean_token_accuracy": 0.8403077125549316, + "num_tokens": 86506304.0, + "step": 2263 + }, + { + "epoch": 0.2880040707289149, + "ewc_loss": 0.021964088082313538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0982043932017405e-05, + "grad_norm": 20.88068962097168, + "learning_rate": 9.593047901653242e-07, + "loss": 0.4904, + "mean_token_accuracy": 0.8427624702453613, + "num_tokens": 86540520.0, + "step": 2264 + }, + { + "epoch": 0.2881312810075054, + "ewc_loss": 0.022009601816534996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1004801308445167e-05, + "grad_norm": 21.139101028442383, + "learning_rate": 9.597286986011022e-07, + "loss": 0.4506, + "mean_token_accuracy": 0.855800986289978, + "num_tokens": 86576956.0, + "step": 2265 + }, + { + "epoch": 0.28825849128609593, + "ewc_loss": 0.021992681547999382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0996341188729275e-05, + "grad_norm": 20.818735122680664, + "learning_rate": 9.601526070368799e-07, + "loss": 0.5358, + "mean_token_accuracy": 0.832700252532959, + "num_tokens": 86613799.0, + "step": 
2266 + }, + { + "epoch": 0.2883857015646864, + "ewc_loss": 0.02195633389055729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.0978166756103747e-05, + "grad_norm": 20.86658477783203, + "learning_rate": 9.60576515472658e-07, + "loss": 0.4727, + "mean_token_accuracy": 0.850495457649231, + "num_tokens": 86650338.0, + "step": 2267 + }, + { + "epoch": 0.28851291184327693, + "ewc_loss": 0.022036850452423096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1018425539077725e-05, + "grad_norm": 20.977092742919922, + "learning_rate": 9.610004239084358e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8448265194892883, + "num_tokens": 86691605.0, + "step": 2268 + }, + { + "epoch": 0.28864012212186746, + "ewc_loss": 0.02204531989991665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.102266014640918e-05, + "grad_norm": 20.920183181762695, + "learning_rate": 9.614243323442136e-07, + "loss": 0.5545, + "mean_token_accuracy": 0.8239103555679321, + "num_tokens": 86728767.0, + "step": 2269 + }, + { + "epoch": 0.28876733240045793, + "ewc_loss": 0.02202446572482586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1012232789653353e-05, + "grad_norm": 20.925640106201172, + "learning_rate": 9.618482407799915e-07, + "loss": 0.4681, + "mean_token_accuracy": 0.850805938243866, + "num_tokens": 86765240.0, + "step": 2270 + }, + { + "epoch": 0.28889454267904846, + "ewc_loss": 0.02205171436071396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1025857020285912e-05, + "grad_norm": 20.830236434936523, + "learning_rate": 9.622721492157693e-07, + "loss": 0.4347, + "mean_token_accuracy": 0.8632091283798218, + "num_tokens": 86804042.0, + "step": 2271 + }, + { + "epoch": 0.289021752957639, + "ewc_loss": 0.022083759307861328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1041879588447046e-05, + "grad_norm": 20.947406768798828, + "learning_rate": 9.626960576515472e-07, + "loss": 0.4874, + "mean_token_accuracy": 0.8459158539772034, + "num_tokens": 86835220.0, + "step": 2272 + }, + { + "epoch": 
0.28914896323622946, + "ewc_loss": 0.022076202556490898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.103810154745588e-05, + "grad_norm": 20.885704040527344, + "learning_rate": 9.63119966087325e-07, + "loss": 0.4723, + "mean_token_accuracy": 0.8491201400756836, + "num_tokens": 86873414.0, + "step": 2273 + }, + { + "epoch": 0.28927617351482, + "ewc_loss": 0.0221701730042696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1085086953244172e-05, + "grad_norm": 20.987613677978516, + "learning_rate": 9.635438745231029e-07, + "loss": 0.4616, + "mean_token_accuracy": 0.8544687032699585, + "num_tokens": 86914272.0, + "step": 2274 + }, + { + "epoch": 0.2894033837934105, + "ewc_loss": 0.022132134065032005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1066066690545995e-05, + "grad_norm": 20.882713317871094, + "learning_rate": 9.63967782958881e-07, + "loss": 0.5039, + "mean_token_accuracy": 0.8381575345993042, + "num_tokens": 86952425.0, + "step": 2275 + }, + { + "epoch": 0.289530594072001, + "ewc_loss": 0.022176340222358704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1088170140283182e-05, + "grad_norm": 20.944671630859375, + "learning_rate": 9.643916913946588e-07, + "loss": 0.4619, + "mean_token_accuracy": 0.8551029562950134, + "num_tokens": 86987842.0, + "step": 2276 + }, + { + "epoch": 0.2896578043505915, + "ewc_loss": 0.02220088057219982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.11004401333048e-05, + "grad_norm": 20.925878524780273, + "learning_rate": 9.648155998304366e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8626964092254639, + "num_tokens": 87026750.0, + "step": 2277 + }, + { + "epoch": 0.28978501462918205, + "ewc_loss": 0.022172406315803528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1086202903243247e-05, + "grad_norm": 20.896377563476562, + "learning_rate": 9.652395082662145e-07, + "loss": 0.5384, + "mean_token_accuracy": 0.8301069736480713, + "num_tokens": 87069685.0, + "step": 2278 + }, + { + "epoch": 0.2899122249077726, + 
"ewc_loss": 0.022234732285141945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1117365829704795e-05, + "grad_norm": 20.942012786865234, + "learning_rate": 9.656634167019923e-07, + "loss": 0.4951, + "mean_token_accuracy": 0.8404913544654846, + "num_tokens": 87107054.0, + "step": 2279 + }, + { + "epoch": 0.29003943518636305, + "ewc_loss": 0.02221350558102131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1106752936029807e-05, + "grad_norm": 20.966833114624023, + "learning_rate": 9.660873251377701e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8412691354751587, + "num_tokens": 87146254.0, + "step": 2280 + }, + { + "epoch": 0.2901666454649536, + "ewc_loss": 0.02222844958305359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1114224435004871e-05, + "grad_norm": 20.922039031982422, + "learning_rate": 9.66511233573548e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8386185169219971, + "num_tokens": 87182404.0, + "step": 2281 + }, + { + "epoch": 0.2902938557435441, + "ewc_loss": 0.02225918136537075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1129590347991325e-05, + "grad_norm": 21.075382232666016, + "learning_rate": 9.669351420093258e-07, + "loss": 0.4571, + "mean_token_accuracy": 0.8557165861129761, + "num_tokens": 87216551.0, + "step": 2282 + }, + { + "epoch": 0.2904210660221346, + "ewc_loss": 0.022236838936805725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1118419024569448e-05, + "grad_norm": 20.99445152282715, + "learning_rate": 9.67359050445104e-07, + "loss": 0.4837, + "mean_token_accuracy": 0.8469655513763428, + "num_tokens": 87249402.0, + "step": 2283 + }, + { + "epoch": 0.2905482763007251, + "ewc_loss": 0.02224600687623024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1123003787361085e-05, + "grad_norm": 21.144559860229492, + "learning_rate": 9.677829588808817e-07, + "loss": 0.4688, + "mean_token_accuracy": 0.8488768339157104, + "num_tokens": 87281846.0, + "step": 2284 + }, + { + "epoch": 0.29067548657931563, + "ewc_loss": 
0.022272810339927673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.113640519179171e-05, + "grad_norm": 21.045995712280273, + "learning_rate": 9.682068673166596e-07, + "loss": 0.5402, + "mean_token_accuracy": 0.8339425325393677, + "num_tokens": 87315765.0, + "step": 2285 + }, + { + "epoch": 0.2908026968579061, + "ewc_loss": 0.02224060706794262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1120303497591522e-05, + "grad_norm": 21.0983943939209, + "learning_rate": 9.686307757524374e-07, + "loss": 0.481, + "mean_token_accuracy": 0.8468279838562012, + "num_tokens": 87353668.0, + "step": 2286 + }, + { + "epoch": 0.29092990713649663, + "ewc_loss": 0.022318094968795776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1159047062392347e-05, + "grad_norm": 21.231748580932617, + "learning_rate": 9.690546841882153e-07, + "loss": 0.412, + "mean_token_accuracy": 0.867777407169342, + "num_tokens": 87390747.0, + "step": 2287 + }, + { + "epoch": 0.29105711741508716, + "ewc_loss": 0.0222634207457304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1131710380141158e-05, + "grad_norm": 21.04859161376953, + "learning_rate": 9.694785926239931e-07, + "loss": 0.4946, + "mean_token_accuracy": 0.8467787504196167, + "num_tokens": 87430686.0, + "step": 2288 + }, + { + "epoch": 0.29118432769367764, + "ewc_loss": 0.022246956825256348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1123478543595411e-05, + "grad_norm": 21.227338790893555, + "learning_rate": 9.69902501059771e-07, + "loss": 0.5353, + "mean_token_accuracy": 0.8316014409065247, + "num_tokens": 87465290.0, + "step": 2289 + }, + { + "epoch": 0.29131153797226816, + "ewc_loss": 0.02230031043291092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1150154932693113e-05, + "grad_norm": 21.066070556640625, + "learning_rate": 9.703264094955488e-07, + "loss": 0.5041, + "mean_token_accuracy": 0.8399837017059326, + "num_tokens": 87504553.0, + "step": 2290 + }, + { + "epoch": 0.2914387482508587, + "ewc_loss": 0.022199247032403946, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1099623407062609e-05, + "grad_norm": 20.97970962524414, + "learning_rate": 9.707503179313269e-07, + "loss": 0.4718, + "mean_token_accuracy": 0.8522521257400513, + "num_tokens": 87544896.0, + "step": 2291 + }, + { + "epoch": 0.29156595852944917, + "ewc_loss": 0.022351829335093498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1175914551131427e-05, + "grad_norm": 21.29713249206543, + "learning_rate": 9.711742263671047e-07, + "loss": 0.4721, + "mean_token_accuracy": 0.8513125777244568, + "num_tokens": 87582075.0, + "step": 2292 + }, + { + "epoch": 0.2916931688080397, + "ewc_loss": 0.022303763777017593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.115188206313178e-05, + "grad_norm": 20.923297882080078, + "learning_rate": 9.715981348028826e-07, + "loss": 0.4662, + "mean_token_accuracy": 0.8519884347915649, + "num_tokens": 87623117.0, + "step": 2293 + }, + { + "epoch": 0.2918203790866302, + "ewc_loss": 0.022283660247921944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1141830327687785e-05, + "grad_norm": 21.3372859954834, + "learning_rate": 9.720220432386604e-07, + "loss": 0.4817, + "mean_token_accuracy": 0.8499336242675781, + "num_tokens": 87656263.0, + "step": 2294 + }, + { + "epoch": 0.2919475893652207, + "ewc_loss": 0.02236899733543396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.118449836212676e-05, + "grad_norm": 21.17525291442871, + "learning_rate": 9.724459516744383e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8569329977035522, + "num_tokens": 87695852.0, + "step": 2295 + }, + { + "epoch": 0.2920747996438112, + "ewc_loss": 0.022274408489465714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1137204637634568e-05, + "grad_norm": 21.10403823852539, + "learning_rate": 9.728698601102161e-07, + "loss": 0.4533, + "mean_token_accuracy": 0.8576443195343018, + "num_tokens": 87734331.0, + "step": 2296 + }, + { + "epoch": 0.29220200992240175, + "ewc_loss": 0.022337323054671288, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.1168661330884788e-05, + "grad_norm": 21.138395309448242, + "learning_rate": 9.73293768545994e-07, + "loss": 0.4871, + "mean_token_accuracy": 0.8419958353042603, + "num_tokens": 87768447.0, + "step": 2297 + }, + { + "epoch": 0.2923292202009922, + "ewc_loss": 0.02234778366982937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1173891834914684e-05, + "grad_norm": 21.29429054260254, + "learning_rate": 9.737176769817718e-07, + "loss": 0.5142, + "mean_token_accuracy": 0.8435752987861633, + "num_tokens": 87813213.0, + "step": 2298 + }, + { + "epoch": 0.29245643047958275, + "ewc_loss": 0.022323761135339737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.116188013838837e-05, + "grad_norm": 21.065759658813477, + "learning_rate": 9.741415854175499e-07, + "loss": 0.5509, + "mean_token_accuracy": 0.8318121433258057, + "num_tokens": 87851336.0, + "step": 2299 + }, + { + "epoch": 0.2925836407581733, + "ewc_loss": 0.022322669625282288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1161334441567305e-05, + "grad_norm": 21.172388076782227, + "learning_rate": 9.745654938533277e-07, + "loss": 0.5048, + "mean_token_accuracy": 0.8404406309127808, + "num_tokens": 87888474.0, + "step": 2300 + }, + { + "epoch": 0.29271085103676375, + "ewc_loss": 0.02238033153116703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1190165423613507e-05, + "grad_norm": 21.295875549316406, + "learning_rate": 9.749894022891056e-07, + "loss": 0.5287, + "mean_token_accuracy": 0.8362441062927246, + "num_tokens": 87928232.0, + "step": 2301 + }, + { + "epoch": 0.2928380613153543, + "ewc_loss": 0.0223530326038599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1176516636624001e-05, + "grad_norm": 20.958904266357422, + "learning_rate": 9.754133107248834e-07, + "loss": 0.4457, + "mean_token_accuracy": 0.8578164577484131, + "num_tokens": 87966982.0, + "step": 2302 + }, + { + "epoch": 0.2929652715939448, + "ewc_loss": 0.022315479815006256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.1157740118505899e-05, + "grad_norm": 21.33988380432129, + "learning_rate": 9.758372191606612e-07, + "loss": 0.533, + "mean_token_accuracy": 0.8315165042877197, + "num_tokens": 88003550.0, + "step": 2303 + }, + { + "epoch": 0.2930924818725353, + "ewc_loss": 0.022445714101195335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1222857210668735e-05, + "grad_norm": 21.13069725036621, + "learning_rate": 9.76261127596439e-07, + "loss": 0.443, + "mean_token_accuracy": 0.860843300819397, + "num_tokens": 88038038.0, + "step": 2304 + }, + { + "epoch": 0.2932196921511258, + "ewc_loss": 0.022325549274683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1162774171680212e-05, + "grad_norm": 21.027664184570312, + "learning_rate": 9.76685036032217e-07, + "loss": 0.516, + "mean_token_accuracy": 0.8347407579421997, + "num_tokens": 88074627.0, + "step": 2305 + }, + { + "epoch": 0.29334690242971634, + "ewc_loss": 0.0224689282476902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1234464182052761e-05, + "grad_norm": 21.354055404663086, + "learning_rate": 9.771089444679948e-07, + "loss": 0.4574, + "mean_token_accuracy": 0.8551349639892578, + "num_tokens": 88116772.0, + "step": 2306 + }, + { + "epoch": 0.2934741127083068, + "ewc_loss": 0.022438280284404755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.121914010582259e-05, + "grad_norm": 21.02066993713379, + "learning_rate": 9.775328529037728e-07, + "loss": 0.5032, + "mean_token_accuracy": 0.8404359817504883, + "num_tokens": 88152888.0, + "step": 2307 + }, + { + "epoch": 0.29360132298689734, + "ewc_loss": 0.022448977455496788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1224488844163716e-05, + "grad_norm": 21.40621566772461, + "learning_rate": 9.779567613395507e-07, + "loss": 0.5332, + "mean_token_accuracy": 0.835949182510376, + "num_tokens": 88184544.0, + "step": 2308 + }, + { + "epoch": 0.29372853326548787, + "ewc_loss": 0.022495560348033905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1247780093981419e-05, + 
"grad_norm": 21.053234100341797, + "learning_rate": 9.783806697753285e-07, + "loss": 0.4764, + "mean_token_accuracy": 0.8507962226867676, + "num_tokens": 88221603.0, + "step": 2309 + }, + { + "epoch": 0.29385574354407834, + "ewc_loss": 0.022515665739774704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1257832738920115e-05, + "grad_norm": 21.402210235595703, + "learning_rate": 9.788045782111064e-07, + "loss": 0.4507, + "mean_token_accuracy": 0.857010543346405, + "num_tokens": 88265109.0, + "step": 2310 + }, + { + "epoch": 0.29398295382266887, + "ewc_loss": 0.02250620350241661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1253101547481492e-05, + "grad_norm": 21.224346160888672, + "learning_rate": 9.792284866468842e-07, + "loss": 0.4405, + "mean_token_accuracy": 0.8595558404922485, + "num_tokens": 88302756.0, + "step": 2311 + }, + { + "epoch": 0.2941101641012594, + "ewc_loss": 0.022462891414761543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1231445569137577e-05, + "grad_norm": 21.10515594482422, + "learning_rate": 9.79652395082662e-07, + "loss": 0.4413, + "mean_token_accuracy": 0.8554595112800598, + "num_tokens": 88335605.0, + "step": 2312 + }, + { + "epoch": 0.29423737437984987, + "ewc_loss": 0.022479863837361336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.123993206419982e-05, + "grad_norm": 21.062894821166992, + "learning_rate": 9.8007630351844e-07, + "loss": 0.5108, + "mean_token_accuracy": 0.8408764600753784, + "num_tokens": 88369447.0, + "step": 2313 + }, + { + "epoch": 0.2943645846584404, + "ewc_loss": 0.022588882595300674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1294441719655879e-05, + "grad_norm": 21.237642288208008, + "learning_rate": 9.805002119542178e-07, + "loss": 0.541, + "mean_token_accuracy": 0.8300524950027466, + "num_tokens": 88404442.0, + "step": 2314 + }, + { + "epoch": 0.2944917949370309, + "ewc_loss": 0.022542858496308327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.127142968471162e-05, + "grad_norm": 
21.102479934692383, + "learning_rate": 9.809241203899958e-07, + "loss": 0.4858, + "mean_token_accuracy": 0.8482165932655334, + "num_tokens": 88445870.0, + "step": 2315 + }, + { + "epoch": 0.2946190052156214, + "ewc_loss": 0.022562941536307335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1281470506219193e-05, + "grad_norm": 21.17797088623047, + "learning_rate": 9.813480288257737e-07, + "loss": 0.5009, + "mean_token_accuracy": 0.8393427729606628, + "num_tokens": 88480994.0, + "step": 2316 + }, + { + "epoch": 0.2947462154942119, + "ewc_loss": 0.022633465006947517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1316732525301632e-05, + "grad_norm": 21.209394454956055, + "learning_rate": 9.817719372615515e-07, + "loss": 0.4549, + "mean_token_accuracy": 0.8561539649963379, + "num_tokens": 88519666.0, + "step": 2317 + }, + { + "epoch": 0.29487342577280246, + "ewc_loss": 0.022619280964136124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1309640285617206e-05, + "grad_norm": 21.172969818115234, + "learning_rate": 9.821958456973294e-07, + "loss": 0.4302, + "mean_token_accuracy": 0.8622751832008362, + "num_tokens": 88553855.0, + "step": 2318 + }, + { + "epoch": 0.29500063605139293, + "ewc_loss": 0.022593554109334946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1296777302050032e-05, + "grad_norm": 21.19583511352539, + "learning_rate": 9.826197541331072e-07, + "loss": 0.4459, + "mean_token_accuracy": 0.8600761294364929, + "num_tokens": 88589053.0, + "step": 2319 + }, + { + "epoch": 0.29512784632998346, + "ewc_loss": 0.022663520649075508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1331760106259026e-05, + "grad_norm": 21.2637882232666, + "learning_rate": 9.83043662568885e-07, + "loss": 0.5044, + "mean_token_accuracy": 0.8421251773834229, + "num_tokens": 88627067.0, + "step": 2320 + }, + { + "epoch": 0.295255056608574, + "ewc_loss": 0.02271164581179619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1355822607583832e-05, + "grad_norm": 21.30193328857422, + 
"learning_rate": 9.83467571004663e-07, + "loss": 0.4479, + "mean_token_accuracy": 0.8571305274963379, + "num_tokens": 88668030.0, + "step": 2321 + }, + { + "epoch": 0.29538226688716446, + "ewc_loss": 0.022656721994280815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.13283613245585e-05, + "grad_norm": 21.214710235595703, + "learning_rate": 9.838914794404407e-07, + "loss": 0.4469, + "mean_token_accuracy": 0.859986424446106, + "num_tokens": 88708889.0, + "step": 2322 + }, + { + "epoch": 0.295509477165755, + "ewc_loss": 0.022610627114772797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1305313819320872e-05, + "grad_norm": 21.230676651000977, + "learning_rate": 9.843153878762188e-07, + "loss": 0.5133, + "mean_token_accuracy": 0.8415758609771729, + "num_tokens": 88745689.0, + "step": 2323 + }, + { + "epoch": 0.2956366874443455, + "ewc_loss": 0.022703658789396286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1351829016348347e-05, + "grad_norm": 21.341331481933594, + "learning_rate": 9.847392963119966e-07, + "loss": 0.4649, + "mean_token_accuracy": 0.8537961840629578, + "num_tokens": 88784667.0, + "step": 2324 + }, + { + "epoch": 0.295763897722936, + "ewc_loss": 0.02270027995109558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1350140084687155e-05, + "grad_norm": 21.139869689941406, + "learning_rate": 9.851632047477745e-07, + "loss": 0.4788, + "mean_token_accuracy": 0.8478546142578125, + "num_tokens": 88827334.0, + "step": 2325 + }, + { + "epoch": 0.2958911080015265, + "ewc_loss": 0.022716108709573746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1358054507581983e-05, + "grad_norm": 21.28221893310547, + "learning_rate": 9.855871131835523e-07, + "loss": 0.4122, + "mean_token_accuracy": 0.8675493597984314, + "num_tokens": 88863744.0, + "step": 2326 + }, + { + "epoch": 0.29601831828011704, + "ewc_loss": 0.02275274693965912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1376373549865093e-05, + "grad_norm": 21.276226043701172, + "learning_rate": 
9.860110216193302e-07, + "loss": 0.4443, + "mean_token_accuracy": 0.8607494831085205, + "num_tokens": 88903693.0, + "step": 2327 + }, + { + "epoch": 0.2961455285587075, + "ewc_loss": 0.022698163986206055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1349082342348993e-05, + "grad_norm": 21.235742568969727, + "learning_rate": 9.86434930055108e-07, + "loss": 0.4404, + "mean_token_accuracy": 0.8600866794586182, + "num_tokens": 88936668.0, + "step": 2328 + }, + { + "epoch": 0.29627273883729804, + "ewc_loss": 0.02269558049738407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1347789950377773e-05, + "grad_norm": 21.21604347229004, + "learning_rate": 9.868588384908859e-07, + "loss": 0.4495, + "mean_token_accuracy": 0.8557037115097046, + "num_tokens": 88970920.0, + "step": 2329 + }, + { + "epoch": 0.2963999491158886, + "ewc_loss": 0.02271580882370472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.135790444095619e-05, + "grad_norm": 21.261938095092773, + "learning_rate": 9.872827469266637e-07, + "loss": 0.4518, + "mean_token_accuracy": 0.8546353578567505, + "num_tokens": 89010856.0, + "step": 2330 + }, + { + "epoch": 0.2965271593944791, + "ewc_loss": 0.022721245884895325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.136062292061979e-05, + "grad_norm": 21.133712768554688, + "learning_rate": 9.877066553624418e-07, + "loss": 0.4609, + "mean_token_accuracy": 0.8541380167007446, + "num_tokens": 89047109.0, + "step": 2331 + }, + { + "epoch": 0.2966543696730696, + "ewc_loss": 0.022793056443333626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1396527952456381e-05, + "grad_norm": 21.54188346862793, + "learning_rate": 9.881305637982196e-07, + "loss": 0.4702, + "mean_token_accuracy": 0.8528115749359131, + "num_tokens": 89087859.0, + "step": 2332 + }, + { + "epoch": 0.2967815799516601, + "ewc_loss": 0.022820526733994484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1410263141442556e-05, + "grad_norm": 21.429973602294922, + "learning_rate": 9.885544722339975e-07, + 
"loss": 0.4531, + "mean_token_accuracy": 0.8538856506347656, + "num_tokens": 89123601.0, + "step": 2333 + }, + { + "epoch": 0.29690879023025063, + "ewc_loss": 0.02275342494249344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1376712791388854e-05, + "grad_norm": 21.385601043701172, + "learning_rate": 9.889783806697753e-07, + "loss": 0.4346, + "mean_token_accuracy": 0.8633671402931213, + "num_tokens": 89160133.0, + "step": 2334 + }, + { + "epoch": 0.2970360005088411, + "ewc_loss": 0.022689180448651314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1344590348016936e-05, + "grad_norm": 21.17667579650879, + "learning_rate": 9.894022891055532e-07, + "loss": 0.4275, + "mean_token_accuracy": 0.8654985427856445, + "num_tokens": 89196731.0, + "step": 2335 + }, + { + "epoch": 0.29716321078743163, + "ewc_loss": 0.02278677187860012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1393385648261756e-05, + "grad_norm": 21.483747482299805, + "learning_rate": 9.89826197541331e-07, + "loss": 0.4171, + "mean_token_accuracy": 0.866809070110321, + "num_tokens": 89230159.0, + "step": 2336 + }, + { + "epoch": 0.29729042106602216, + "ewc_loss": 0.022812779992818832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1406389603507705e-05, + "grad_norm": 21.323638916015625, + "learning_rate": 9.902501059771089e-07, + "loss": 0.4819, + "mean_token_accuracy": 0.8470363020896912, + "num_tokens": 89264648.0, + "step": 2337 + }, + { + "epoch": 0.29741763134461263, + "ewc_loss": 0.022762514650821686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1381257536413614e-05, + "grad_norm": 21.23108673095703, + "learning_rate": 9.906740144128867e-07, + "loss": 0.4832, + "mean_token_accuracy": 0.8438342213630676, + "num_tokens": 89302505.0, + "step": 2338 + }, + { + "epoch": 0.29754484162320316, + "ewc_loss": 0.022831499576568604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1415750122978352e-05, + "grad_norm": 21.419857025146484, + "learning_rate": 9.910979228486648e-07, + "loss": 0.441, + 
"mean_token_accuracy": 0.8588839173316956, + "num_tokens": 89338079.0, + "step": 2339 + }, + { + "epoch": 0.2976720519017937, + "ewc_loss": 0.022836774587631226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1418387657613494e-05, + "grad_norm": 21.291685104370117, + "learning_rate": 9.915218312844426e-07, + "loss": 0.5068, + "mean_token_accuracy": 0.8451070785522461, + "num_tokens": 89376717.0, + "step": 2340 + }, + { + "epoch": 0.29779926218038416, + "ewc_loss": 0.022836850956082344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1418425856390968e-05, + "grad_norm": 21.353805541992188, + "learning_rate": 9.919457397202205e-07, + "loss": 0.4957, + "mean_token_accuracy": 0.8426824808120728, + "num_tokens": 89416164.0, + "step": 2341 + }, + { + "epoch": 0.2979264724589747, + "ewc_loss": 0.022911107167601585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1455553249106742e-05, + "grad_norm": 21.512672424316406, + "learning_rate": 9.923696481559983e-07, + "loss": 0.4474, + "mean_token_accuracy": 0.8577380180358887, + "num_tokens": 89446491.0, + "step": 2342 + }, + { + "epoch": 0.2980536827375652, + "ewc_loss": 0.02290833182632923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1454166269686539e-05, + "grad_norm": 21.52813720703125, + "learning_rate": 9.927935565917761e-07, + "loss": 0.4967, + "mean_token_accuracy": 0.8414134383201599, + "num_tokens": 89483637.0, + "step": 2343 + }, + { + "epoch": 0.2981808930161557, + "ewc_loss": 0.02283262275159359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1416311281209346e-05, + "grad_norm": 21.31361198425293, + "learning_rate": 9.93217465027554e-07, + "loss": 0.4891, + "mean_token_accuracy": 0.8495700359344482, + "num_tokens": 89526532.0, + "step": 2344 + }, + { + "epoch": 0.2983081032947462, + "ewc_loss": 0.02289675734937191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1448378245404456e-05, + "grad_norm": 21.544296264648438, + "learning_rate": 9.936413734633318e-07, + "loss": 0.5516, + "mean_token_accuracy": 
0.8321436643600464, + "num_tokens": 89558353.0, + "step": 2345 + }, + { + "epoch": 0.29843531357333675, + "ewc_loss": 0.022961055859923363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1480527973617427e-05, + "grad_norm": 21.453493118286133, + "learning_rate": 9.940652818991097e-07, + "loss": 0.4439, + "mean_token_accuracy": 0.8604382872581482, + "num_tokens": 89598727.0, + "step": 2346 + }, + { + "epoch": 0.2985625238519272, + "ewc_loss": 0.022829513996839523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1414756954764016e-05, + "grad_norm": 21.155067443847656, + "learning_rate": 9.944891903348877e-07, + "loss": 0.5114, + "mean_token_accuracy": 0.8388421535491943, + "num_tokens": 89640093.0, + "step": 2347 + }, + { + "epoch": 0.29868973413051775, + "ewc_loss": 0.02291981503367424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1459907909738831e-05, + "grad_norm": 21.430015563964844, + "learning_rate": 9.949130987706656e-07, + "loss": 0.4557, + "mean_token_accuracy": 0.8557273149490356, + "num_tokens": 89681401.0, + "step": 2348 + }, + { + "epoch": 0.2988169444091083, + "ewc_loss": 0.02300438843667507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1502193956403062e-05, + "grad_norm": 21.24887466430664, + "learning_rate": 9.953370072064432e-07, + "loss": 0.4496, + "mean_token_accuracy": 0.8570696115493774, + "num_tokens": 89720366.0, + "step": 2349 + }, + { + "epoch": 0.29894415468769875, + "ewc_loss": 0.022950660437345505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1475330211396795e-05, + "grad_norm": 21.415889739990234, + "learning_rate": 9.957609156422213e-07, + "loss": 0.5262, + "mean_token_accuracy": 0.8341881036758423, + "num_tokens": 89762234.0, + "step": 2350 + }, + { + "epoch": 0.2990713649662893, + "ewc_loss": 0.02305990271270275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1529951734701172e-05, + "grad_norm": 21.305389404296875, + "learning_rate": 9.961848240779991e-07, + "loss": 0.4738, + "mean_token_accuracy": 0.8523437976837158, 
+ "num_tokens": 89792208.0, + "step": 2351 + }, + { + "epoch": 0.2991985752448798, + "ewc_loss": 0.02301173284649849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.150586649600882e-05, + "grad_norm": 21.424577713012695, + "learning_rate": 9.96608732513777e-07, + "loss": 0.4488, + "mean_token_accuracy": 0.8618177175521851, + "num_tokens": 89833246.0, + "step": 2352 + }, + { + "epoch": 0.2993257855234703, + "ewc_loss": 0.023094620555639267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.154731035057921e-05, + "grad_norm": 21.251157760620117, + "learning_rate": 9.970326409495548e-07, + "loss": 0.5096, + "mean_token_accuracy": 0.8383201360702515, + "num_tokens": 89873271.0, + "step": 2353 + }, + { + "epoch": 0.2994529958020608, + "ewc_loss": 0.023074286058545113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.153714310930809e-05, + "grad_norm": 21.53888702392578, + "learning_rate": 9.974565493853327e-07, + "loss": 0.525, + "mean_token_accuracy": 0.8336713910102844, + "num_tokens": 89913069.0, + "step": 2354 + }, + { + "epoch": 0.29958020608065133, + "ewc_loss": 0.02310035564005375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.15501779873739e-05, + "grad_norm": 21.34587287902832, + "learning_rate": 9.978804578211107e-07, + "loss": 0.5035, + "mean_token_accuracy": 0.8411769270896912, + "num_tokens": 89948915.0, + "step": 2355 + }, + { + "epoch": 0.2997074163592418, + "ewc_loss": 0.023015405982732773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.15077027658117e-05, + "grad_norm": 21.36894989013672, + "learning_rate": 9.983043662568886e-07, + "loss": 0.5024, + "mean_token_accuracy": 0.844038724899292, + "num_tokens": 89982123.0, + "step": 2356 + }, + { + "epoch": 0.29983462663783234, + "ewc_loss": 0.023113101720809937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1556550816749223e-05, + "grad_norm": 21.38248062133789, + "learning_rate": 9.987282746926662e-07, + "loss": 0.4513, + "mean_token_accuracy": 0.857782244682312, + "num_tokens": 90018259.0, + 
"step": 2357 + }, + { + "epoch": 0.29996183691642286, + "ewc_loss": 0.023130478337407112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.156523921963526e-05, + "grad_norm": 21.407180786132812, + "learning_rate": 9.991521831284443e-07, + "loss": 0.4752, + "mean_token_accuracy": 0.8517070412635803, + "num_tokens": 90059041.0, + "step": 2358 + }, + { + "epoch": 0.30008904719501334, + "ewc_loss": 0.02310715988278389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1553580407053232e-05, + "grad_norm": 21.467622756958008, + "learning_rate": 9.995760915642221e-07, + "loss": 0.4229, + "mean_token_accuracy": 0.8653164505958557, + "num_tokens": 90089463.0, + "step": 2359 + }, + { + "epoch": 0.30021625747360386, + "ewc_loss": 0.02323971875011921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1619859833444934e-05, + "grad_norm": 21.629688262939453, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8585803508758545, + "num_tokens": 90125691.0, + "step": 2360 + }, + { + "epoch": 0.3003434677521944, + "ewc_loss": 0.023102544248104095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1551272109500133e-05, + "grad_norm": 21.26021957397461, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8466838002204895, + "num_tokens": 90161778.0, + "step": 2361 + }, + { + "epoch": 0.30047067803078487, + "ewc_loss": 0.023100513964891434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1550257113412954e-05, + "grad_norm": 21.391164779663086, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8348884582519531, + "num_tokens": 90202243.0, + "step": 2362 + }, + { + "epoch": 0.3005978883093754, + "ewc_loss": 0.02320142276585102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1600711331993807e-05, + "grad_norm": 21.427082061767578, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8419674038887024, + "num_tokens": 90248924.0, + "step": 2363 + }, + { + "epoch": 0.3007250985879659, + "ewc_loss": 
0.023101432248950005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.155071640823735e-05, + "grad_norm": 21.322031021118164, + "learning_rate": 1e-06, + "loss": 0.5916, + "mean_token_accuracy": 0.8209438920021057, + "num_tokens": 90281988.0, + "step": 2364 + }, + { + "epoch": 0.3008523088665564, + "ewc_loss": 0.023184336721897125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1592168448260054e-05, + "grad_norm": 21.400909423828125, + "learning_rate": 1e-06, + "loss": 0.5042, + "mean_token_accuracy": 0.8424975872039795, + "num_tokens": 90317050.0, + "step": 2365 + }, + { + "epoch": 0.3009795191451469, + "ewc_loss": 0.02319599688053131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1597998309298418e-05, + "grad_norm": 21.384281158447266, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8569276332855225, + "num_tokens": 90358064.0, + "step": 2366 + }, + { + "epoch": 0.30110672942373745, + "ewc_loss": 0.023231077939271927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1615538824116811e-05, + "grad_norm": 21.3565731048584, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.838331937789917, + "num_tokens": 90400692.0, + "step": 2367 + }, + { + "epoch": 0.3012339397023279, + "ewc_loss": 0.023209726437926292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.16048631753074e-05, + "grad_norm": 21.428192138671875, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.841254711151123, + "num_tokens": 90432207.0, + "step": 2368 + }, + { + "epoch": 0.30136114998091845, + "ewc_loss": 0.023251518607139587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1625759725575335e-05, + "grad_norm": 21.425838470458984, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8454604744911194, + "num_tokens": 90466081.0, + "step": 2369 + }, + { + "epoch": 0.301488360259509, + "ewc_loss": 0.023238305002450943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1619152246566955e-05, + "grad_norm": 
21.36899757385254, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8552082777023315, + "num_tokens": 90501963.0, + "step": 2370 + }, + { + "epoch": 0.30161557053809945, + "ewc_loss": 0.023309750482439995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1654875379463192e-05, + "grad_norm": 21.44029998779297, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8599690198898315, + "num_tokens": 90540651.0, + "step": 2371 + }, + { + "epoch": 0.30174278081669, + "ewc_loss": 0.023251531645655632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1625766092038248e-05, + "grad_norm": 21.384450912475586, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8407336473464966, + "num_tokens": 90585105.0, + "step": 2372 + }, + { + "epoch": 0.3018699910952805, + "ewc_loss": 0.023287927731871605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1643964171526022e-05, + "grad_norm": 21.29998779296875, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8462202548980713, + "num_tokens": 90626773.0, + "step": 2373 + }, + { + "epoch": 0.301997201373871, + "ewc_loss": 0.0232864860445261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1643242942227516e-05, + "grad_norm": 21.412799835205078, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8629173040390015, + "num_tokens": 90665974.0, + "step": 2374 + }, + { + "epoch": 0.3021244116524615, + "ewc_loss": 0.023317094892263412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.165854700957425e-05, + "grad_norm": 21.31100845336914, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8447493314743042, + "num_tokens": 90706801.0, + "step": 2375 + }, + { + "epoch": 0.30225162193105204, + "ewc_loss": 0.02331314980983734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1656575225060806e-05, + "grad_norm": 21.292695999145508, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8450000286102295, + "num_tokens": 
90743635.0, + "step": 2376 + }, + { + "epoch": 0.3023788322096425, + "ewc_loss": 0.023373089730739594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1686544894473627e-05, + "grad_norm": 21.3874454498291, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8487693071365356, + "num_tokens": 90782797.0, + "step": 2377 + }, + { + "epoch": 0.30250604248823304, + "ewc_loss": 0.023359177634119987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1679589078994468e-05, + "grad_norm": 21.332887649536133, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8411482572555542, + "num_tokens": 90822661.0, + "step": 2378 + }, + { + "epoch": 0.30263325276682357, + "ewc_loss": 0.02332433871924877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1662169526971411e-05, + "grad_norm": 21.454010009765625, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8312863111495972, + "num_tokens": 90857804.0, + "step": 2379 + }, + { + "epoch": 0.3027604630454141, + "ewc_loss": 0.023360753431916237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1680376701406203e-05, + "grad_norm": 21.425750732421875, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8456710577011108, + "num_tokens": 90895778.0, + "step": 2380 + }, + { + "epoch": 0.30288767332400457, + "ewc_loss": 0.023297369480133057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1648684449028224e-05, + "grad_norm": 21.288034439086914, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8558015823364258, + "num_tokens": 90934463.0, + "step": 2381 + }, + { + "epoch": 0.3030148836025951, + "ewc_loss": 0.023306643590331078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1653321962512564e-05, + "grad_norm": 21.463808059692383, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8533567190170288, + "num_tokens": 90967888.0, + "step": 2382 + }, + { + "epoch": 0.3031420938811856, + "ewc_loss": 0.02331523410975933, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1657617505989037e-05, + "grad_norm": 21.37129783630371, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8559454083442688, + "num_tokens": 91005200.0, + "step": 2383 + }, + { + "epoch": 0.3032693041597761, + "ewc_loss": 0.023328332230448723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1664165867841803e-05, + "grad_norm": 21.410356521606445, + "learning_rate": 1e-06, + "loss": 0.5417, + "mean_token_accuracy": 0.832621157169342, + "num_tokens": 91037840.0, + "step": 2384 + }, + { + "epoch": 0.3033965144383666, + "ewc_loss": 0.02335517853498459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.167758910014527e-05, + "grad_norm": 21.364816665649414, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8346167206764221, + "num_tokens": 91073336.0, + "step": 2385 + }, + { + "epoch": 0.30352372471695716, + "ewc_loss": 0.02340814471244812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1704072676366195e-05, + "grad_norm": 21.533178329467773, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.849767804145813, + "num_tokens": 91107001.0, + "step": 2386 + }, + { + "epoch": 0.30365093499554763, + "ewc_loss": 0.02337457239627838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1687286132655572e-05, + "grad_norm": 21.470247268676758, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8552619218826294, + "num_tokens": 91143937.0, + "step": 2387 + }, + { + "epoch": 0.30377814527413816, + "ewc_loss": 0.023386716842651367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.169335882877931e-05, + "grad_norm": 21.426437377929688, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8387487530708313, + "num_tokens": 91185391.0, + "step": 2388 + }, + { + "epoch": 0.3039053555527287, + "ewc_loss": 0.02336123213171959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.168061589851277e-05, + "grad_norm": 21.42154884338379, + "learning_rate": 
1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8472567796707153, + "num_tokens": 91227653.0, + "step": 2389 + }, + { + "epoch": 0.30403256583131916, + "ewc_loss": 0.023356597870588303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1678298506012652e-05, + "grad_norm": 21.45720863342285, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8676061630249023, + "num_tokens": 91263610.0, + "step": 2390 + }, + { + "epoch": 0.3041597761099097, + "ewc_loss": 0.02339983358979225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1699917195073795e-05, + "grad_norm": 21.405155181884766, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8435057997703552, + "num_tokens": 91296880.0, + "step": 2391 + }, + { + "epoch": 0.3042869863885002, + "ewc_loss": 0.02337684854865074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.168842391052749e-05, + "grad_norm": 21.38309097290039, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8362768888473511, + "num_tokens": 91336242.0, + "step": 2392 + }, + { + "epoch": 0.3044141966670907, + "ewc_loss": 0.02341117523610592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1705587894539349e-05, + "grad_norm": 21.404090881347656, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.856149435043335, + "num_tokens": 91376527.0, + "step": 2393 + }, + { + "epoch": 0.3045414069456812, + "ewc_loss": 0.023374680429697037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1687339792842977e-05, + "grad_norm": 21.38118553161621, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8383684754371643, + "num_tokens": 91408415.0, + "step": 2394 + }, + { + "epoch": 0.30466861722427174, + "ewc_loss": 0.02343844808638096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.171922394860303e-05, + "grad_norm": 21.459627151489258, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8492691516876221, + "num_tokens": 91449189.0, + "step": 2395 + }, + { + 
"epoch": 0.3047958275028622, + "ewc_loss": 0.02339955046772957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1699775313900318e-05, + "grad_norm": 21.434953689575195, + "learning_rate": 1e-06, + "loss": 0.496, + "mean_token_accuracy": 0.8448508977890015, + "num_tokens": 91488599.0, + "step": 2396 + }, + { + "epoch": 0.30492303778145274, + "ewc_loss": 0.0234350748360157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1717537745425943e-05, + "grad_norm": 21.401382446289062, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8535802960395813, + "num_tokens": 91525138.0, + "step": 2397 + }, + { + "epoch": 0.30505024806004327, + "ewc_loss": 0.023408208042383194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1704103599186055e-05, + "grad_norm": 21.433883666992188, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8502515554428101, + "num_tokens": 91567444.0, + "step": 2398 + }, + { + "epoch": 0.30517745833863374, + "ewc_loss": 0.02341163530945778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1705817996698897e-05, + "grad_norm": 21.41545867919922, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8464552164077759, + "num_tokens": 91602926.0, + "step": 2399 + }, + { + "epoch": 0.3053046686172243, + "ewc_loss": 0.023456979542970657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1728489880624693e-05, + "grad_norm": 21.51153564453125, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8590661883354187, + "num_tokens": 91635901.0, + "step": 2400 + }, + { + "epoch": 0.3054318788958148, + "ewc_loss": 0.02343243733048439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1716218978108373e-05, + "grad_norm": 21.416839599609375, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8506957292556763, + "num_tokens": 91679542.0, + "step": 2401 + }, + { + "epoch": 0.3055590891744053, + "ewc_loss": 0.02339784801006317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.1698924026859459e-05, + "grad_norm": 21.450931549072266, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8439706563949585, + "num_tokens": 91723022.0, + "step": 2402 + }, + { + "epoch": 0.3056862994529958, + "ewc_loss": 0.023452501744031906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1726250704668928e-05, + "grad_norm": 21.45030403137207, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8441872000694275, + "num_tokens": 91760664.0, + "step": 2403 + }, + { + "epoch": 0.30581350973158633, + "ewc_loss": 0.023416096344590187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1708048077707645e-05, + "grad_norm": 21.473691940307617, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8447921276092529, + "num_tokens": 91805570.0, + "step": 2404 + }, + { + "epoch": 0.3059407200101768, + "ewc_loss": 0.023432794958353043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.171639723906992e-05, + "grad_norm": 21.46368980407715, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8407210111618042, + "num_tokens": 91843342.0, + "step": 2405 + }, + { + "epoch": 0.30606793028876733, + "ewc_loss": 0.023370690643787384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1685345270961989e-05, + "grad_norm": 21.49623680114746, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.8391149044036865, + "num_tokens": 91883872.0, + "step": 2406 + }, + { + "epoch": 0.30619514056735786, + "ewc_loss": 0.023459166288375854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1729583093256224e-05, + "grad_norm": 21.476436614990234, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8474909067153931, + "num_tokens": 91923174.0, + "step": 2407 + }, + { + "epoch": 0.30632235084594833, + "ewc_loss": 0.023381367325782776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1690684004861396e-05, + "grad_norm": 21.670917510986328, + "learning_rate": 1e-06, + "loss": 0.4869, + 
"mean_token_accuracy": 0.8481096625328064, + "num_tokens": 91963462.0, + "step": 2408 + }, + { + "epoch": 0.30644956112453886, + "ewc_loss": 0.023434681817889214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.171734129457036e-05, + "grad_norm": 21.393451690673828, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8515275716781616, + "num_tokens": 92000907.0, + "step": 2409 + }, + { + "epoch": 0.3065767714031294, + "ewc_loss": 0.023370565846562386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1685282515827566e-05, + "grad_norm": 21.549243927001953, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8503814339637756, + "num_tokens": 92036110.0, + "step": 2410 + }, + { + "epoch": 0.30670398168171986, + "ewc_loss": 0.023493099957704544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1746549716917798e-05, + "grad_norm": 21.570232391357422, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8441147804260254, + "num_tokens": 92080647.0, + "step": 2411 + }, + { + "epoch": 0.3068311919603104, + "ewc_loss": 0.023318391293287277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1659195479296613e-05, + "grad_norm": 21.39182472229004, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8526929020881653, + "num_tokens": 92115407.0, + "step": 2412 + }, + { + "epoch": 0.3069584022389009, + "ewc_loss": 0.023382391780614853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1691196050378494e-05, + "grad_norm": 21.453998565673828, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8493475914001465, + "num_tokens": 92149660.0, + "step": 2413 + }, + { + "epoch": 0.3070856125174914, + "ewc_loss": 0.02341451123356819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.17072559078224e-05, + "grad_norm": 21.51372718811035, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8516920208930969, + "num_tokens": 92191581.0, + "step": 2414 + }, + { + "epoch": 
0.3072128227960819, + "ewc_loss": 0.02340281568467617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.170140785689e-05, + "grad_norm": 21.56026840209961, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8440443277359009, + "num_tokens": 92230435.0, + "step": 2415 + }, + { + "epoch": 0.30734003307467245, + "ewc_loss": 0.023460034281015396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.173001692222897e-05, + "grad_norm": 21.647762298583984, + "learning_rate": 1e-06, + "loss": 0.5247, + "mean_token_accuracy": 0.8358702063560486, + "num_tokens": 92269545.0, + "step": 2416 + }, + { + "epoch": 0.3074672433532629, + "ewc_loss": 0.02336147055029869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1680735042318702e-05, + "grad_norm": 21.437911987304688, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8550239205360413, + "num_tokens": 92307447.0, + "step": 2417 + }, + { + "epoch": 0.30759445363185345, + "ewc_loss": 0.023444946855306625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1722473573172465e-05, + "grad_norm": 21.68134117126465, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8466475009918213, + "num_tokens": 92349441.0, + "step": 2418 + }, + { + "epoch": 0.307721663910444, + "ewc_loss": 0.02337733656167984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1688668564602267e-05, + "grad_norm": 21.378128051757812, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8524793386459351, + "num_tokens": 92380405.0, + "step": 2419 + }, + { + "epoch": 0.30784887418903445, + "ewc_loss": 0.023390747606754303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1695373359543737e-05, + "grad_norm": 21.56154441833496, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8474572896957397, + "num_tokens": 92419596.0, + "step": 2420 + }, + { + "epoch": 0.307976084467625, + "ewc_loss": 0.023482779040932655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1741389243979938e-05, 
+ "grad_norm": 21.478683471679688, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8414533138275146, + "num_tokens": 92454590.0, + "step": 2421 + }, + { + "epoch": 0.3081032947462155, + "ewc_loss": 0.02335922233760357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.167961090686731e-05, + "grad_norm": 21.45423126220703, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8438150882720947, + "num_tokens": 92491640.0, + "step": 2422 + }, + { + "epoch": 0.308230505024806, + "ewc_loss": 0.023445148020982742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.172257361758966e-05, + "grad_norm": 21.39797019958496, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8399465084075928, + "num_tokens": 92528682.0, + "step": 2423 + }, + { + "epoch": 0.3083577153033965, + "ewc_loss": 0.02348160743713379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1740803529391997e-05, + "grad_norm": 21.630901336669922, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8557376861572266, + "num_tokens": 92566368.0, + "step": 2424 + }, + { + "epoch": 0.30848492558198704, + "ewc_loss": 0.023495610803365707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1747805729100946e-05, + "grad_norm": 21.430727005004883, + "learning_rate": 1e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8262543678283691, + "num_tokens": 92601380.0, + "step": 2425 + }, + { + "epoch": 0.3086121358605775, + "ewc_loss": 0.023411249741911888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1705625183822121e-05, + "grad_norm": 21.514400482177734, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8654398322105408, + "num_tokens": 92639724.0, + "step": 2426 + }, + { + "epoch": 0.30873934613916804, + "ewc_loss": 0.023510625585913658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.175531269836938e-05, + "grad_norm": 21.577383041381836, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 
0.8546538352966309, + "num_tokens": 92677450.0, + "step": 2427 + }, + { + "epoch": 0.30886655641775856, + "ewc_loss": 0.023474395275115967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1737197382899467e-05, + "grad_norm": 21.52617645263672, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8558307886123657, + "num_tokens": 92714681.0, + "step": 2428 + }, + { + "epoch": 0.3089937666963491, + "ewc_loss": 0.02346496842801571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.173248438135488e-05, + "grad_norm": 21.447080612182617, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8460772037506104, + "num_tokens": 92749925.0, + "step": 2429 + }, + { + "epoch": 0.30912097697493957, + "ewc_loss": 0.02349880523979664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.174940280179726e-05, + "grad_norm": 21.491966247558594, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.852127194404602, + "num_tokens": 92791720.0, + "step": 2430 + }, + { + "epoch": 0.3092481872535301, + "ewc_loss": 0.023516569286584854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1758284927054774e-05, + "grad_norm": 21.432222366333008, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8406690955162048, + "num_tokens": 92826179.0, + "step": 2431 + }, + { + "epoch": 0.3093753975321206, + "ewc_loss": 0.023528799414634705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1764399459934793e-05, + "grad_norm": 21.562437057495117, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8486977815628052, + "num_tokens": 92872751.0, + "step": 2432 + }, + { + "epoch": 0.3095026078107111, + "ewc_loss": 0.023484941571950912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1742470633180346e-05, + "grad_norm": 21.50541877746582, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8423373103141785, + "num_tokens": 92907470.0, + "step": 2433 + }, + { + "epoch": 0.3096298180893016, + "ewc_loss": 
0.02345598116517067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.172799056803342e-05, + "grad_norm": 21.483882904052734, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8540433049201965, + "num_tokens": 92943321.0, + "step": 2434 + }, + { + "epoch": 0.30975702836789215, + "ewc_loss": 0.023540716618299484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.177035846922081e-05, + "grad_norm": 21.53665542602539, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8501511812210083, + "num_tokens": 92982136.0, + "step": 2435 + }, + { + "epoch": 0.3098842386464826, + "ewc_loss": 0.023505421355366707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1752710634027608e-05, + "grad_norm": 21.471256256103516, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8377736806869507, + "num_tokens": 93015974.0, + "step": 2436 + }, + { + "epoch": 0.31001144892507315, + "ewc_loss": 0.023536499589681625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1768249351007398e-05, + "grad_norm": 21.512258529663086, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8516527414321899, + "num_tokens": 93054010.0, + "step": 2437 + }, + { + "epoch": 0.3101386592036637, + "ewc_loss": 0.023553743958473206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1776872270274907e-05, + "grad_norm": 21.50515365600586, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8560658097267151, + "num_tokens": 93090226.0, + "step": 2438 + }, + { + "epoch": 0.31026586948225415, + "ewc_loss": 0.02356230653822422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1781153261836153e-05, + "grad_norm": 21.55140495300293, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8448129892349243, + "num_tokens": 93130242.0, + "step": 2439 + }, + { + "epoch": 0.3103930797608447, + "ewc_loss": 0.02359011210501194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1795055797847454e-05, + "grad_norm": 
21.593114852905273, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8439497947692871, + "num_tokens": 93170990.0, + "step": 2440 + }, + { + "epoch": 0.3105202900394352, + "ewc_loss": 0.023553568869829178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1776784049288835e-05, + "grad_norm": 21.497465133666992, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8409508466720581, + "num_tokens": 93205478.0, + "step": 2441 + }, + { + "epoch": 0.3106475003180257, + "ewc_loss": 0.023571517318487167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.178575894300593e-05, + "grad_norm": 21.491621017456055, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8416261672973633, + "num_tokens": 93240952.0, + "step": 2442 + }, + { + "epoch": 0.3107747105966162, + "ewc_loss": 0.02355051040649414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1775255188695155e-05, + "grad_norm": 21.504467010498047, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8375309109687805, + "num_tokens": 93278882.0, + "step": 2443 + }, + { + "epoch": 0.31090192087520674, + "ewc_loss": 0.023603903129696846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1801951586676296e-05, + "grad_norm": 21.457712173461914, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8405055999755859, + "num_tokens": 93324573.0, + "step": 2444 + }, + { + "epoch": 0.3110291311537972, + "ewc_loss": 0.023559560999274254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1779780834331177e-05, + "grad_norm": 21.44428253173828, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8621911406517029, + "num_tokens": 93361049.0, + "step": 2445 + }, + { + "epoch": 0.31115634143238774, + "ewc_loss": 0.0236225426197052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1811271178885363e-05, + "grad_norm": 21.452238082885742, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8450228571891785, + 
"num_tokens": 93401748.0, + "step": 2446 + }, + { + "epoch": 0.31128355171097827, + "ewc_loss": 0.023616349324584007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1808174349425826e-05, + "grad_norm": 21.557199478149414, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.859906792640686, + "num_tokens": 93434857.0, + "step": 2447 + }, + { + "epoch": 0.31141076198956874, + "ewc_loss": 0.0236312635242939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1815632205980364e-05, + "grad_norm": 21.510072708129883, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8457494974136353, + "num_tokens": 93474681.0, + "step": 2448 + }, + { + "epoch": 0.31153797226815927, + "ewc_loss": 0.023584945127367973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1792472832894418e-05, + "grad_norm": 21.506166458129883, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8459133505821228, + "num_tokens": 93510627.0, + "step": 2449 + }, + { + "epoch": 0.3116651825467498, + "ewc_loss": 0.02360955812036991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1804779205704108e-05, + "grad_norm": 21.50212860107422, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8386657238006592, + "num_tokens": 93551598.0, + "step": 2450 + }, + { + "epoch": 0.31179239282534027, + "ewc_loss": 0.023672286421060562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1836143130494747e-05, + "grad_norm": 21.63191032409668, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.841483473777771, + "num_tokens": 93586622.0, + "step": 2451 + }, + { + "epoch": 0.3119196031039308, + "ewc_loss": 0.02360818162560463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1804090718214866e-05, + "grad_norm": 21.50847625732422, + "learning_rate": 1e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.836536169052124, + "num_tokens": 93626496.0, + "step": 2452 + }, + { + "epoch": 0.3120468133825213, + "ewc_loss": 0.023623520508408546, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1811760487034917e-05, + "grad_norm": 21.577640533447266, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8491857051849365, + "num_tokens": 93665641.0, + "step": 2453 + }, + { + "epoch": 0.3121740236611118, + "ewc_loss": 0.023657966405153275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.182898358820239e-05, + "grad_norm": 21.591930389404297, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.852448582649231, + "num_tokens": 93702135.0, + "step": 2454 + }, + { + "epoch": 0.3123012339397023, + "ewc_loss": 0.023584099486470222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1792049917858094e-05, + "grad_norm": 21.53165626525879, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8548494577407837, + "num_tokens": 93738226.0, + "step": 2455 + }, + { + "epoch": 0.31242844421829286, + "ewc_loss": 0.02359284646809101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.179642367787892e-05, + "grad_norm": 21.537227630615234, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.851348340511322, + "num_tokens": 93780273.0, + "step": 2456 + }, + { + "epoch": 0.31255565449688333, + "ewc_loss": 0.023581895977258682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1790947610279545e-05, + "grad_norm": 21.613985061645508, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8359751105308533, + "num_tokens": 93821977.0, + "step": 2457 + }, + { + "epoch": 0.31268286477547386, + "ewc_loss": 0.02361954189836979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1809770512627438e-05, + "grad_norm": 21.56024169921875, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8568588495254517, + "num_tokens": 93856084.0, + "step": 2458 + }, + { + "epoch": 0.3128100750540644, + "ewc_loss": 0.023577304556965828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.178865204565227e-05, + "grad_norm": 21.44847297668457, + "learning_rate": 
1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8508214950561523, + "num_tokens": 93893491.0, + "step": 2459 + }, + { + "epoch": 0.31293728533265486, + "ewc_loss": 0.02360297180712223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1801485925388988e-05, + "grad_norm": 21.528934478759766, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8366614580154419, + "num_tokens": 93940240.0, + "step": 2460 + }, + { + "epoch": 0.3130644956112454, + "ewc_loss": 0.02363523282110691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1817616723419633e-05, + "grad_norm": 21.554765701293945, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8391711711883545, + "num_tokens": 93974504.0, + "step": 2461 + }, + { + "epoch": 0.3131917058898359, + "ewc_loss": 0.023634660989046097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1817330232588574e-05, + "grad_norm": 21.752729415893555, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8499351143836975, + "num_tokens": 94009208.0, + "step": 2462 + }, + { + "epoch": 0.3133189161684264, + "ewc_loss": 0.023580092936754227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1790046301030088e-05, + "grad_norm": 21.482921600341797, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8481491208076477, + "num_tokens": 94049249.0, + "step": 2463 + }, + { + "epoch": 0.3134461264470169, + "ewc_loss": 0.023558693006634712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.177934609586373e-05, + "grad_norm": 21.582582473754883, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8482099175453186, + "num_tokens": 94085553.0, + "step": 2464 + }, + { + "epoch": 0.31357333672560744, + "ewc_loss": 0.023629993200302124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1814996469183825e-05, + "grad_norm": 21.579774856567383, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8280013203620911, + "num_tokens": 94127531.0, + "step": 2465 + }, 
+ { + "epoch": 0.3137005470041979, + "ewc_loss": 0.023541638627648354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1770819583034609e-05, + "grad_norm": 21.4913387298584, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8596076965332031, + "num_tokens": 94171191.0, + "step": 2466 + }, + { + "epoch": 0.31382775728278844, + "ewc_loss": 0.023615781217813492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1807890587078873e-05, + "grad_norm": 21.51384162902832, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8526780009269714, + "num_tokens": 94212090.0, + "step": 2467 + }, + { + "epoch": 0.313954967561379, + "ewc_loss": 0.02360994555056095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1804972928075586e-05, + "grad_norm": 21.433500289916992, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8668678998947144, + "num_tokens": 94246835.0, + "step": 2468 + }, + { + "epoch": 0.31408217783996945, + "ewc_loss": 0.023605549708008766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.18027746793814e-05, + "grad_norm": 21.538894653320312, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8457274436950684, + "num_tokens": 94284581.0, + "step": 2469 + }, + { + "epoch": 0.31420938811856, + "ewc_loss": 0.023672346025705338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1836173143819906e-05, + "grad_norm": 21.54422378540039, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8580535054206848, + "num_tokens": 94323050.0, + "step": 2470 + }, + { + "epoch": 0.3143365983971505, + "ewc_loss": 0.023586150258779526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1793074918386992e-05, + "grad_norm": 21.453750610351562, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8425366282463074, + "num_tokens": 94364702.0, + "step": 2471 + }, + { + "epoch": 0.314463808675741, + "ewc_loss": 0.023671751841902733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.1835875739052426e-05, + "grad_norm": 21.545400619506836, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8545421361923218, + "num_tokens": 94404735.0, + "step": 2472 + }, + { + "epoch": 0.3145910189543315, + "ewc_loss": 0.023680774495005608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.184038683277322e-05, + "grad_norm": 21.611051559448242, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8333364129066467, + "num_tokens": 94444683.0, + "step": 2473 + }, + { + "epoch": 0.31471822923292203, + "ewc_loss": 0.023635732010006905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1817865924967919e-05, + "grad_norm": 21.43488883972168, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8515604734420776, + "num_tokens": 94484392.0, + "step": 2474 + }, + { + "epoch": 0.3148454395115125, + "ewc_loss": 0.023665759712457657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1832879863504786e-05, + "grad_norm": 21.541126251220703, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8563063740730286, + "num_tokens": 94524860.0, + "step": 2475 + }, + { + "epoch": 0.31497264979010303, + "ewc_loss": 0.023672521114349365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1836260455311276e-05, + "grad_norm": 21.522634506225586, + "learning_rate": 1e-06, + "loss": 0.504, + "mean_token_accuracy": 0.8445926904678345, + "num_tokens": 94565076.0, + "step": 2476 + }, + { + "epoch": 0.31509986006869356, + "ewc_loss": 0.023674916476011276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1837458259833511e-05, + "grad_norm": 21.587026596069336, + "learning_rate": 1e-06, + "loss": 0.5592, + "mean_token_accuracy": 0.8253558278083801, + "num_tokens": 94606191.0, + "step": 2477 + }, + { + "epoch": 0.31522707034728403, + "ewc_loss": 0.02367066964507103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1835334589704871e-05, + "grad_norm": 21.62101936340332, + "learning_rate": 1e-06, + "loss": 0.5079, + 
"mean_token_accuracy": 0.8436853885650635, + "num_tokens": 94648021.0, + "step": 2478 + }, + { + "epoch": 0.31535428062587456, + "ewc_loss": 0.023702509701251984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1851255294459406e-05, + "grad_norm": 21.622766494750977, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8590598106384277, + "num_tokens": 94686305.0, + "step": 2479 + }, + { + "epoch": 0.3154814909044651, + "ewc_loss": 0.02368149533867836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1840747902169824e-05, + "grad_norm": 21.654373168945312, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8462222814559937, + "num_tokens": 94728606.0, + "step": 2480 + }, + { + "epoch": 0.3156087011830556, + "ewc_loss": 0.023714371025562286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1857185199914966e-05, + "grad_norm": 21.63852310180664, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8511624336242676, + "num_tokens": 94770162.0, + "step": 2481 + }, + { + "epoch": 0.3157359114616461, + "ewc_loss": 0.023619743064045906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1809871466539335e-05, + "grad_norm": 21.590253829956055, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.855850338935852, + "num_tokens": 94807735.0, + "step": 2482 + }, + { + "epoch": 0.3158631217402366, + "ewc_loss": 0.023702768608927727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1851384442707058e-05, + "grad_norm": 21.60717010498047, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8435002565383911, + "num_tokens": 94840366.0, + "step": 2483 + }, + { + "epoch": 0.31599033201882715, + "ewc_loss": 0.02365817315876484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1829086361103691e-05, + "grad_norm": 21.62595558166504, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8449100255966187, + "num_tokens": 94874293.0, + "step": 2484 + }, + { + "epoch": 
0.3161175422974176, + "ewc_loss": 0.023701995611190796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1850997907458805e-05, + "grad_norm": 21.553028106689453, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8473559617996216, + "num_tokens": 94909347.0, + "step": 2485 + }, + { + "epoch": 0.31624475257600815, + "ewc_loss": 0.02366214618086815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1831072697532363e-05, + "grad_norm": 21.516874313354492, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8583488464355469, + "num_tokens": 94955120.0, + "step": 2486 + }, + { + "epoch": 0.3163719628545987, + "ewc_loss": 0.023702798411250114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1851398994622286e-05, + "grad_norm": 21.603055953979492, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8633607625961304, + "num_tokens": 94992319.0, + "step": 2487 + }, + { + "epoch": 0.31649917313318915, + "ewc_loss": 0.023655150085687637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1827574780909345e-05, + "grad_norm": 21.586994171142578, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8576700091362, + "num_tokens": 95023464.0, + "step": 2488 + }, + { + "epoch": 0.3166263834117797, + "ewc_loss": 0.023681115359067917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1840557817777153e-05, + "grad_norm": 21.63809585571289, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8617442846298218, + "num_tokens": 95055963.0, + "step": 2489 + }, + { + "epoch": 0.3167535936903702, + "ewc_loss": 0.023686159402132034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.184307984658517e-05, + "grad_norm": 21.479625701904297, + "learning_rate": 1e-06, + "loss": 0.5394, + "mean_token_accuracy": 0.8300325274467468, + "num_tokens": 95089628.0, + "step": 2490 + }, + { + "epoch": 0.3168808039689607, + "ewc_loss": 0.02376207523047924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.1881037607963663e-05, + "grad_norm": 21.71650505065918, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8634238243103027, + "num_tokens": 95128557.0, + "step": 2491 + }, + { + "epoch": 0.3170080142475512, + "ewc_loss": 0.023758264258503914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1879132216563448e-05, + "grad_norm": 21.564958572387695, + "learning_rate": 1e-06, + "loss": 0.5208, + "mean_token_accuracy": 0.8363016247749329, + "num_tokens": 95168324.0, + "step": 2492 + }, + { + "epoch": 0.31713522452614173, + "ewc_loss": 0.023737113922834396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1868556612171233e-05, + "grad_norm": 21.62087059020996, + "learning_rate": 1e-06, + "loss": 0.5516, + "mean_token_accuracy": 0.829070508480072, + "num_tokens": 95208139.0, + "step": 2493 + }, + { + "epoch": 0.3172624348047322, + "ewc_loss": 0.023791048675775528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1895524039573502e-05, + "grad_norm": 21.561656951904297, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8435767889022827, + "num_tokens": 95247337.0, + "step": 2494 + }, + { + "epoch": 0.31738964508332274, + "ewc_loss": 0.023798387497663498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1899193850695156e-05, + "grad_norm": 21.61081886291504, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8421347737312317, + "num_tokens": 95289208.0, + "step": 2495 + }, + { + "epoch": 0.31751685536191326, + "ewc_loss": 0.02383367531001568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.191683804790955e-05, + "grad_norm": 21.698532104492188, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8459560871124268, + "num_tokens": 95329717.0, + "step": 2496 + }, + { + "epoch": 0.31764406564050374, + "ewc_loss": 0.023769158869981766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1884579180332366e-05, + "grad_norm": 21.467082977294922, + "learning_rate": 1e-06, + "loss": 0.5324, + 
"mean_token_accuracy": 0.8280196785926819, + "num_tokens": 95371052.0, + "step": 2497 + }, + { + "epoch": 0.31777127591909426, + "ewc_loss": 0.023741360753774643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1870680282299872e-05, + "grad_norm": 21.62162208557129, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8687561750411987, + "num_tokens": 95412486.0, + "step": 2498 + }, + { + "epoch": 0.3178984861976848, + "ewc_loss": 0.023815937340259552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.190796865557786e-05, + "grad_norm": 21.539831161499023, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8653901219367981, + "num_tokens": 95446167.0, + "step": 2499 + }, + { + "epoch": 0.31802569647627527, + "ewc_loss": 0.02378595806658268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1892979273397941e-05, + "grad_norm": 21.661163330078125, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8437135815620422, + "num_tokens": 95485763.0, + "step": 2500 + }, + { + "epoch": 0.3181529067548658, + "ewc_loss": 0.02381357178092003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1906786312465556e-05, + "grad_norm": 21.558284759521484, + "learning_rate": 1e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.838034987449646, + "num_tokens": 95526407.0, + "step": 2501 + }, + { + "epoch": 0.3182801170334563, + "ewc_loss": 0.023770801723003387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1885400454048067e-05, + "grad_norm": 21.573997497558594, + "learning_rate": 1e-06, + "loss": 0.5894, + "mean_token_accuracy": 0.8160243034362793, + "num_tokens": 95567479.0, + "step": 2502 + }, + { + "epoch": 0.3184073273120468, + "ewc_loss": 0.023891311138868332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1945655387535226e-05, + "grad_norm": 21.555479049682617, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8548431396484375, + "num_tokens": 95606588.0, + "step": 2503 + }, + { + "epoch": 
0.3185345375906373, + "ewc_loss": 0.023777145892381668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1888572771567851e-05, + "grad_norm": 21.506610870361328, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8530063033103943, + "num_tokens": 95653293.0, + "step": 2504 + }, + { + "epoch": 0.31866174786922785, + "ewc_loss": 0.023830007761716843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1915003597096074e-05, + "grad_norm": 21.486608505249023, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8395819664001465, + "num_tokens": 95692424.0, + "step": 2505 + }, + { + "epoch": 0.3187889581478183, + "ewc_loss": 0.023853810504078865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1926905244763475e-05, + "grad_norm": 21.6344051361084, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8575941324234009, + "num_tokens": 95733679.0, + "step": 2506 + }, + { + "epoch": 0.31891616842640885, + "ewc_loss": 0.02386520430445671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.193260231957538e-05, + "grad_norm": 21.50165557861328, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.84300297498703, + "num_tokens": 95768211.0, + "step": 2507 + }, + { + "epoch": 0.3190433787049994, + "ewc_loss": 0.023858705535531044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1929352695005946e-05, + "grad_norm": 21.671560287475586, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8459965586662292, + "num_tokens": 95805974.0, + "step": 2508 + }, + { + "epoch": 0.31917058898358985, + "ewc_loss": 0.02384607121348381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1923035344807431e-05, + "grad_norm": 21.540685653686523, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8468890190124512, + "num_tokens": 95845463.0, + "step": 2509 + }, + { + "epoch": 0.3192977992621804, + "ewc_loss": 0.023823825642466545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.191191313409945e-05, + "grad_norm": 21.577674865722656, + "learning_rate": 1e-06, + "loss": 0.5341, + "mean_token_accuracy": 0.843529224395752, + "num_tokens": 95888148.0, + "step": 2510 + }, + { + "epoch": 0.3194250095407709, + "ewc_loss": 0.023834681138396263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.191734099847963e-05, + "grad_norm": 21.543060302734375, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8706780076026917, + "num_tokens": 95925432.0, + "step": 2511 + }, + { + "epoch": 0.3195522198193614, + "ewc_loss": 0.023857356980443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1928678759431932e-05, + "grad_norm": 21.6734619140625, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8542094826698303, + "num_tokens": 95970695.0, + "step": 2512 + }, + { + "epoch": 0.3196794300979519, + "ewc_loss": 0.023791832849383354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1895916031789966e-05, + "grad_norm": 21.537067413330078, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8491066098213196, + "num_tokens": 96006093.0, + "step": 2513 + }, + { + "epoch": 0.31980664037654244, + "ewc_loss": 0.023853572085499763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1926786100957543e-05, + "grad_norm": 21.687519073486328, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8482531905174255, + "num_tokens": 96045945.0, + "step": 2514 + }, + { + "epoch": 0.3199338506551329, + "ewc_loss": 0.023811187595129013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1905593964911532e-05, + "grad_norm": 21.578699111938477, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8634217977523804, + "num_tokens": 96076748.0, + "step": 2515 + }, + { + "epoch": 0.32006106093372344, + "ewc_loss": 0.023821331560611725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.191066621686332e-05, + "grad_norm": 21.56653594970703, + "learning_rate": 1e-06, + "loss": 0.4828, + 
"mean_token_accuracy": 0.8503926992416382, + "num_tokens": 96116964.0, + "step": 2516 + }, + { + "epoch": 0.32018827121231397, + "ewc_loss": 0.023896019905805588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1948010069318116e-05, + "grad_norm": 21.707300186157227, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8515758514404297, + "num_tokens": 96152073.0, + "step": 2517 + }, + { + "epoch": 0.32031548149090444, + "ewc_loss": 0.023823073133826256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1911536603292916e-05, + "grad_norm": 21.661563873291016, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8479256629943848, + "num_tokens": 96191608.0, + "step": 2518 + }, + { + "epoch": 0.32044269176949497, + "ewc_loss": 0.023823382332921028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1911691217392217e-05, + "grad_norm": 21.622943878173828, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8668659925460815, + "num_tokens": 96228124.0, + "step": 2519 + }, + { + "epoch": 0.3205699020480855, + "ewc_loss": 0.02385694347321987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1928471394639928e-05, + "grad_norm": 21.74193572998047, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8595851063728333, + "num_tokens": 96265870.0, + "step": 2520 + }, + { + "epoch": 0.32069711232667597, + "ewc_loss": 0.023867126554250717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1933563655475155e-05, + "grad_norm": 21.698457717895508, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8592462539672852, + "num_tokens": 96309623.0, + "step": 2521 + }, + { + "epoch": 0.3208243226052665, + "ewc_loss": 0.023817645385861397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1908822671102826e-05, + "grad_norm": 21.719541549682617, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8422163724899292, + "num_tokens": 96344036.0, + "step": 2522 + }, + { + "epoch": 
0.320951532883857, + "ewc_loss": 0.023841066285967827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1920533324882854e-05, + "grad_norm": 21.766845703125, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8605103492736816, + "num_tokens": 96377627.0, + "step": 2523 + }, + { + "epoch": 0.3210787431624475, + "ewc_loss": 0.023837003856897354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1918501513719093e-05, + "grad_norm": 21.728208541870117, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8493251800537109, + "num_tokens": 96413568.0, + "step": 2524 + }, + { + "epoch": 0.32120595344103803, + "ewc_loss": 0.02386624366044998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1933121641050093e-05, + "grad_norm": 21.682289123535156, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8463765978813171, + "num_tokens": 96454682.0, + "step": 2525 + }, + { + "epoch": 0.32133316371962856, + "ewc_loss": 0.023863302543759346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1931650988117326e-05, + "grad_norm": 21.9001522064209, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8386849164962769, + "num_tokens": 96490456.0, + "step": 2526 + }, + { + "epoch": 0.32146037399821903, + "ewc_loss": 0.023846372961997986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1923186320927925e-05, + "grad_norm": 21.69761085510254, + "learning_rate": 1e-06, + "loss": 0.5268, + "mean_token_accuracy": 0.842790424823761, + "num_tokens": 96526891.0, + "step": 2527 + }, + { + "epoch": 0.32158758427680956, + "ewc_loss": 0.023826466873288155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1913233720406424e-05, + "grad_norm": 21.60171127319336, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8436552286148071, + "num_tokens": 96575226.0, + "step": 2528 + }, + { + "epoch": 0.3217147945554001, + "ewc_loss": 0.023839933797717094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.191996670968365e-05, + "grad_norm": 21.922910690307617, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8388252854347229, + "num_tokens": 96606383.0, + "step": 2529 + }, + { + "epoch": 0.3218420048339906, + "ewc_loss": 0.02383062243461609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1915311006305274e-05, + "grad_norm": 21.65036964416504, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.850601851940155, + "num_tokens": 96639826.0, + "step": 2530 + }, + { + "epoch": 0.3219692151125811, + "ewc_loss": 0.023805050179362297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.190252532978775e-05, + "grad_norm": 21.647790908813477, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8529511094093323, + "num_tokens": 96679151.0, + "step": 2531 + }, + { + "epoch": 0.3220964253911716, + "ewc_loss": 0.02386723831295967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1933619134651963e-05, + "grad_norm": 21.83868408203125, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8558318018913269, + "num_tokens": 96719365.0, + "step": 2532 + }, + { + "epoch": 0.32222363566976214, + "ewc_loss": 0.0238040778785944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1902038750122301e-05, + "grad_norm": 21.6171817779541, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8454447388648987, + "num_tokens": 96758318.0, + "step": 2533 + }, + { + "epoch": 0.3223508459483526, + "ewc_loss": 0.02381109818816185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1905549399671145e-05, + "grad_norm": 21.65184211730957, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8434509634971619, + "num_tokens": 96791159.0, + "step": 2534 + }, + { + "epoch": 0.32247805622694314, + "ewc_loss": 0.02389480546116829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1947402526857331e-05, + "grad_norm": 21.730148315429688, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 
0.8516104221343994, + "num_tokens": 96825352.0, + "step": 2535 + }, + { + "epoch": 0.32260526650553367, + "ewc_loss": 0.02388198859989643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.194099422718864e-05, + "grad_norm": 21.60782814025879, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8471283316612244, + "num_tokens": 96859700.0, + "step": 2536 + }, + { + "epoch": 0.32273247678412414, + "ewc_loss": 0.023869305849075317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1934653230127878e-05, + "grad_norm": 21.669939041137695, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8474700450897217, + "num_tokens": 96897077.0, + "step": 2537 + }, + { + "epoch": 0.3228596870627147, + "ewc_loss": 0.02387361042201519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1936805094592273e-05, + "grad_norm": 21.43544578552246, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.843523383140564, + "num_tokens": 96936936.0, + "step": 2538 + }, + { + "epoch": 0.3229868973413052, + "ewc_loss": 0.02393956109881401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1969780643994454e-05, + "grad_norm": 21.670757293701172, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8541635274887085, + "num_tokens": 96975702.0, + "step": 2539 + }, + { + "epoch": 0.3231141076198957, + "ewc_loss": 0.023992497473955154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.199624875880545e-05, + "grad_norm": 21.580642700195312, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8477966785430908, + "num_tokens": 97010585.0, + "step": 2540 + }, + { + "epoch": 0.3232413178984862, + "ewc_loss": 0.023949502035975456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1974751032539643e-05, + "grad_norm": 21.611082077026367, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8729905486106873, + "num_tokens": 97047210.0, + "step": 2541 + }, + { + "epoch": 0.32336852817707673, + "ewc_loss": 
0.02401784434914589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2008922567474656e-05, + "grad_norm": 21.621097564697266, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8435620069503784, + "num_tokens": 97084835.0, + "step": 2542 + }, + { + "epoch": 0.3234957384556672, + "ewc_loss": 0.023981578648090363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1990789062110707e-05, + "grad_norm": 21.601951599121094, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8463301658630371, + "num_tokens": 97125829.0, + "step": 2543 + }, + { + "epoch": 0.32362294873425773, + "ewc_loss": 0.024041520431637764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2020760550512932e-05, + "grad_norm": 21.63679313659668, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8433946371078491, + "num_tokens": 97162874.0, + "step": 2544 + }, + { + "epoch": 0.32375015901284826, + "ewc_loss": 0.024050075560808182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.202503790409537e-05, + "grad_norm": 21.767715454101562, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8525915741920471, + "num_tokens": 97197559.0, + "step": 2545 + }, + { + "epoch": 0.32387736929143873, + "ewc_loss": 0.023997344076633453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1998671652690973e-05, + "grad_norm": 21.56970977783203, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8361192941665649, + "num_tokens": 97227028.0, + "step": 2546 + }, + { + "epoch": 0.32400457957002926, + "ewc_loss": 0.02399933896958828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1999669368378818e-05, + "grad_norm": 21.714948654174805, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.844795823097229, + "num_tokens": 97270209.0, + "step": 2547 + }, + { + "epoch": 0.3241317898486198, + "ewc_loss": 0.024112632498145103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2056316336384043e-05, + "grad_norm": 
21.61378288269043, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8524556159973145, + "num_tokens": 97309176.0, + "step": 2548 + }, + { + "epoch": 0.32425900012721026, + "ewc_loss": 0.02401905320584774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2009526471956633e-05, + "grad_norm": 21.69916343688965, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8283274173736572, + "num_tokens": 97340464.0, + "step": 2549 + }, + { + "epoch": 0.3243862104058008, + "ewc_loss": 0.02406281977891922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2031409823975991e-05, + "grad_norm": 21.597475051879883, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.85178542137146, + "num_tokens": 97378435.0, + "step": 2550 + }, + { + "epoch": 0.3245134206843913, + "ewc_loss": 0.023995526134967804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.1997763067483902e-05, + "grad_norm": 21.590253829956055, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8455305099487305, + "num_tokens": 97417886.0, + "step": 2551 + }, + { + "epoch": 0.3246406309629818, + "ewc_loss": 0.024127449840307236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2063725080224685e-05, + "grad_norm": 21.63521957397461, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8514717817306519, + "num_tokens": 97450552.0, + "step": 2552 + }, + { + "epoch": 0.3247678412415723, + "ewc_loss": 0.024058619514107704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2029309800709598e-05, + "grad_norm": 21.569364547729492, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8553932905197144, + "num_tokens": 97488586.0, + "step": 2553 + }, + { + "epoch": 0.32489505152016285, + "ewc_loss": 0.02409336529672146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2046682968502864e-05, + "grad_norm": 21.564687728881836, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8437919616699219, + 
"num_tokens": 97528893.0, + "step": 2554 + }, + { + "epoch": 0.3250222617987533, + "ewc_loss": 0.024133002385497093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2066500858054496e-05, + "grad_norm": 21.578615188598633, + "learning_rate": 1e-06, + "loss": 0.5547, + "mean_token_accuracy": 0.8256149888038635, + "num_tokens": 97568654.0, + "step": 2555 + }, + { + "epoch": 0.32514947207734385, + "ewc_loss": 0.024145280942320824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2072640856786165e-05, + "grad_norm": 21.56853485107422, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.852552056312561, + "num_tokens": 97607068.0, + "step": 2556 + }, + { + "epoch": 0.3252766823559344, + "ewc_loss": 0.024149147793650627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2074573533027433e-05, + "grad_norm": 21.691572189331055, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8706986904144287, + "num_tokens": 97647505.0, + "step": 2557 + }, + { + "epoch": 0.32540389263452485, + "ewc_loss": 0.02414039522409439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2070197954017203e-05, + "grad_norm": 21.55525016784668, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8425038456916809, + "num_tokens": 97690505.0, + "step": 2558 + }, + { + "epoch": 0.3255311029131154, + "ewc_loss": 0.024147681891918182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2073841389792506e-05, + "grad_norm": 21.710777282714844, + "learning_rate": 1e-06, + "loss": 0.5374, + "mean_token_accuracy": 0.8372728228569031, + "num_tokens": 97723955.0, + "step": 2559 + }, + { + "epoch": 0.3256583131917059, + "ewc_loss": 0.024127881973981857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2063940630469006e-05, + "grad_norm": 21.54299545288086, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8432396054267883, + "num_tokens": 97763822.0, + "step": 2560 + }, + { + "epoch": 0.3257855234702964, + "ewc_loss": 0.024109411984682083, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2054705621267203e-05, + "grad_norm": 21.664663314819336, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8443735837936401, + "num_tokens": 97803827.0, + "step": 2561 + }, + { + "epoch": 0.3259127337488869, + "ewc_loss": 0.02416321262717247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2081606655556243e-05, + "grad_norm": 21.670372009277344, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8479288816452026, + "num_tokens": 97841056.0, + "step": 2562 + }, + { + "epoch": 0.32603994402747744, + "ewc_loss": 0.024110402911901474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.205520129587967e-05, + "grad_norm": 21.625289916992188, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8666994571685791, + "num_tokens": 97881714.0, + "step": 2563 + }, + { + "epoch": 0.3261671543060679, + "ewc_loss": 0.024115631356835365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2057816093147267e-05, + "grad_norm": 21.647106170654297, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8455650806427002, + "num_tokens": 97925797.0, + "step": 2564 + }, + { + "epoch": 0.32629436458465844, + "ewc_loss": 0.024183014407753944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2091507414879743e-05, + "grad_norm": 21.751487731933594, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8532342910766602, + "num_tokens": 97963638.0, + "step": 2565 + }, + { + "epoch": 0.32642157486324896, + "ewc_loss": 0.024114230647683144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2057115782226902e-05, + "grad_norm": 21.695383071899414, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8424832820892334, + "num_tokens": 98002043.0, + "step": 2566 + }, + { + "epoch": 0.32654878514183944, + "ewc_loss": 0.02412460930645466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2062304449500516e-05, + "grad_norm": 21.662710189819336, + 
"learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8506884574890137, + "num_tokens": 98041031.0, + "step": 2567 + }, + { + "epoch": 0.32667599542042997, + "ewc_loss": 0.024114307016134262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2057153071509674e-05, + "grad_norm": 21.747285842895508, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8567298650741577, + "num_tokens": 98075510.0, + "step": 2568 + }, + { + "epoch": 0.3268032056990205, + "ewc_loss": 0.02410346269607544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2051731573592406e-05, + "grad_norm": 21.678970336914062, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8539826273918152, + "num_tokens": 98114162.0, + "step": 2569 + }, + { + "epoch": 0.32693041597761097, + "ewc_loss": 0.02413339726626873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2066698218404781e-05, + "grad_norm": 21.751850128173828, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8433184623718262, + "num_tokens": 98148722.0, + "step": 2570 + }, + { + "epoch": 0.3270576262562015, + "ewc_loss": 0.024092191830277443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.204609634442022e-05, + "grad_norm": 21.6256160736084, + "learning_rate": 1e-06, + "loss": 0.5553, + "mean_token_accuracy": 0.8278538584709167, + "num_tokens": 98188255.0, + "step": 2571 + }, + { + "epoch": 0.327184836534792, + "ewc_loss": 0.024069080129265785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2034540304739494e-05, + "grad_norm": 21.69318199157715, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8590342402458191, + "num_tokens": 98228597.0, + "step": 2572 + }, + { + "epoch": 0.3273120468133825, + "ewc_loss": 0.02413015626370907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2065078408340923e-05, + "grad_norm": 21.664703369140625, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8396973013877869, + "num_tokens": 98273405.0, + 
"step": 2573 + }, + { + "epoch": 0.327439257091973, + "ewc_loss": 0.024086683988571167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2043342394463252e-05, + "grad_norm": 21.61806297302246, + "learning_rate": 1e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8334373235702515, + "num_tokens": 98315091.0, + "step": 2574 + }, + { + "epoch": 0.32756646737056355, + "ewc_loss": 0.02411561645567417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.205780790769495e-05, + "grad_norm": 21.68254280090332, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8497772216796875, + "num_tokens": 98356967.0, + "step": 2575 + }, + { + "epoch": 0.327693677649154, + "ewc_loss": 0.02415374293923378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2076871826138813e-05, + "grad_norm": 21.77703857421875, + "learning_rate": 1e-06, + "loss": 0.5142, + "mean_token_accuracy": 0.8393024206161499, + "num_tokens": 98394455.0, + "step": 2576 + }, + { + "epoch": 0.32782088792774455, + "ewc_loss": 0.024107277393341064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2053638783982024e-05, + "grad_norm": 21.638870239257812, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.857174277305603, + "num_tokens": 98430455.0, + "step": 2577 + }, + { + "epoch": 0.3279480982063351, + "ewc_loss": 0.024084093049168587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2042046364513226e-05, + "grad_norm": 21.73714256286621, + "learning_rate": 1e-06, + "loss": 0.5224, + "mean_token_accuracy": 0.8356566429138184, + "num_tokens": 98472218.0, + "step": 2578 + }, + { + "epoch": 0.3280753084849256, + "ewc_loss": 0.024121657013893127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2060828339599539e-05, + "grad_norm": 21.725515365600586, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8553520441055298, + "num_tokens": 98509540.0, + "step": 2579 + }, + { + "epoch": 0.3282025187635161, + "ewc_loss": 0.02415013685822487, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.2075068298145197e-05, + "grad_norm": 21.766630172729492, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8479466438293457, + "num_tokens": 98544761.0, + "step": 2580 + }, + { + "epoch": 0.3283297290421066, + "ewc_loss": 0.024106351658701897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2053175851178821e-05, + "grad_norm": 21.71973991394043, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8609145879745483, + "num_tokens": 98580522.0, + "step": 2581 + }, + { + "epoch": 0.32845693932069714, + "ewc_loss": 0.024091074243187904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.204553700517863e-05, + "grad_norm": 21.71651268005371, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8569680452346802, + "num_tokens": 98612988.0, + "step": 2582 + }, + { + "epoch": 0.3285841495992876, + "ewc_loss": 0.024137580767273903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2068790056218859e-05, + "grad_norm": 21.766740798950195, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8368577361106873, + "num_tokens": 98654213.0, + "step": 2583 + }, + { + "epoch": 0.32871135987787814, + "ewc_loss": 0.024092763662338257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2046381925756577e-05, + "grad_norm": 21.585180282592773, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8520417213439941, + "num_tokens": 98697168.0, + "step": 2584 + }, + { + "epoch": 0.32883857015646867, + "ewc_loss": 0.024108875542879105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.205443732033018e-05, + "grad_norm": 21.7935791015625, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8543277978897095, + "num_tokens": 98735664.0, + "step": 2585 + }, + { + "epoch": 0.32896578043505914, + "ewc_loss": 0.024155380204319954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2077690371370409e-05, + "grad_norm": 21.61328887939453, + "learning_rate": 1e-06, + "loss": 
0.4938, + "mean_token_accuracy": 0.8429595828056335, + "num_tokens": 98777257.0, + "step": 2586 + }, + { + "epoch": 0.32909299071364967, + "ewc_loss": 0.024087073281407356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2043537026329432e-05, + "grad_norm": 21.697113037109375, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8527101278305054, + "num_tokens": 98812443.0, + "step": 2587 + }, + { + "epoch": 0.3292202009922402, + "ewc_loss": 0.024188192561268806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2094095836800989e-05, + "grad_norm": 21.827972412109375, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8574008941650391, + "num_tokens": 98848231.0, + "step": 2588 + }, + { + "epoch": 0.32934741127083067, + "ewc_loss": 0.02411951869726181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2059759683324955e-05, + "grad_norm": 21.690061569213867, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8533920049667358, + "num_tokens": 98886487.0, + "step": 2589 + }, + { + "epoch": 0.3294746215494212, + "ewc_loss": 0.024112455546855927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2056228115397971e-05, + "grad_norm": 21.719675064086914, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8389812707901001, + "num_tokens": 98924750.0, + "step": 2590 + }, + { + "epoch": 0.3296018318280117, + "ewc_loss": 0.024149060249328613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2074529877281748e-05, + "grad_norm": 21.60621452331543, + "learning_rate": 1e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8269257545471191, + "num_tokens": 98964151.0, + "step": 2591 + }, + { + "epoch": 0.3297290421066022, + "ewc_loss": 0.0241681020706892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2084051377314609e-05, + "grad_norm": 21.827518463134766, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8594075441360474, + "num_tokens": 99000277.0, + "step": 2592 + }, + { + "epoch": 
0.3298562523851927, + "ewc_loss": 0.02418443001806736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2092215001757722e-05, + "grad_norm": 21.6536922454834, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8394491672515869, + "num_tokens": 99042335.0, + "step": 2593 + }, + { + "epoch": 0.32998346266378326, + "ewc_loss": 0.024132337421178818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.206616889248835e-05, + "grad_norm": 21.67267608642578, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.847261905670166, + "num_tokens": 99082964.0, + "step": 2594 + }, + { + "epoch": 0.33011067294237373, + "ewc_loss": 0.024191096425056458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.209554829983972e-05, + "grad_norm": 21.647075653076172, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8717191219329834, + "num_tokens": 99123440.0, + "step": 2595 + }, + { + "epoch": 0.33023788322096426, + "ewc_loss": 0.02423136867582798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2115684512536973e-05, + "grad_norm": 21.830360412597656, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8471651077270508, + "num_tokens": 99167047.0, + "step": 2596 + }, + { + "epoch": 0.3303650934995548, + "ewc_loss": 0.024189570918679237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2094785233784933e-05, + "grad_norm": 21.724037170410156, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8459711074829102, + "num_tokens": 99202594.0, + "step": 2597 + }, + { + "epoch": 0.33049230377814526, + "ewc_loss": 0.024171628057956696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2085813978046644e-05, + "grad_norm": 21.653472900390625, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8550268411636353, + "num_tokens": 99244732.0, + "step": 2598 + }, + { + "epoch": 0.3306195140567358, + "ewc_loss": 0.024133983999490738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2066991985193454e-05, + "grad_norm": 21.660470962524414, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8596606254577637, + "num_tokens": 99287161.0, + "step": 2599 + }, + { + "epoch": 0.3307467243353263, + "ewc_loss": 0.024155447259545326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2077724022674374e-05, + "grad_norm": 21.5490665435791, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8424028158187866, + "num_tokens": 99322650.0, + "step": 2600 + }, + { + "epoch": 0.3308739346139168, + "ewc_loss": 0.024145975708961487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2072988283762243e-05, + "grad_norm": 21.717323303222656, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8497014045715332, + "num_tokens": 99358529.0, + "step": 2601 + }, + { + "epoch": 0.3310011448925073, + "ewc_loss": 0.024244407191872597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2122203770559281e-05, + "grad_norm": 21.62302017211914, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8581441044807434, + "num_tokens": 99394010.0, + "step": 2602 + }, + { + "epoch": 0.33112835517109784, + "ewc_loss": 0.024150550365447998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.20752747534425e-05, + "grad_norm": 21.59629249572754, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8524290919303894, + "num_tokens": 99434764.0, + "step": 2603 + }, + { + "epoch": 0.3312555654496883, + "ewc_loss": 0.024252187460660934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2126093679398764e-05, + "grad_norm": 21.616640090942383, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8509470820426941, + "num_tokens": 99481322.0, + "step": 2604 + }, + { + "epoch": 0.33138277572827884, + "ewc_loss": 0.024230442941188812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.211522157973377e-05, + "grad_norm": 21.657150268554688, + "learning_rate": 1e-06, + "loss": 0.525, + 
"mean_token_accuracy": 0.8402804136276245, + "num_tokens": 99517719.0, + "step": 2605 + }, + { + "epoch": 0.3315099860068694, + "ewc_loss": 0.024261102080345154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2130551112932153e-05, + "grad_norm": 21.6528377532959, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8517708778381348, + "num_tokens": 99553662.0, + "step": 2606 + }, + { + "epoch": 0.33163719628545985, + "ewc_loss": 0.024313725531101227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2156862794654444e-05, + "grad_norm": 21.767040252685547, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8493399024009705, + "num_tokens": 99589195.0, + "step": 2607 + }, + { + "epoch": 0.3317644065640504, + "ewc_loss": 0.024311382323503494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.215569136547856e-05, + "grad_norm": 21.730106353759766, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8509510159492493, + "num_tokens": 99626092.0, + "step": 2608 + }, + { + "epoch": 0.3318916168426409, + "ewc_loss": 0.02424144372344017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2120722203690093e-05, + "grad_norm": 21.58865737915039, + "learning_rate": 1e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.8247352242469788, + "num_tokens": 99667087.0, + "step": 2609 + }, + { + "epoch": 0.3320188271212314, + "ewc_loss": 0.0243037398904562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.215186966874171e-05, + "grad_norm": 21.633237838745117, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8448494672775269, + "num_tokens": 99705929.0, + "step": 2610 + }, + { + "epoch": 0.3321460373998219, + "ewc_loss": 0.024299556389451027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2149777830927633e-05, + "grad_norm": 21.624414443969727, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8451193571090698, + "num_tokens": 99742044.0, + "step": 2611 + }, + { + "epoch": 
0.33227324767841243, + "ewc_loss": 0.024342693388462067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2171346497780178e-05, + "grad_norm": 21.72296905517578, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8528071641921997, + "num_tokens": 99772621.0, + "step": 2612 + }, + { + "epoch": 0.3324004579570029, + "ewc_loss": 0.02433568798005581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.216784403368365e-05, + "grad_norm": 21.533123016357422, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8549412488937378, + "num_tokens": 99814405.0, + "step": 2613 + }, + { + "epoch": 0.33252766823559343, + "ewc_loss": 0.02433795854449272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2168979083071463e-05, + "grad_norm": 21.63799285888672, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8592774868011475, + "num_tokens": 99852197.0, + "step": 2614 + }, + { + "epoch": 0.33265487851418396, + "ewc_loss": 0.024381183087825775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2190591405669693e-05, + "grad_norm": 21.634225845336914, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8453588485717773, + "num_tokens": 99894254.0, + "step": 2615 + }, + { + "epoch": 0.33278208879277443, + "ewc_loss": 0.02434077300131321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2170386071375106e-05, + "grad_norm": 21.639053344726562, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8432137966156006, + "num_tokens": 99936230.0, + "step": 2616 + }, + { + "epoch": 0.33290929907136496, + "ewc_loss": 0.02438187785446644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.219093883264577e-05, + "grad_norm": 21.62897491455078, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8507544994354248, + "num_tokens": 99974964.0, + "step": 2617 + }, + { + "epoch": 0.3330365093499555, + "ewc_loss": 0.024386519566178322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2193259863124695e-05, + "grad_norm": 21.64436149597168, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8399627208709717, + "num_tokens": 100018750.0, + "step": 2618 + }, + { + "epoch": 0.33316371962854596, + "ewc_loss": 0.02441953495144844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2209767191961873e-05, + "grad_norm": 21.7237606048584, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8446295261383057, + "num_tokens": 100054064.0, + "step": 2619 + }, + { + "epoch": 0.3332909299071365, + "ewc_loss": 0.024420497938990593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2210249224153813e-05, + "grad_norm": 21.713085174560547, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8538805842399597, + "num_tokens": 100092549.0, + "step": 2620 + }, + { + "epoch": 0.333418140185727, + "ewc_loss": 0.02436717413365841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.218358738697134e-05, + "grad_norm": 21.695568084716797, + "learning_rate": 1e-06, + "loss": 0.5613, + "mean_token_accuracy": 0.8231244683265686, + "num_tokens": 100123973.0, + "step": 2621 + }, + { + "epoch": 0.3335453504643175, + "ewc_loss": 0.024404795840382576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2202397556393407e-05, + "grad_norm": 21.725189208984375, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8479728698730469, + "num_tokens": 100162142.0, + "step": 2622 + }, + { + "epoch": 0.333672560742908, + "ewc_loss": 0.02438982017338276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.219490968651371e-05, + "grad_norm": 21.690942764282227, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8439042568206787, + "num_tokens": 100199352.0, + "step": 2623 + }, + { + "epoch": 0.33379977102149855, + "ewc_loss": 0.02437441609799862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2187208085379098e-05, + "grad_norm": 21.683977127075195, + "learning_rate": 1e-06, + "loss": 0.5307, + 
"mean_token_accuracy": 0.8339710235595703, + "num_tokens": 100237446.0, + "step": 2624 + }, + { + "epoch": 0.333926981300089, + "ewc_loss": 0.02439543791115284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2197719115647487e-05, + "grad_norm": 21.67945098876953, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8544878363609314, + "num_tokens": 100272986.0, + "step": 2625 + }, + { + "epoch": 0.33405419157867955, + "ewc_loss": 0.024376533925533295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2188266737211961e-05, + "grad_norm": 21.623506546020508, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8403620719909668, + "num_tokens": 100304998.0, + "step": 2626 + }, + { + "epoch": 0.3341814018572701, + "ewc_loss": 0.024414949119091034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2207474355818704e-05, + "grad_norm": 21.762359619140625, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8455047607421875, + "num_tokens": 100338473.0, + "step": 2627 + }, + { + "epoch": 0.33430861213586055, + "ewc_loss": 0.02447454445064068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.223727213073289e-05, + "grad_norm": 21.748519897460938, + "learning_rate": 1e-06, + "loss": 0.5475, + "mean_token_accuracy": 0.8319047689437866, + "num_tokens": 100372644.0, + "step": 2628 + }, + { + "epoch": 0.3344358224144511, + "ewc_loss": 0.02443249709904194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2216248251206707e-05, + "grad_norm": 21.640098571777344, + "learning_rate": 1e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.8292319774627686, + "num_tokens": 100413248.0, + "step": 2629 + }, + { + "epoch": 0.3345630326930416, + "ewc_loss": 0.0244806706905365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2240335308888461e-05, + "grad_norm": 21.89242172241211, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8417340517044067, + "num_tokens": 100448022.0, + "step": 2630 + }, + { + "epoch": 
0.33469024297163213, + "ewc_loss": 0.02446455880999565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2232279004820157e-05, + "grad_norm": 21.65382957458496, + "learning_rate": 1e-06, + "loss": 0.5238, + "mean_token_accuracy": 0.8372758626937866, + "num_tokens": 100493222.0, + "step": 2631 + }, + { + "epoch": 0.3348174532502226, + "ewc_loss": 0.02443460188806057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2217300536576658e-05, + "grad_norm": 21.694948196411133, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8545446991920471, + "num_tokens": 100534736.0, + "step": 2632 + }, + { + "epoch": 0.33494466352881314, + "ewc_loss": 0.024514954537153244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2257477465027478e-05, + "grad_norm": 21.81249237060547, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8568867444992065, + "num_tokens": 100576754.0, + "step": 2633 + }, + { + "epoch": 0.33507187380740366, + "ewc_loss": 0.024470122531056404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.223506114911288e-05, + "grad_norm": 21.752721786499023, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8392099142074585, + "num_tokens": 100611407.0, + "step": 2634 + }, + { + "epoch": 0.33519908408599414, + "ewc_loss": 0.024501245468854904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2250622603460215e-05, + "grad_norm": 21.711130142211914, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8548148274421692, + "num_tokens": 100643531.0, + "step": 2635 + }, + { + "epoch": 0.33532629436458466, + "ewc_loss": 0.02449874021112919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2249370229255874e-05, + "grad_norm": 21.74047088623047, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8496180176734924, + "num_tokens": 100678612.0, + "step": 2636 + }, + { + "epoch": 0.3354535046431752, + "ewc_loss": 0.0244735199958086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2236760085215792e-05, + "grad_norm": 21.753463745117188, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8530603647232056, + "num_tokens": 100718873.0, + "step": 2637 + }, + { + "epoch": 0.33558071492176567, + "ewc_loss": 0.02447817474603653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2239087482157629e-05, + "grad_norm": 21.644487380981445, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.847224771976471, + "num_tokens": 100759883.0, + "step": 2638 + }, + { + "epoch": 0.3357079252003562, + "ewc_loss": 0.02444401942193508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2222009900142439e-05, + "grad_norm": 21.735536575317383, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8281601071357727, + "num_tokens": 100800022.0, + "step": 2639 + }, + { + "epoch": 0.3358351354789467, + "ewc_loss": 0.024528540670871735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.226427048095502e-05, + "grad_norm": 21.74291229248047, + "learning_rate": 1e-06, + "loss": 0.5378, + "mean_token_accuracy": 0.8332473635673523, + "num_tokens": 100837299.0, + "step": 2640 + }, + { + "epoch": 0.3359623457575372, + "ewc_loss": 0.024539515376091003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2269757462490816e-05, + "grad_norm": 21.770532608032227, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8438056707382202, + "num_tokens": 100868433.0, + "step": 2641 + }, + { + "epoch": 0.3360895560361277, + "ewc_loss": 0.024520182982087135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2260091352800373e-05, + "grad_norm": 21.733619689941406, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8346232175827026, + "num_tokens": 100907768.0, + "step": 2642 + }, + { + "epoch": 0.33621676631471825, + "ewc_loss": 0.02454443834722042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2272219464648515e-05, + "grad_norm": 21.713533401489258, + "learning_rate": 1e-06, + "loss": 0.5581, + 
"mean_token_accuracy": 0.8260531425476074, + "num_tokens": 100942723.0, + "step": 2643 + }, + { + "epoch": 0.3363439765933087, + "ewc_loss": 0.02452482283115387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2262411473784596e-05, + "grad_norm": 21.76099967956543, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8548480272293091, + "num_tokens": 100984997.0, + "step": 2644 + }, + { + "epoch": 0.33647118687189925, + "ewc_loss": 0.024572759866714478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.228638029715512e-05, + "grad_norm": 21.780027389526367, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8636250495910645, + "num_tokens": 101026290.0, + "step": 2645 + }, + { + "epoch": 0.3365983971504898, + "ewc_loss": 0.024528594687581062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2264297765796073e-05, + "grad_norm": 21.86652183532715, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8423023223876953, + "num_tokens": 101066766.0, + "step": 2646 + }, + { + "epoch": 0.33672560742908025, + "ewc_loss": 0.024508293718099594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2254146895429585e-05, + "grad_norm": 21.747806549072266, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8526190519332886, + "num_tokens": 101100833.0, + "step": 2647 + }, + { + "epoch": 0.3368528177076708, + "ewc_loss": 0.024472687393426895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.223634353664238e-05, + "grad_norm": 21.7058048248291, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8596808910369873, + "num_tokens": 101145552.0, + "step": 2648 + }, + { + "epoch": 0.3369800279862613, + "ewc_loss": 0.02454129047691822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2270645129319746e-05, + "grad_norm": 21.814739227294922, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8504496812820435, + "num_tokens": 101186644.0, + "step": 2649 + }, + { + "epoch": 
0.3371072382648518, + "ewc_loss": 0.024505555629730225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2252778105903417e-05, + "grad_norm": 21.7736873626709, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8455685377120972, + "num_tokens": 101223356.0, + "step": 2650 + }, + { + "epoch": 0.3372344485434423, + "ewc_loss": 0.02446778118610382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2233890629431698e-05, + "grad_norm": 21.74775505065918, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8607522249221802, + "num_tokens": 101260527.0, + "step": 2651 + }, + { + "epoch": 0.33736165882203284, + "ewc_loss": 0.024486863985657692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2243432138347998e-05, + "grad_norm": 21.71799659729004, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8487711548805237, + "num_tokens": 101305403.0, + "step": 2652 + }, + { + "epoch": 0.3374888691006233, + "ewc_loss": 0.02447463572025299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2237317605467979e-05, + "grad_norm": 21.68680763244629, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8340966701507568, + "num_tokens": 101345858.0, + "step": 2653 + }, + { + "epoch": 0.33761607937921384, + "ewc_loss": 0.024546699598431587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2273349966562819e-05, + "grad_norm": 21.9791316986084, + "learning_rate": 1e-06, + "loss": 0.553, + "mean_token_accuracy": 0.8240909576416016, + "num_tokens": 101380321.0, + "step": 2654 + }, + { + "epoch": 0.33774328965780437, + "ewc_loss": 0.024498311802744865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2249155588506255e-05, + "grad_norm": 21.847509384155273, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.857377290725708, + "num_tokens": 101419120.0, + "step": 2655 + }, + { + "epoch": 0.33787049993639484, + "ewc_loss": 0.024418994784355164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2209497072035447e-05, + "grad_norm": 21.87412452697754, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8496881723403931, + "num_tokens": 101453272.0, + "step": 2656 + }, + { + "epoch": 0.33799771021498537, + "ewc_loss": 0.024519557133316994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2259778486622963e-05, + "grad_norm": 21.84049415588379, + "learning_rate": 1e-06, + "loss": 0.5257, + "mean_token_accuracy": 0.8379203081130981, + "num_tokens": 101486879.0, + "step": 2657 + }, + { + "epoch": 0.3381249204935759, + "ewc_loss": 0.024432018399238586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2216009054100141e-05, + "grad_norm": 21.68894386291504, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8409628868103027, + "num_tokens": 101523691.0, + "step": 2658 + }, + { + "epoch": 0.33825213077216637, + "ewc_loss": 0.024491526186466217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2245763173268642e-05, + "grad_norm": 21.79332733154297, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8500435948371887, + "num_tokens": 101555279.0, + "step": 2659 + }, + { + "epoch": 0.3383793410507569, + "ewc_loss": 0.02451501227915287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2257506568857934e-05, + "grad_norm": 21.71050453186035, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8649746775627136, + "num_tokens": 101591780.0, + "step": 2660 + }, + { + "epoch": 0.3385065513293474, + "ewc_loss": 0.02449854463338852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2249272003828082e-05, + "grad_norm": 21.762561798095703, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8650200366973877, + "num_tokens": 101631041.0, + "step": 2661 + }, + { + "epoch": 0.3386337616079379, + "ewc_loss": 0.024555761367082596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2277881069167051e-05, + "grad_norm": 21.757692337036133, + "learning_rate": 1e-06, + "loss": 0.5161, + 
"mean_token_accuracy": 0.8382225632667542, + "num_tokens": 101666315.0, + "step": 2662 + }, + { + "epoch": 0.33876097188652843, + "ewc_loss": 0.024572674185037613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2286336641409434e-05, + "grad_norm": 21.893369674682617, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8624475002288818, + "num_tokens": 101704046.0, + "step": 2663 + }, + { + "epoch": 0.33888818216511896, + "ewc_loss": 0.024547576904296875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2273788343009073e-05, + "grad_norm": 21.728757858276367, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8589075207710266, + "num_tokens": 101743076.0, + "step": 2664 + }, + { + "epoch": 0.33901539244370943, + "ewc_loss": 0.024553803727030754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2276901543373242e-05, + "grad_norm": 21.90245819091797, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8623201251029968, + "num_tokens": 101784006.0, + "step": 2665 + }, + { + "epoch": 0.33914260272229996, + "ewc_loss": 0.02462111972272396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2310560123296455e-05, + "grad_norm": 21.823232650756836, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8553102612495422, + "num_tokens": 101821845.0, + "step": 2666 + }, + { + "epoch": 0.3392698130008905, + "ewc_loss": 0.024526365101337433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.22631827252917e-05, + "grad_norm": 21.809417724609375, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8548425436019897, + "num_tokens": 101861121.0, + "step": 2667 + }, + { + "epoch": 0.33939702327948096, + "ewc_loss": 0.02457287907600403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2286439414310735e-05, + "grad_norm": 21.770601272583008, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8449145555496216, + "num_tokens": 101894330.0, + "step": 2668 + }, + { + "epoch": 
0.3395242335580715, + "ewc_loss": 0.024545874446630478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2272937055968214e-05, + "grad_norm": 21.825634002685547, + "learning_rate": 1e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8335040807723999, + "num_tokens": 101934835.0, + "step": 2669 + }, + { + "epoch": 0.339651443836662, + "ewc_loss": 0.02458050101995468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2290250197111163e-05, + "grad_norm": 21.78929901123047, + "learning_rate": 1e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8316243290901184, + "num_tokens": 101970943.0, + "step": 2670 + }, + { + "epoch": 0.3397786541152525, + "ewc_loss": 0.024515565484762192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2257783055247273e-05, + "grad_norm": 21.822248458862305, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8372151851654053, + "num_tokens": 102001211.0, + "step": 2671 + }, + { + "epoch": 0.339905864393843, + "ewc_loss": 0.024584703147411346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.229235203936696e-05, + "grad_norm": 21.754703521728516, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8552626371383667, + "num_tokens": 102039399.0, + "step": 2672 + }, + { + "epoch": 0.34003307467243354, + "ewc_loss": 0.024550439789891243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2275219887669664e-05, + "grad_norm": 21.902881622314453, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8377733826637268, + "num_tokens": 102070635.0, + "step": 2673 + }, + { + "epoch": 0.340160284951024, + "ewc_loss": 0.024587182328104973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2293590771150775e-05, + "grad_norm": 21.74352264404297, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8536105155944824, + "num_tokens": 102104590.0, + "step": 2674 + }, + { + "epoch": 0.34028749522961454, + "ewc_loss": 0.024526100605726242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.226304993906524e-05, + "grad_norm": 21.806739807128906, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8547351360321045, + "num_tokens": 102149267.0, + "step": 2675 + }, + { + "epoch": 0.3404147055082051, + "ewc_loss": 0.024583403021097183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2291701750655193e-05, + "grad_norm": 21.638092041015625, + "learning_rate": 1e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8336724042892456, + "num_tokens": 102188089.0, + "step": 2676 + }, + { + "epoch": 0.34054191578679555, + "ewc_loss": 0.024612871930003166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2306435564823914e-05, + "grad_norm": 21.833826065063477, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8560221195220947, + "num_tokens": 102224335.0, + "step": 2677 + }, + { + "epoch": 0.3406691260653861, + "ewc_loss": 0.024640390649437904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2320195310167037e-05, + "grad_norm": 21.73534393310547, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8531926870346069, + "num_tokens": 102263421.0, + "step": 2678 + }, + { + "epoch": 0.3407963363439766, + "ewc_loss": 0.024583401158452034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2291700841160491e-05, + "grad_norm": 21.775705337524414, + "learning_rate": 1e-06, + "loss": 0.5452, + "mean_token_accuracy": 0.8322615623474121, + "num_tokens": 102301133.0, + "step": 2679 + }, + { + "epoch": 0.34092354662256713, + "ewc_loss": 0.02461230754852295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2306153621466365e-05, + "grad_norm": 21.8271541595459, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8534770011901855, + "num_tokens": 102338516.0, + "step": 2680 + }, + { + "epoch": 0.3410507569011576, + "ewc_loss": 0.0246100053191185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2305003110668622e-05, + "grad_norm": 21.73332405090332, + "learning_rate": 1e-06, + "loss": 0.5052, + 
"mean_token_accuracy": 0.8394244909286499, + "num_tokens": 102376010.0, + "step": 2681 + }, + { + "epoch": 0.34117796717974813, + "ewc_loss": 0.024648653343319893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.232432623510249e-05, + "grad_norm": 21.921293258666992, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8708187937736511, + "num_tokens": 102417080.0, + "step": 2682 + }, + { + "epoch": 0.34130517745833866, + "ewc_loss": 0.024622522294521332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2311261343711521e-05, + "grad_norm": 22.005064010620117, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8513500690460205, + "num_tokens": 102458042.0, + "step": 2683 + }, + { + "epoch": 0.34143238773692913, + "ewc_loss": 0.024588340893387794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2294170119275805e-05, + "grad_norm": 21.856834411621094, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8568668365478516, + "num_tokens": 102497152.0, + "step": 2684 + }, + { + "epoch": 0.34155959801551966, + "ewc_loss": 0.024505093693733215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2252547094249167e-05, + "grad_norm": 21.695890426635742, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8465738892555237, + "num_tokens": 102531486.0, + "step": 2685 + }, + { + "epoch": 0.3416868082941102, + "ewc_loss": 0.024612758308649063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2306379176152404e-05, + "grad_norm": 21.80120277404785, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8474645614624023, + "num_tokens": 102571552.0, + "step": 2686 + }, + { + "epoch": 0.34181401857270066, + "ewc_loss": 0.024545440450310707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2272720596229192e-05, + "grad_norm": 21.768999099731445, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8547093868255615, + "num_tokens": 102607338.0, + "step": 2687 + }, + { + "epoch": 
0.3419412288512912, + "ewc_loss": 0.024565178900957108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.228258952323813e-05, + "grad_norm": 21.764324188232422, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8488718271255493, + "num_tokens": 102654445.0, + "step": 2688 + }, + { + "epoch": 0.3420684391298817, + "ewc_loss": 0.02460768073797226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2303840776439756e-05, + "grad_norm": 21.883703231811523, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8567578792572021, + "num_tokens": 102695265.0, + "step": 2689 + }, + { + "epoch": 0.3421956494084722, + "ewc_loss": 0.02458059787750244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2290299309825059e-05, + "grad_norm": 21.731782913208008, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8533825874328613, + "num_tokens": 102728621.0, + "step": 2690 + }, + { + "epoch": 0.3423228596870627, + "ewc_loss": 0.024577926844358444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2288963262108155e-05, + "grad_norm": 21.81576156616211, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8428360223770142, + "num_tokens": 102766378.0, + "step": 2691 + }, + { + "epoch": 0.34245006996565325, + "ewc_loss": 0.024614399299025536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2307199540373404e-05, + "grad_norm": 21.872177124023438, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8476453423500061, + "num_tokens": 102806331.0, + "step": 2692 + }, + { + "epoch": 0.3425772802442437, + "ewc_loss": 0.024596059694886208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2298030014790129e-05, + "grad_norm": 21.664932250976562, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8523136377334595, + "num_tokens": 102845314.0, + "step": 2693 + }, + { + "epoch": 0.34270449052283425, + "ewc_loss": 0.024666814133524895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.233340663020499e-05, + "grad_norm": 21.922754287719727, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8477612137794495, + "num_tokens": 102888614.0, + "step": 2694 + }, + { + "epoch": 0.3428317008014248, + "ewc_loss": 0.024665413424372673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2332706319284625e-05, + "grad_norm": 21.715003967285156, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8442027568817139, + "num_tokens": 102925988.0, + "step": 2695 + }, + { + "epoch": 0.34295891108001525, + "ewc_loss": 0.024668429046869278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2334214261500165e-05, + "grad_norm": 21.883895874023438, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8577829003334045, + "num_tokens": 102963150.0, + "step": 2696 + }, + { + "epoch": 0.3430861213586058, + "ewc_loss": 0.024658260866999626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2329130186117254e-05, + "grad_norm": 21.727855682373047, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.856208324432373, + "num_tokens": 102999164.0, + "step": 2697 + }, + { + "epoch": 0.3432133316371963, + "ewc_loss": 0.024647612124681473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2323806004133075e-05, + "grad_norm": 21.85356330871582, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.857474684715271, + "num_tokens": 103034337.0, + "step": 2698 + }, + { + "epoch": 0.3433405419157868, + "ewc_loss": 0.024689404293894768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.234470255440101e-05, + "grad_norm": 21.879125595092773, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8557206988334656, + "num_tokens": 103075062.0, + "step": 2699 + }, + { + "epoch": 0.3434677521943773, + "ewc_loss": 0.024683114141225815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.234155752172228e-05, + "grad_norm": 21.85457420349121, + "learning_rate": 1e-06, + "loss": 0.5076, + 
"mean_token_accuracy": 0.8443804979324341, + "num_tokens": 103113209.0, + "step": 2700 + }, + { + "epoch": 0.34359496247296784, + "ewc_loss": 0.02464180253446102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2320901078055613e-05, + "grad_norm": 21.68165397644043, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8538970947265625, + "num_tokens": 103153117.0, + "step": 2701 + }, + { + "epoch": 0.3437221727515583, + "ewc_loss": 0.024634528905153275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2317264918237925e-05, + "grad_norm": 21.794267654418945, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.848676860332489, + "num_tokens": 103192234.0, + "step": 2702 + }, + { + "epoch": 0.34384938303014884, + "ewc_loss": 0.024702146649360657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.235107356478693e-05, + "grad_norm": 21.767147064208984, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8473054766654968, + "num_tokens": 103224958.0, + "step": 2703 + }, + { + "epoch": 0.34397659330873936, + "ewc_loss": 0.024734746664762497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2367373528832104e-05, + "grad_norm": 21.814895629882812, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8514106273651123, + "num_tokens": 103259679.0, + "step": 2704 + }, + { + "epoch": 0.34410380358732984, + "ewc_loss": 0.02475166507065296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2375832739053294e-05, + "grad_norm": 21.918283462524414, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.840613842010498, + "num_tokens": 103299987.0, + "step": 2705 + }, + { + "epoch": 0.34423101386592037, + "ewc_loss": 0.024710379540920258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2355189937807154e-05, + "grad_norm": 21.787708282470703, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.836213231086731, + "num_tokens": 103344457.0, + "step": 2706 + }, + { + "epoch": 
0.3443582241445109, + "ewc_loss": 0.024672871455550194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2336436157056596e-05, + "grad_norm": 21.82794952392578, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8540105223655701, + "num_tokens": 103385173.0, + "step": 2707 + }, + { + "epoch": 0.34448543442310137, + "ewc_loss": 0.024709012359380722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.235450599779142e-05, + "grad_norm": 21.83769416809082, + "learning_rate": 1e-06, + "loss": 0.53, + "mean_token_accuracy": 0.8352893590927124, + "num_tokens": 103425972.0, + "step": 2708 + }, + { + "epoch": 0.3446126447016919, + "ewc_loss": 0.02463487908244133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2317439541220665e-05, + "grad_norm": 21.807235717773438, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8457376956939697, + "num_tokens": 103466819.0, + "step": 2709 + }, + { + "epoch": 0.3447398549802824, + "ewc_loss": 0.024751342833042145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.237567175849108e-05, + "grad_norm": 21.78669548034668, + "learning_rate": 1e-06, + "loss": 0.517, + "mean_token_accuracy": 0.8387573957443237, + "num_tokens": 103509660.0, + "step": 2710 + }, + { + "epoch": 0.3448670652588729, + "ewc_loss": 0.02468431554734707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.234215778822545e-05, + "grad_norm": 21.849565505981445, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8441867828369141, + "num_tokens": 103544557.0, + "step": 2711 + }, + { + "epoch": 0.3449942755374634, + "ewc_loss": 0.024770457297563553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.238522872881731e-05, + "grad_norm": 21.827219009399414, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8587263822555542, + "num_tokens": 103585132.0, + "step": 2712 + }, + { + "epoch": 0.34512148581605395, + "ewc_loss": 0.024727869778871536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.23639347293647e-05, + "grad_norm": 21.75804901123047, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8578200340270996, + "num_tokens": 103626353.0, + "step": 2713 + }, + { + "epoch": 0.3452486960946444, + "ewc_loss": 0.02470928058028221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2354640603007283e-05, + "grad_norm": 21.802568435668945, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8531648516654968, + "num_tokens": 103659035.0, + "step": 2714 + }, + { + "epoch": 0.34537590637323495, + "ewc_loss": 0.024741513654589653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.23707568491227e-05, + "grad_norm": 21.801897048950195, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8572255373001099, + "num_tokens": 103699497.0, + "step": 2715 + }, + { + "epoch": 0.3455031166518255, + "ewc_loss": 0.024720260873436928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2360130313027184e-05, + "grad_norm": 21.866548538208008, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8379395008087158, + "num_tokens": 103733685.0, + "step": 2716 + }, + { + "epoch": 0.34563032693041595, + "ewc_loss": 0.02474597655236721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2372987839626148e-05, + "grad_norm": 21.81360626220703, + "learning_rate": 1e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8324215412139893, + "num_tokens": 103769399.0, + "step": 2717 + }, + { + "epoch": 0.3457575372090065, + "ewc_loss": 0.024731459096074104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2365729162411299e-05, + "grad_norm": 21.83434295654297, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8570051789283752, + "num_tokens": 103809078.0, + "step": 2718 + }, + { + "epoch": 0.345884747487597, + "ewc_loss": 0.02474384382367134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.237192191183567e-05, + "grad_norm": 21.777503967285156, + "learning_rate": 1e-06, + "loss": 0.413, + 
"mean_token_accuracy": 0.8667691946029663, + "num_tokens": 103846764.0, + "step": 2719 + }, + { + "epoch": 0.3460119577661875, + "ewc_loss": 0.024794012308120728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2397005775710568e-05, + "grad_norm": 21.89150047302246, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8594300150871277, + "num_tokens": 103883089.0, + "step": 2720 + }, + { + "epoch": 0.346139168044778, + "ewc_loss": 0.024719350039958954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2359674656181596e-05, + "grad_norm": 21.708694458007812, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8660078048706055, + "num_tokens": 103915775.0, + "step": 2721 + }, + { + "epoch": 0.34626637832336854, + "ewc_loss": 0.024761226028203964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2380613043205813e-05, + "grad_norm": 21.862138748168945, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8452872037887573, + "num_tokens": 103953703.0, + "step": 2722 + }, + { + "epoch": 0.346393588601959, + "ewc_loss": 0.024787386879324913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.239369339600671e-05, + "grad_norm": 21.776371002197266, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8407249450683594, + "num_tokens": 103990775.0, + "step": 2723 + }, + { + "epoch": 0.34652079888054954, + "ewc_loss": 0.02479289472103119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.239644734596368e-05, + "grad_norm": 21.902990341186523, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8528004288673401, + "num_tokens": 104025927.0, + "step": 2724 + }, + { + "epoch": 0.34664800915914007, + "ewc_loss": 0.024825308471918106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2412654541549273e-05, + "grad_norm": 21.76949119567871, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8408949971199036, + "num_tokens": 104065860.0, + "step": 2725 + }, + { + "epoch": 
0.34677521943773054, + "ewc_loss": 0.024766165763139725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2383083230815828e-05, + "grad_norm": 21.889419555664062, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8527478575706482, + "num_tokens": 104102117.0, + "step": 2726 + }, + { + "epoch": 0.34690242971632107, + "ewc_loss": 0.02483741194009781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.241870631929487e-05, + "grad_norm": 21.83983612060547, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.84662926197052, + "num_tokens": 104140896.0, + "step": 2727 + }, + { + "epoch": 0.3470296399949116, + "ewc_loss": 0.02479715272784233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.239857647306053e-05, + "grad_norm": 21.833620071411133, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8481167554855347, + "num_tokens": 104181355.0, + "step": 2728 + }, + { + "epoch": 0.3471568502735021, + "ewc_loss": 0.024805890396237373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2402944776113145e-05, + "grad_norm": 21.8370418548584, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.851581335067749, + "num_tokens": 104218564.0, + "step": 2729 + }, + { + "epoch": 0.3472840605520926, + "ewc_loss": 0.024807441979646683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2403720575093757e-05, + "grad_norm": 21.854015350341797, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8545935750007629, + "num_tokens": 104256553.0, + "step": 2730 + }, + { + "epoch": 0.3474112708306831, + "ewc_loss": 0.024792660027742386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.239633002114715e-05, + "grad_norm": 21.767240524291992, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8629639148712158, + "num_tokens": 104291150.0, + "step": 2731 + }, + { + "epoch": 0.34753848110927366, + "ewc_loss": 0.024776380509138107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2388190043566283e-05, + "grad_norm": 21.78968620300293, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8420975208282471, + "num_tokens": 104327977.0, + "step": 2732 + }, + { + "epoch": 0.34766569138786413, + "ewc_loss": 0.02482166327536106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.241083191416692e-05, + "grad_norm": 21.76722526550293, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8458684682846069, + "num_tokens": 104365167.0, + "step": 2733 + }, + { + "epoch": 0.34779290166645466, + "ewc_loss": 0.024848800152540207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2424399756127968e-05, + "grad_norm": 21.839479446411133, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8550843000411987, + "num_tokens": 104400304.0, + "step": 2734 + }, + { + "epoch": 0.3479201119450452, + "ewc_loss": 0.02486465498805046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2432327821443323e-05, + "grad_norm": 21.937511444091797, + "learning_rate": 1e-06, + "loss": 0.5284, + "mean_token_accuracy": 0.8347839117050171, + "num_tokens": 104437974.0, + "step": 2735 + }, + { + "epoch": 0.34804732222363566, + "ewc_loss": 0.0248380359262228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2419018275977578e-05, + "grad_norm": 21.848121643066406, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8533430695533752, + "num_tokens": 104474474.0, + "step": 2736 + }, + { + "epoch": 0.3481745325022262, + "ewc_loss": 0.02487386390566826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2436931683623698e-05, + "grad_norm": 21.77329444885254, + "learning_rate": 1e-06, + "loss": 0.5265, + "mean_token_accuracy": 0.8359426259994507, + "num_tokens": 104511004.0, + "step": 2737 + }, + { + "epoch": 0.3483017427808167, + "ewc_loss": 0.02492048777639866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2460243851819541e-05, + "grad_norm": 21.897205352783203, + "learning_rate": 1e-06, + "loss": 0.4994, + 
"mean_token_accuracy": 0.844025731086731, + "num_tokens": 104551889.0, + "step": 2738 + }, + { + "epoch": 0.3484289530594072, + "ewc_loss": 0.024914423003792763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.245721159648383e-05, + "grad_norm": 21.772188186645508, + "learning_rate": 1e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8334823846817017, + "num_tokens": 104591962.0, + "step": 2739 + }, + { + "epoch": 0.3485561633379977, + "ewc_loss": 0.024932119995355606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.246606007043738e-05, + "grad_norm": 21.9415225982666, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8483271598815918, + "num_tokens": 104636201.0, + "step": 2740 + }, + { + "epoch": 0.34868337361658824, + "ewc_loss": 0.02496163547039032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2480817531468347e-05, + "grad_norm": 21.804563522338867, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.849495530128479, + "num_tokens": 104673448.0, + "step": 2741 + }, + { + "epoch": 0.3488105838951787, + "ewc_loss": 0.02495429292321205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2477146810851991e-05, + "grad_norm": 21.969266891479492, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.858515202999115, + "num_tokens": 104708879.0, + "step": 2742 + }, + { + "epoch": 0.34893779417376924, + "ewc_loss": 0.024957876652479172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2478938515414484e-05, + "grad_norm": 21.789995193481445, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8597536087036133, + "num_tokens": 104747073.0, + "step": 2743 + }, + { + "epoch": 0.3490650044523598, + "ewc_loss": 0.024959711357951164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2479855286073871e-05, + "grad_norm": 22.02749252319336, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.849846363067627, + "num_tokens": 104786950.0, + "step": 2744 + }, + { + "epoch": 
0.34919221473095025, + "ewc_loss": 0.024998188018798828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2499093827500474e-05, + "grad_norm": 21.87938690185547, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8547287583351135, + "num_tokens": 104828306.0, + "step": 2745 + }, + { + "epoch": 0.3493194250095408, + "ewc_loss": 0.02484932541847229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.242466260009678e-05, + "grad_norm": 21.918529510498047, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8411195278167725, + "num_tokens": 104867030.0, + "step": 2746 + }, + { + "epoch": 0.3494466352881313, + "ewc_loss": 0.025012750178575516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2506375242082868e-05, + "grad_norm": 21.95234489440918, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8538815975189209, + "num_tokens": 104909182.0, + "step": 2747 + }, + { + "epoch": 0.3495738455667218, + "ewc_loss": 0.02489311434328556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.244655686605256e-05, + "grad_norm": 21.923341751098633, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8461828231811523, + "num_tokens": 104950648.0, + "step": 2748 + }, + { + "epoch": 0.3497010558453123, + "ewc_loss": 0.024943767115473747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2471883565012831e-05, + "grad_norm": 21.913175582885742, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8559002876281738, + "num_tokens": 104994780.0, + "step": 2749 + }, + { + "epoch": 0.34982826612390283, + "ewc_loss": 0.0248451828956604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2422591680660844e-05, + "grad_norm": 21.913555145263672, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8579925298690796, + "num_tokens": 105029469.0, + "step": 2750 + }, + { + "epoch": 0.3499554764024933, + "ewc_loss": 0.024919873103499413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2459936442610342e-05, + "grad_norm": 21.934797286987305, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8444509506225586, + "num_tokens": 105066797.0, + "step": 2751 + }, + { + "epoch": 0.35008268668108383, + "ewc_loss": 0.02489292621612549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2446463188098278e-05, + "grad_norm": 21.959762573242188, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8647276163101196, + "num_tokens": 105100869.0, + "step": 2752 + }, + { + "epoch": 0.35020989695967436, + "ewc_loss": 0.024875320494174957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2437660188879818e-05, + "grad_norm": 21.939542770385742, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8630533814430237, + "num_tokens": 105135409.0, + "step": 2753 + }, + { + "epoch": 0.35033710723826483, + "ewc_loss": 0.024844639003276825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2422319741745014e-05, + "grad_norm": 21.912315368652344, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.856147825717926, + "num_tokens": 105172839.0, + "step": 2754 + }, + { + "epoch": 0.35046431751685536, + "ewc_loss": 0.02488049305975437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.244024679181166e-05, + "grad_norm": 21.896259307861328, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8569660782814026, + "num_tokens": 105216656.0, + "step": 2755 + }, + { + "epoch": 0.3505915277954459, + "ewc_loss": 0.024857331067323685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2428665286279283e-05, + "grad_norm": 21.87002944946289, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8453540205955505, + "num_tokens": 105253740.0, + "step": 2756 + }, + { + "epoch": 0.35071873807403636, + "ewc_loss": 0.02480292320251465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2401461390254553e-05, + "grad_norm": 21.830955505371094, + "learning_rate": 1e-06, + "loss": 0.4587, + 
"mean_token_accuracy": 0.8540281057357788, + "num_tokens": 105290073.0, + "step": 2757 + }, + { + "epoch": 0.3508459483526269, + "ewc_loss": 0.024842221289873123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2421111023286358e-05, + "grad_norm": 21.856727600097656, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8570678234100342, + "num_tokens": 105325933.0, + "step": 2758 + }, + { + "epoch": 0.3509731586312174, + "ewc_loss": 0.0248645581305027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2432278708729427e-05, + "grad_norm": 21.82330894470215, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8539756536483765, + "num_tokens": 105362962.0, + "step": 2759 + }, + { + "epoch": 0.3511003689098079, + "ewc_loss": 0.02484169788658619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2420849088812247e-05, + "grad_norm": 21.82313346862793, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8513597846031189, + "num_tokens": 105410074.0, + "step": 2760 + }, + { + "epoch": 0.3512275791883984, + "ewc_loss": 0.02486122027039528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2430609785951674e-05, + "grad_norm": 21.884761810302734, + "learning_rate": 1e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8369265794754028, + "num_tokens": 105450264.0, + "step": 2761 + }, + { + "epoch": 0.35135478946698895, + "ewc_loss": 0.024877527728676796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2438763405953068e-05, + "grad_norm": 21.915874481201172, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8418713808059692, + "num_tokens": 105486566.0, + "step": 2762 + }, + { + "epoch": 0.3514819997455794, + "ewc_loss": 0.02488032355904579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2440162208804395e-05, + "grad_norm": 21.88360595703125, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8550807237625122, + "num_tokens": 105527185.0, + "step": 2763 + }, + { + "epoch": 
0.35160921002416995, + "ewc_loss": 0.024876099079847336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2438049452612177e-05, + "grad_norm": 21.898134231567383, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8493525981903076, + "num_tokens": 105557401.0, + "step": 2764 + }, + { + "epoch": 0.3517364203027605, + "ewc_loss": 0.02485281229019165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.242640610144008e-05, + "grad_norm": 21.92973518371582, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8561994433403015, + "num_tokens": 105586877.0, + "step": 2765 + }, + { + "epoch": 0.35186363058135095, + "ewc_loss": 0.024893073365092278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2446536857169122e-05, + "grad_norm": 21.940176010131836, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8547666072845459, + "num_tokens": 105626988.0, + "step": 2766 + }, + { + "epoch": 0.3519908408599415, + "ewc_loss": 0.024896923452615738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2448461347958073e-05, + "grad_norm": 21.793060302734375, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8425091505050659, + "num_tokens": 105665796.0, + "step": 2767 + }, + { + "epoch": 0.352118051138532, + "ewc_loss": 0.02489742636680603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2448713277990464e-05, + "grad_norm": 21.86843490600586, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8427125811576843, + "num_tokens": 105708892.0, + "step": 2768 + }, + { + "epoch": 0.3522452614171225, + "ewc_loss": 0.024966169148683548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2483084901759867e-05, + "grad_norm": 22.052696228027344, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.845251202583313, + "num_tokens": 105748221.0, + "step": 2769 + }, + { + "epoch": 0.352372471695713, + "ewc_loss": 0.02490369975566864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2451850125216879e-05, + "grad_norm": 22.049489974975586, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.84937584400177, + "num_tokens": 105784568.0, + "step": 2770 + }, + { + "epoch": 0.35249968197430354, + "ewc_loss": 0.024908291175961494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2454145689844154e-05, + "grad_norm": 21.76751708984375, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8391112089157104, + "num_tokens": 105827223.0, + "step": 2771 + }, + { + "epoch": 0.352626892252894, + "ewc_loss": 0.024919016286730766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2459508070605807e-05, + "grad_norm": 22.218252182006836, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8680343627929688, + "num_tokens": 105871014.0, + "step": 2772 + }, + { + "epoch": 0.35275410253148454, + "ewc_loss": 0.024968473240733147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2484236322052311e-05, + "grad_norm": 21.982166290283203, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8462451696395874, + "num_tokens": 105908995.0, + "step": 2773 + }, + { + "epoch": 0.35288131281007507, + "ewc_loss": 0.02483030967414379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2415154742484447e-05, + "grad_norm": 21.906391143798828, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8444299101829529, + "num_tokens": 105948243.0, + "step": 2774 + }, + { + "epoch": 0.35300852308866554, + "ewc_loss": 0.024922078475356102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2461039659683593e-05, + "grad_norm": 22.048717498779297, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8519719243049622, + "num_tokens": 105983997.0, + "step": 2775 + }, + { + "epoch": 0.35313573336725607, + "ewc_loss": 0.02486804500222206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2434022210072726e-05, + "grad_norm": 21.81151008605957, + "learning_rate": 1e-06, + "loss": 0.5122, + 
"mean_token_accuracy": 0.8371070623397827, + "num_tokens": 106020336.0, + "step": 2776 + }, + { + "epoch": 0.3532629436458466, + "ewc_loss": 0.024872146546840668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2436073120625224e-05, + "grad_norm": 21.920299530029297, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8521220684051514, + "num_tokens": 106053300.0, + "step": 2777 + }, + { + "epoch": 0.35339015392443707, + "ewc_loss": 0.024905851110816002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2452925147954375e-05, + "grad_norm": 21.876510620117188, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8645756840705872, + "num_tokens": 106091790.0, + "step": 2778 + }, + { + "epoch": 0.3535173642030276, + "ewc_loss": 0.02490115538239479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2450577742129099e-05, + "grad_norm": 21.872234344482422, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8548262715339661, + "num_tokens": 106123768.0, + "step": 2779 + }, + { + "epoch": 0.3536445744816181, + "ewc_loss": 0.02495165914297104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2475829862523824e-05, + "grad_norm": 21.868741989135742, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8578215837478638, + "num_tokens": 106158219.0, + "step": 2780 + }, + { + "epoch": 0.35377178476020865, + "ewc_loss": 0.024971434846520424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2485716979426797e-05, + "grad_norm": 21.879478454589844, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8460140228271484, + "num_tokens": 106195795.0, + "step": 2781 + }, + { + "epoch": 0.3538989950387991, + "ewc_loss": 0.024938715621829033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2469357898226008e-05, + "grad_norm": 21.8135986328125, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8518065214157104, + "num_tokens": 106234973.0, + "step": 2782 + }, + { + "epoch": 
0.35402620531738965, + "ewc_loss": 0.02500302344560623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2501512173912488e-05, + "grad_norm": 21.956838607788086, + "learning_rate": 1e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.837042510509491, + "num_tokens": 106267297.0, + "step": 2783 + }, + { + "epoch": 0.3541534155959802, + "ewc_loss": 0.025044851005077362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2522425095085055e-05, + "grad_norm": 21.774621963500977, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8574622869491577, + "num_tokens": 106309070.0, + "step": 2784 + }, + { + "epoch": 0.35428062587457065, + "ewc_loss": 0.0249911118298769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2495555893110577e-05, + "grad_norm": 21.864376068115234, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8545899987220764, + "num_tokens": 106345784.0, + "step": 2785 + }, + { + "epoch": 0.3544078361531612, + "ewc_loss": 0.025073472410440445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2536735994217452e-05, + "grad_norm": 21.937618255615234, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8579207062721252, + "num_tokens": 106383172.0, + "step": 2786 + }, + { + "epoch": 0.3545350464317517, + "ewc_loss": 0.025082135573029518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2541067917481996e-05, + "grad_norm": 21.823205947875977, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8488763570785522, + "num_tokens": 106416916.0, + "step": 2787 + }, + { + "epoch": 0.3546622567103422, + "ewc_loss": 0.02507016994059086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2535085261333734e-05, + "grad_norm": 21.9212646484375, + "learning_rate": 1e-06, + "loss": 0.529, + "mean_token_accuracy": 0.8357589244842529, + "num_tokens": 106456969.0, + "step": 2788 + }, + { + "epoch": 0.3547894669889327, + "ewc_loss": 0.025182431563735008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2591215636348352e-05, + "grad_norm": 22.269248962402344, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8471940755844116, + "num_tokens": 106490476.0, + "step": 2789 + }, + { + "epoch": 0.35491667726752324, + "ewc_loss": 0.02511240914463997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2556204637803603e-05, + "grad_norm": 21.952335357666016, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8512678742408752, + "num_tokens": 106525990.0, + "step": 2790 + }, + { + "epoch": 0.3550438875461137, + "ewc_loss": 0.02501877397298813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2509386579040438e-05, + "grad_norm": 22.097698211669922, + "learning_rate": 1e-06, + "loss": 0.5765, + "mean_token_accuracy": 0.8257785439491272, + "num_tokens": 106562953.0, + "step": 2791 + }, + { + "epoch": 0.35517109782470424, + "ewc_loss": 0.0250861719250679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.254308608622523e-05, + "grad_norm": 21.924156188964844, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.852554440498352, + "num_tokens": 106599142.0, + "step": 2792 + }, + { + "epoch": 0.35529830810329477, + "ewc_loss": 0.025068853050470352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.253442678716965e-05, + "grad_norm": 21.992094039916992, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8620952367782593, + "num_tokens": 106636743.0, + "step": 2793 + }, + { + "epoch": 0.35542551838188524, + "ewc_loss": 0.02513618767261505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2568093552545179e-05, + "grad_norm": 21.991201400756836, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8546026945114136, + "num_tokens": 106678992.0, + "step": 2794 + }, + { + "epoch": 0.35555272866047577, + "ewc_loss": 0.025011952966451645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2505976883403491e-05, + "grad_norm": 21.75122833251953, + "learning_rate": 1e-06, + "loss": 0.4999, + 
"mean_token_accuracy": 0.8419721126556396, + "num_tokens": 106719884.0, + "step": 2795 + }, + { + "epoch": 0.3556799389390663, + "ewc_loss": 0.025113601237535477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2556800356833264e-05, + "grad_norm": 21.973241806030273, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8465346097946167, + "num_tokens": 106755224.0, + "step": 2796 + }, + { + "epoch": 0.35580714921765677, + "ewc_loss": 0.025159070268273354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2579534995893482e-05, + "grad_norm": 21.92703628540039, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8617223501205444, + "num_tokens": 106789366.0, + "step": 2797 + }, + { + "epoch": 0.3559343594962473, + "ewc_loss": 0.025084130465984344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2542065633169841e-05, + "grad_norm": 21.931766510009766, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8410207033157349, + "num_tokens": 106826382.0, + "step": 2798 + }, + { + "epoch": 0.3560615697748378, + "ewc_loss": 0.02509438432753086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2547192454803735e-05, + "grad_norm": 21.950090408325195, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8415027856826782, + "num_tokens": 106860027.0, + "step": 2799 + }, + { + "epoch": 0.3561887800534283, + "ewc_loss": 0.025119014084339142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.255950701306574e-05, + "grad_norm": 21.963069915771484, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8587071895599365, + "num_tokens": 106898684.0, + "step": 2800 + }, + { + "epoch": 0.35631599033201883, + "ewc_loss": 0.025095796212553978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.254789822269231e-05, + "grad_norm": 21.860687255859375, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8518292307853699, + "num_tokens": 106934496.0, + "step": 2801 + }, + { + "epoch": 
0.35644320061060936, + "ewc_loss": 0.02509547397494316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2547737242130097e-05, + "grad_norm": 21.848819732666016, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8646019697189331, + "num_tokens": 106974335.0, + "step": 2802 + }, + { + "epoch": 0.35657041088919983, + "ewc_loss": 0.02512219361960888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.256109680980444e-05, + "grad_norm": 21.969158172607422, + "learning_rate": 1e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8246397972106934, + "num_tokens": 107013914.0, + "step": 2803 + }, + { + "epoch": 0.35669762116779036, + "ewc_loss": 0.025141701102256775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2570850230986252e-05, + "grad_norm": 21.858598709106445, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8553603291511536, + "num_tokens": 107053747.0, + "step": 2804 + }, + { + "epoch": 0.3568248314463809, + "ewc_loss": 0.02510652132332325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2553260603453964e-05, + "grad_norm": 21.933330535888672, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8402136564254761, + "num_tokens": 107095245.0, + "step": 2805 + }, + { + "epoch": 0.35695204172497136, + "ewc_loss": 0.025173362344503403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2586680895765312e-05, + "grad_norm": 21.927045822143555, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8506186008453369, + "num_tokens": 107133627.0, + "step": 2806 + }, + { + "epoch": 0.3570792520035619, + "ewc_loss": 0.02516544982790947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.25827245938126e-05, + "grad_norm": 22.00347328186035, + "learning_rate": 1e-06, + "loss": 0.5448, + "mean_token_accuracy": 0.8264386653900146, + "num_tokens": 107170128.0, + "step": 2807 + }, + { + "epoch": 0.3572064622821524, + "ewc_loss": 0.025144845247268677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2572422747325618e-05, + "grad_norm": 21.90665626525879, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8609309196472168, + "num_tokens": 107209137.0, + "step": 2808 + }, + { + "epoch": 0.3573336725607429, + "ewc_loss": 0.0251078512519598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.255392544408096e-05, + "grad_norm": 21.882244110107422, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8457747101783752, + "num_tokens": 107244866.0, + "step": 2809 + }, + { + "epoch": 0.3574608828393334, + "ewc_loss": 0.025162912905216217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2581456758198328e-05, + "grad_norm": 21.882116317749023, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8492958545684814, + "num_tokens": 107287399.0, + "step": 2810 + }, + { + "epoch": 0.35758809311792394, + "ewc_loss": 0.025150485336780548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2575243090395816e-05, + "grad_norm": 21.98615074157715, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8405236005783081, + "num_tokens": 107325627.0, + "step": 2811 + }, + { + "epoch": 0.3577153033965144, + "ewc_loss": 0.025138888508081436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2569444152177311e-05, + "grad_norm": 21.886552810668945, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.865454375743866, + "num_tokens": 107359375.0, + "step": 2812 + }, + { + "epoch": 0.35784251367510495, + "ewc_loss": 0.025142334401607513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2571167644637171e-05, + "grad_norm": 21.95565414428711, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8391497135162354, + "num_tokens": 107400286.0, + "step": 2813 + }, + { + "epoch": 0.3579697239536955, + "ewc_loss": 0.02516065537929535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2580328075273428e-05, + "grad_norm": 21.919544219970703, + "learning_rate": 1e-06, + "loss": 0.4852, + 
"mean_token_accuracy": 0.8410666584968567, + "num_tokens": 107431768.0, + "step": 2814 + }, + { + "epoch": 0.35809693423228595, + "ewc_loss": 0.02507760562002659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.253880236617988e-05, + "grad_norm": 21.802082061767578, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8599127531051636, + "num_tokens": 107470875.0, + "step": 2815 + }, + { + "epoch": 0.3582241445108765, + "ewc_loss": 0.025140782818198204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2570391845656559e-05, + "grad_norm": 21.970006942749023, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8425056338310242, + "num_tokens": 107510988.0, + "step": 2816 + }, + { + "epoch": 0.358351354789467, + "ewc_loss": 0.0251895934343338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2594796316989232e-05, + "grad_norm": 21.932363510131836, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8592035174369812, + "num_tokens": 107552423.0, + "step": 2817 + }, + { + "epoch": 0.3584785650680575, + "ewc_loss": 0.025169391185045242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2584695468831342e-05, + "grad_norm": 22.063251495361328, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8424822092056274, + "num_tokens": 107591693.0, + "step": 2818 + }, + { + "epoch": 0.358605775346648, + "ewc_loss": 0.025148624554276466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2574312677315902e-05, + "grad_norm": 21.809062957763672, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8549927473068237, + "num_tokens": 107626017.0, + "step": 2819 + }, + { + "epoch": 0.35873298562523853, + "ewc_loss": 0.025132376700639725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2566188161144964e-05, + "grad_norm": 21.90655517578125, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8668705224990845, + "num_tokens": 107664927.0, + "step": 2820 + }, + { + "epoch": 
0.358860195903829, + "ewc_loss": 0.025177009403705597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2588504432642367e-05, + "grad_norm": 21.88542366027832, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8433533906936646, + "num_tokens": 107702487.0, + "step": 2821 + }, + { + "epoch": 0.35898740618241953, + "ewc_loss": 0.025182757526636124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.259137843589997e-05, + "grad_norm": 21.931568145751953, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8634272217750549, + "num_tokens": 107741263.0, + "step": 2822 + }, + { + "epoch": 0.35911461646101006, + "ewc_loss": 0.025164689868688583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.258234533452196e-05, + "grad_norm": 21.921201705932617, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8533822894096375, + "num_tokens": 107781079.0, + "step": 2823 + }, + { + "epoch": 0.35924182673960053, + "ewc_loss": 0.025171611458063126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2585805961862206e-05, + "grad_norm": 21.94776725769043, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8694415092468262, + "num_tokens": 107819309.0, + "step": 2824 + }, + { + "epoch": 0.35936903701819106, + "ewc_loss": 0.02515869401395321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2579346730490215e-05, + "grad_norm": 21.997901916503906, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8529595136642456, + "num_tokens": 107853251.0, + "step": 2825 + }, + { + "epoch": 0.3594962472967816, + "ewc_loss": 0.02520107291638851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.260053613805212e-05, + "grad_norm": 22.111162185668945, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8699040412902832, + "num_tokens": 107893476.0, + "step": 2826 + }, + { + "epoch": 0.35962345757537206, + "ewc_loss": 0.0251474566757679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2573728781717364e-05, + "grad_norm": 21.99009895324707, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8597666621208191, + "num_tokens": 107931204.0, + "step": 2827 + }, + { + "epoch": 0.3597506678539626, + "ewc_loss": 0.02520417422056198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2602086826518644e-05, + "grad_norm": 22.318584442138672, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8327014446258545, + "num_tokens": 107966471.0, + "step": 2828 + }, + { + "epoch": 0.3598778781325531, + "ewc_loss": 0.025144897401332855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.257244912267197e-05, + "grad_norm": 21.91141700744629, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8521924018859863, + "num_tokens": 108005328.0, + "step": 2829 + }, + { + "epoch": 0.36000508841114365, + "ewc_loss": 0.02505861595273018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2529308150988072e-05, + "grad_norm": 22.08476448059082, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8392459750175476, + "num_tokens": 108046295.0, + "step": 2830 + }, + { + "epoch": 0.3601322986897341, + "ewc_loss": 0.025175847113132477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2587923265527934e-05, + "grad_norm": 22.016448974609375, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.852744460105896, + "num_tokens": 108078164.0, + "step": 2831 + }, + { + "epoch": 0.36025950896832465, + "ewc_loss": 0.02503986470401287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2519932170107495e-05, + "grad_norm": 21.90862274169922, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.845978856086731, + "num_tokens": 108117105.0, + "step": 2832 + }, + { + "epoch": 0.3603867192469152, + "ewc_loss": 0.025169041007757187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2584520845848601e-05, + "grad_norm": 21.9256649017334, + "learning_rate": 1e-06, + "loss": 0.4892, + 
"mean_token_accuracy": 0.8477532863616943, + "num_tokens": 108155301.0, + "step": 2833 + }, + { + "epoch": 0.36051392952550565, + "ewc_loss": 0.02515576221048832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2577881534525659e-05, + "grad_norm": 22.06883430480957, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8611730337142944, + "num_tokens": 108193091.0, + "step": 2834 + }, + { + "epoch": 0.3606411398040962, + "ewc_loss": 0.025157691910862923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2578845598909538e-05, + "grad_norm": 21.911291122436523, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8481730222702026, + "num_tokens": 108233719.0, + "step": 2835 + }, + { + "epoch": 0.3607683500826867, + "ewc_loss": 0.025141414254903793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2570707440318074e-05, + "grad_norm": 22.035009384155273, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8568288683891296, + "num_tokens": 108265775.0, + "step": 2836 + }, + { + "epoch": 0.3608955603612772, + "ewc_loss": 0.02514941804111004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2574709217005875e-05, + "grad_norm": 21.86931800842285, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8615281581878662, + "num_tokens": 108307953.0, + "step": 2837 + }, + { + "epoch": 0.3610227706398677, + "ewc_loss": 0.02510424517095089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2552122825582046e-05, + "grad_norm": 21.936687469482422, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8477836847305298, + "num_tokens": 108351998.0, + "step": 2838 + }, + { + "epoch": 0.36114998091845824, + "ewc_loss": 0.025243645533919334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2621822861547116e-05, + "grad_norm": 21.93791961669922, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.838396430015564, + "num_tokens": 108395968.0, + "step": 2839 + }, + { + "epoch": 
0.3612771911970487, + "ewc_loss": 0.025132866576313972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2566433724714443e-05, + "grad_norm": 21.894725799560547, + "learning_rate": 1e-06, + "loss": 0.5412, + "mean_token_accuracy": 0.8318102359771729, + "num_tokens": 108433045.0, + "step": 2840 + }, + { + "epoch": 0.36140440147563924, + "ewc_loss": 0.025242779403924942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2621389942069072e-05, + "grad_norm": 22.04568099975586, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8459600210189819, + "num_tokens": 108470964.0, + "step": 2841 + }, + { + "epoch": 0.36153161175422976, + "ewc_loss": 0.025208793580532074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2604396943061147e-05, + "grad_norm": 21.944406509399414, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8512551784515381, + "num_tokens": 108512572.0, + "step": 2842 + }, + { + "epoch": 0.36165882203282024, + "ewc_loss": 0.025159411132335663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2579705980897415e-05, + "grad_norm": 21.9859619140625, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8356778621673584, + "num_tokens": 108548526.0, + "step": 2843 + }, + { + "epoch": 0.36178603231141077, + "ewc_loss": 0.025227323174476624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2613661965588108e-05, + "grad_norm": 22.00790023803711, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.848877489566803, + "num_tokens": 108588447.0, + "step": 2844 + }, + { + "epoch": 0.3619132425900013, + "ewc_loss": 0.02519809454679489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2599047295225319e-05, + "grad_norm": 22.00019073486328, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8411368131637573, + "num_tokens": 108628426.0, + "step": 2845 + }, + { + "epoch": 0.36204045286859177, + "ewc_loss": 0.025173334404826164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2586667253344785e-05, + "grad_norm": 21.9437198638916, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8454318046569824, + "num_tokens": 108661355.0, + "step": 2846 + }, + { + "epoch": 0.3621676631471823, + "ewc_loss": 0.025219712406396866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.260985663975589e-05, + "grad_norm": 21.96078872680664, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8663438558578491, + "num_tokens": 108698505.0, + "step": 2847 + }, + { + "epoch": 0.3622948734257728, + "ewc_loss": 0.025179419666528702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2589709513122216e-05, + "grad_norm": 22.02630615234375, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8549436330795288, + "num_tokens": 108737409.0, + "step": 2848 + }, + { + "epoch": 0.3624220837043633, + "ewc_loss": 0.025199148803949356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2599574802152347e-05, + "grad_norm": 21.963045120239258, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8516991138458252, + "num_tokens": 108772969.0, + "step": 2849 + }, + { + "epoch": 0.3625492939829538, + "ewc_loss": 0.02518294006586075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2591470294864848e-05, + "grad_norm": 21.94134521484375, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8579306602478027, + "num_tokens": 108805129.0, + "step": 2850 + }, + { + "epoch": 0.36267650426154435, + "ewc_loss": 0.025257179513573647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2628589502128307e-05, + "grad_norm": 21.958263397216797, + "learning_rate": 1e-06, + "loss": 0.5163, + "mean_token_accuracy": 0.836523175239563, + "num_tokens": 108841197.0, + "step": 2851 + }, + { + "epoch": 0.3628037145401348, + "ewc_loss": 0.025274567306041718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2637283361982554e-05, + "grad_norm": 21.97624397277832, + "learning_rate": 1e-06, + "loss": 0.4364, + 
"mean_token_accuracy": 0.8640553951263428, + "num_tokens": 108876442.0, + "step": 2852 + }, + { + "epoch": 0.36293092481872535, + "ewc_loss": 0.025280695408582687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2640347449632827e-05, + "grad_norm": 22.018468856811523, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8353747129440308, + "num_tokens": 108911432.0, + "step": 2853 + }, + { + "epoch": 0.3630581350973159, + "ewc_loss": 0.025267386808991432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2633693586394656e-05, + "grad_norm": 21.908615112304688, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8352757692337036, + "num_tokens": 108952977.0, + "step": 2854 + }, + { + "epoch": 0.36318534537590635, + "ewc_loss": 0.025314949452877045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2657475053856615e-05, + "grad_norm": 22.065502166748047, + "learning_rate": 1e-06, + "loss": 0.5615, + "mean_token_accuracy": 0.8270859122276306, + "num_tokens": 108995602.0, + "step": 2855 + }, + { + "epoch": 0.3633125556544969, + "ewc_loss": 0.025326386094093323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2663193047046661e-05, + "grad_norm": 21.9682674407959, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8529965281486511, + "num_tokens": 109040226.0, + "step": 2856 + }, + { + "epoch": 0.3634397659330874, + "ewc_loss": 0.025270728394389153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2635364328161813e-05, + "grad_norm": 21.977678298950195, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8535270094871521, + "num_tokens": 109082898.0, + "step": 2857 + }, + { + "epoch": 0.3635669762116779, + "ewc_loss": 0.025327596813440323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2663798770518042e-05, + "grad_norm": 22.035375595092773, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8414839506149292, + "num_tokens": 109122460.0, + "step": 2858 + }, + { + "epoch": 
0.3636941864902684, + "ewc_loss": 0.025267286226153374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2633643564186059e-05, + "grad_norm": 21.998397827148438, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8495007753372192, + "num_tokens": 109161680.0, + "step": 2859 + }, + { + "epoch": 0.36382139676885894, + "ewc_loss": 0.025301756337285042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2650878488784656e-05, + "grad_norm": 22.01254653930664, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8690555691719055, + "num_tokens": 109198943.0, + "step": 2860 + }, + { + "epoch": 0.3639486070474494, + "ewc_loss": 0.025314366444945335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2657183106057346e-05, + "grad_norm": 22.016277313232422, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8581860065460205, + "num_tokens": 109235719.0, + "step": 2861 + }, + { + "epoch": 0.36407581732603994, + "ewc_loss": 0.025310093536973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2655046703002881e-05, + "grad_norm": 22.088552474975586, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8425054550170898, + "num_tokens": 109272681.0, + "step": 2862 + }, + { + "epoch": 0.36420302760463047, + "ewc_loss": 0.025305738672614098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2652869372686837e-05, + "grad_norm": 22.05282974243164, + "learning_rate": 1e-06, + "loss": 0.5221, + "mean_token_accuracy": 0.8393962383270264, + "num_tokens": 109306114.0, + "step": 2863 + }, + { + "epoch": 0.36433023788322094, + "ewc_loss": 0.025280309841036797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2640154636756051e-05, + "grad_norm": 22.037321090698242, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8420172929763794, + "num_tokens": 109337728.0, + "step": 2864 + }, + { + "epoch": 0.36445744816181147, + "ewc_loss": 0.025260141119360924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2630070159502793e-05, + "grad_norm": 21.982717514038086, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8406829833984375, + "num_tokens": 109377809.0, + "step": 2865 + }, + { + "epoch": 0.364584658440402, + "ewc_loss": 0.02534566819667816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2672833690885454e-05, + "grad_norm": 22.046316146850586, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8649365901947021, + "num_tokens": 109413499.0, + "step": 2866 + }, + { + "epoch": 0.36471186871899247, + "ewc_loss": 0.025304609909653664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2652304576477036e-05, + "grad_norm": 21.939912796020508, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.840990424156189, + "num_tokens": 109460961.0, + "step": 2867 + }, + { + "epoch": 0.364839078997583, + "ewc_loss": 0.02527744136750698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2638720363611355e-05, + "grad_norm": 21.96131706237793, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8612331748008728, + "num_tokens": 109500615.0, + "step": 2868 + }, + { + "epoch": 0.36496628927617353, + "ewc_loss": 0.025387071073055267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.269353560928721e-05, + "grad_norm": 22.032249450683594, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8529399633407593, + "num_tokens": 109533921.0, + "step": 2869 + }, + { + "epoch": 0.365093499554764, + "ewc_loss": 0.02536972612142563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2684862667811103e-05, + "grad_norm": 21.96141242980957, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.841117799282074, + "num_tokens": 109567069.0, + "step": 2870 + }, + { + "epoch": 0.36522070983335453, + "ewc_loss": 0.02538672462105751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2693361895799171e-05, + "grad_norm": 22.033708572387695, + "learning_rate": 1e-06, + "loss": 0.4401, + 
"mean_token_accuracy": 0.8636513352394104, + "num_tokens": 109603696.0, + "step": 2871 + }, + { + "epoch": 0.36534792011194506, + "ewc_loss": 0.02537260390818119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.268630239792401e-05, + "grad_norm": 21.92401695251465, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8679065704345703, + "num_tokens": 109644590.0, + "step": 2872 + }, + { + "epoch": 0.36547513039053553, + "ewc_loss": 0.025377890095114708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2688945389527362e-05, + "grad_norm": 22.03302764892578, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8460615277290344, + "num_tokens": 109678869.0, + "step": 2873 + }, + { + "epoch": 0.36560234066912606, + "ewc_loss": 0.025430506095290184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2715253433270846e-05, + "grad_norm": 22.01085090637207, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8445771932601929, + "num_tokens": 109722517.0, + "step": 2874 + }, + { + "epoch": 0.3657295509477166, + "ewc_loss": 0.02534361742436886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2671808690356556e-05, + "grad_norm": 21.863697052001953, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8471976518630981, + "num_tokens": 109757869.0, + "step": 2875 + }, + { + "epoch": 0.36585676122630706, + "ewc_loss": 0.02543080970644951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2715405318886042e-05, + "grad_norm": 22.08456039428711, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8459321856498718, + "num_tokens": 109793162.0, + "step": 2876 + }, + { + "epoch": 0.3659839715048976, + "ewc_loss": 0.025403089821338654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2701544619631022e-05, + "grad_norm": 21.963581085205078, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8564668893814087, + "num_tokens": 109832942.0, + "step": 2877 + }, + { + "epoch": 
0.3661111817834881, + "ewc_loss": 0.02539611980319023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2698059435933828e-05, + "grad_norm": 22.026002883911133, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8490152359008789, + "num_tokens": 109876230.0, + "step": 2878 + }, + { + "epoch": 0.3662383920620786, + "ewc_loss": 0.02540217898786068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2701089872280136e-05, + "grad_norm": 22.095626831054688, + "learning_rate": 1e-06, + "loss": 0.5109, + "mean_token_accuracy": 0.8410276770591736, + "num_tokens": 109911844.0, + "step": 2879 + }, + { + "epoch": 0.3663656023406691, + "ewc_loss": 0.025416459888219833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2708230315183755e-05, + "grad_norm": 22.030702590942383, + "learning_rate": 1e-06, + "loss": 0.5382, + "mean_token_accuracy": 0.8386749029159546, + "num_tokens": 109957619.0, + "step": 2880 + }, + { + "epoch": 0.36649281261925964, + "ewc_loss": 0.025354979559779167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.267748939426383e-05, + "grad_norm": 22.05866813659668, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8396486043930054, + "num_tokens": 109999117.0, + "step": 2881 + }, + { + "epoch": 0.3666200228978502, + "ewc_loss": 0.025421570986509323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2710785085801035e-05, + "grad_norm": 22.144168853759766, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8574284911155701, + "num_tokens": 110039557.0, + "step": 2882 + }, + { + "epoch": 0.36674723317644065, + "ewc_loss": 0.025344615802168846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.267230800294783e-05, + "grad_norm": 22.020992279052734, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8426840305328369, + "num_tokens": 110074418.0, + "step": 2883 + }, + { + "epoch": 0.3668744434550312, + "ewc_loss": 0.025366192683577538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.268309642910026e-05, + "grad_norm": 22.023984909057617, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8559310436248779, + "num_tokens": 110109255.0, + "step": 2884 + }, + { + "epoch": 0.3670016537336217, + "ewc_loss": 0.02532886527478695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2664432688325178e-05, + "grad_norm": 21.966909408569336, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8558180332183838, + "num_tokens": 110150180.0, + "step": 2885 + }, + { + "epoch": 0.3671288640122122, + "ewc_loss": 0.025331759825348854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2665879694395699e-05, + "grad_norm": 22.03631019592285, + "learning_rate": 1e-06, + "loss": 0.5317, + "mean_token_accuracy": 0.8338156938552856, + "num_tokens": 110193653.0, + "step": 2886 + }, + { + "epoch": 0.3672560742908027, + "ewc_loss": 0.025368813425302505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2684407010965515e-05, + "grad_norm": 22.007902145385742, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8573257923126221, + "num_tokens": 110231020.0, + "step": 2887 + }, + { + "epoch": 0.36738328456939323, + "ewc_loss": 0.02534567564725876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2672838238358963e-05, + "grad_norm": 22.03829002380371, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.865971028804779, + "num_tokens": 110269397.0, + "step": 2888 + }, + { + "epoch": 0.3675104948479837, + "ewc_loss": 0.02538914792239666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2694574252236634e-05, + "grad_norm": 21.99642562866211, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8661378622055054, + "num_tokens": 110311188.0, + "step": 2889 + }, + { + "epoch": 0.36763770512657423, + "ewc_loss": 0.025301404297351837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2650702046812512e-05, + "grad_norm": 22.1082763671875, + "learning_rate": 1e-06, + "loss": 0.4526, + 
"mean_token_accuracy": 0.8538311719894409, + "num_tokens": 110343801.0, + "step": 2890 + }, + { + "epoch": 0.36776491540516476, + "ewc_loss": 0.025384817272424698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2692408745351713e-05, + "grad_norm": 21.973257064819336, + "learning_rate": 1e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8335078954696655, + "num_tokens": 110385830.0, + "step": 2891 + }, + { + "epoch": 0.36789212568375523, + "ewc_loss": 0.025326237082481384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2663118468481116e-05, + "grad_norm": 22.106367111206055, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8502787351608276, + "num_tokens": 110421494.0, + "step": 2892 + }, + { + "epoch": 0.36801933596234576, + "ewc_loss": 0.025374293327331543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2687146409007255e-05, + "grad_norm": 21.976720809936523, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8493071794509888, + "num_tokens": 110463132.0, + "step": 2893 + }, + { + "epoch": 0.3681465462409363, + "ewc_loss": 0.025377973914146423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2688987226283643e-05, + "grad_norm": 22.11568832397461, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8533778190612793, + "num_tokens": 110498330.0, + "step": 2894 + }, + { + "epoch": 0.36827375651952676, + "ewc_loss": 0.025389352813363075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2694676115643233e-05, + "grad_norm": 22.013916015625, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8598048686981201, + "num_tokens": 110534438.0, + "step": 2895 + }, + { + "epoch": 0.3684009667981173, + "ewc_loss": 0.025366902351379395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2683451132033952e-05, + "grad_norm": 22.117185592651367, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8462367057800293, + "num_tokens": 110572059.0, + "step": 2896 + }, + { + "epoch": 
0.3685281770767078, + "ewc_loss": 0.02534066140651703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2670330761466175e-05, + "grad_norm": 22.041934967041016, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8524404764175415, + "num_tokens": 110611012.0, + "step": 2897 + }, + { + "epoch": 0.3686553873552983, + "ewc_loss": 0.025408484041690826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.270424218091648e-05, + "grad_norm": 22.071523666381836, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8582548499107361, + "num_tokens": 110647429.0, + "step": 2898 + }, + { + "epoch": 0.3687825976338888, + "ewc_loss": 0.025358496233820915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2679248357017059e-05, + "grad_norm": 22.03036880493164, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8441321849822998, + "num_tokens": 110685052.0, + "step": 2899 + }, + { + "epoch": 0.36890980791247935, + "ewc_loss": 0.025368526577949524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2684263310802635e-05, + "grad_norm": 21.961702346801758, + "learning_rate": 1e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8294996619224548, + "num_tokens": 110725222.0, + "step": 2900 + }, + { + "epoch": 0.3690370181910698, + "ewc_loss": 0.025404732674360275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2702366802841425e-05, + "grad_norm": 22.083786010742188, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.839836835861206, + "num_tokens": 110764314.0, + "step": 2901 + }, + { + "epoch": 0.36916422846966035, + "ewc_loss": 0.025392811745405197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2696405974566005e-05, + "grad_norm": 21.878210067749023, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8627160787582397, + "num_tokens": 110803585.0, + "step": 2902 + }, + { + "epoch": 0.3692914387482509, + "ewc_loss": 0.025410402566194534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2705201697826851e-05, + "grad_norm": 22.038515090942383, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8453292846679688, + "num_tokens": 110846532.0, + "step": 2903 + }, + { + "epoch": 0.36941864902684135, + "ewc_loss": 0.0254600178450346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.273000907531241e-05, + "grad_norm": 21.961740493774414, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8475923538208008, + "num_tokens": 110889807.0, + "step": 2904 + }, + { + "epoch": 0.3695458593054319, + "ewc_loss": 0.025476472452282906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2738236364384647e-05, + "grad_norm": 21.99201774597168, + "learning_rate": 1e-06, + "loss": 0.5369, + "mean_token_accuracy": 0.8309849500656128, + "num_tokens": 110930567.0, + "step": 2905 + }, + { + "epoch": 0.3696730695840224, + "ewc_loss": 0.025474337860941887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2737168617604766e-05, + "grad_norm": 22.037973403930664, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8565741777420044, + "num_tokens": 110968725.0, + "step": 2906 + }, + { + "epoch": 0.3698002798626129, + "ewc_loss": 0.025506768375635147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2753383998642676e-05, + "grad_norm": 21.952945709228516, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.874232828617096, + "num_tokens": 111002724.0, + "step": 2907 + }, + { + "epoch": 0.3699274901412034, + "ewc_loss": 0.025480248034000397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2740124475385528e-05, + "grad_norm": 22.00490379333496, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8482381105422974, + "num_tokens": 111036668.0, + "step": 2908 + }, + { + "epoch": 0.37005470041979394, + "ewc_loss": 0.025526821613311768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2763411177729722e-05, + "grad_norm": 22.136632919311523, + "learning_rate": 1e-06, + "loss": 0.5106, + 
"mean_token_accuracy": 0.8425172567367554, + "num_tokens": 111077712.0, + "step": 2909 + }, + { + "epoch": 0.3701819106983844, + "ewc_loss": 0.025480011478066444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2740005331579596e-05, + "grad_norm": 21.958126068115234, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8413150906562805, + "num_tokens": 111118856.0, + "step": 2910 + }, + { + "epoch": 0.37030912097697494, + "ewc_loss": 0.02546255849301815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2731279639410786e-05, + "grad_norm": 21.988927841186523, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8416122198104858, + "num_tokens": 111156712.0, + "step": 2911 + }, + { + "epoch": 0.37043633125556547, + "ewc_loss": 0.025489430874586105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2744715604640078e-05, + "grad_norm": 21.986042022705078, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.851253867149353, + "num_tokens": 111195542.0, + "step": 2912 + }, + { + "epoch": 0.37056354153415594, + "ewc_loss": 0.025566009804606438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2783005331584718e-05, + "grad_norm": 22.060426712036133, + "learning_rate": 1e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8328278064727783, + "num_tokens": 111237827.0, + "step": 2913 + }, + { + "epoch": 0.37069075181274647, + "ewc_loss": 0.02550114504992962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2750572750519495e-05, + "grad_norm": 21.948633193969727, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8585625886917114, + "num_tokens": 111278802.0, + "step": 2914 + }, + { + "epoch": 0.370817962091337, + "ewc_loss": 0.025506991893053055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2753495866490994e-05, + "grad_norm": 22.057844161987305, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.848371684551239, + "num_tokens": 111313795.0, + "step": 2915 + }, + { + "epoch": 
0.37094517236992747, + "ewc_loss": 0.025550032034516335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2775016330124345e-05, + "grad_norm": 22.024993896484375, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8565959930419922, + "num_tokens": 111353798.0, + "step": 2916 + }, + { + "epoch": 0.371072382648518, + "ewc_loss": 0.025526951998472214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2763475751853548e-05, + "grad_norm": 21.991107940673828, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8506020307540894, + "num_tokens": 111390613.0, + "step": 2917 + }, + { + "epoch": 0.3711995929271085, + "ewc_loss": 0.02553809992969036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2769050044880714e-05, + "grad_norm": 21.96217155456543, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8508597612380981, + "num_tokens": 111432043.0, + "step": 2918 + }, + { + "epoch": 0.371326803205699, + "ewc_loss": 0.025575906038284302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2787952982762363e-05, + "grad_norm": 22.089359283447266, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.856610894203186, + "num_tokens": 111476575.0, + "step": 2919 + }, + { + "epoch": 0.3714540134842895, + "ewc_loss": 0.02552986331284046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2764931852871086e-05, + "grad_norm": 21.95195960998535, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8493105173110962, + "num_tokens": 111515304.0, + "step": 2920 + }, + { + "epoch": 0.37158122376288005, + "ewc_loss": 0.025593100115656853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2796550436178222e-05, + "grad_norm": 22.204330444335938, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8599250316619873, + "num_tokens": 111550246.0, + "step": 2921 + }, + { + "epoch": 0.3717084340414705, + "ewc_loss": 0.025564691051840782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2782345947925933e-05, + "grad_norm": 21.96221351623535, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8522336483001709, + "num_tokens": 111592189.0, + "step": 2922 + }, + { + "epoch": 0.37183564432006105, + "ewc_loss": 0.025485722348093987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2742861144943163e-05, + "grad_norm": 22.09035873413086, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8535743951797485, + "num_tokens": 111638177.0, + "step": 2923 + }, + { + "epoch": 0.3719628545986516, + "ewc_loss": 0.025569897145032883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2784948921762407e-05, + "grad_norm": 22.015892028808594, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8527206778526306, + "num_tokens": 111674365.0, + "step": 2924 + }, + { + "epoch": 0.37209006487724205, + "ewc_loss": 0.025498947128653526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2749473171425052e-05, + "grad_norm": 21.98456573486328, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8457950353622437, + "num_tokens": 111714571.0, + "step": 2925 + }, + { + "epoch": 0.3722172751558326, + "ewc_loss": 0.025588063523173332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2794032045349013e-05, + "grad_norm": 22.05237579345703, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8506213426589966, + "num_tokens": 111755214.0, + "step": 2926 + }, + { + "epoch": 0.3723444854344231, + "ewc_loss": 0.02554551512002945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.277275714528514e-05, + "grad_norm": 21.97129249572754, + "learning_rate": 1e-06, + "loss": 0.5624, + "mean_token_accuracy": 0.8237453699111938, + "num_tokens": 111796516.0, + "step": 2927 + }, + { + "epoch": 0.3724716957130136, + "ewc_loss": 0.025592636317014694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.279631851502927e-05, + "grad_norm": 22.10813331604004, + "learning_rate": 1e-06, + "loss": 0.5173, + 
"mean_token_accuracy": 0.8357807397842407, + "num_tokens": 111832615.0, + "step": 2928 + }, + { + "epoch": 0.3725989059916041, + "ewc_loss": 0.025573596358299255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2786797924491111e-05, + "grad_norm": 22.060333251953125, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8406928777694702, + "num_tokens": 111874682.0, + "step": 2929 + }, + { + "epoch": 0.37272611627019464, + "ewc_loss": 0.02552030235528946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2760151548718568e-05, + "grad_norm": 22.048200607299805, + "learning_rate": 1e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8386673927307129, + "num_tokens": 111916369.0, + "step": 2930 + }, + { + "epoch": 0.37285332654878517, + "ewc_loss": 0.02557016722857952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.278508352697827e-05, + "grad_norm": 21.996557235717773, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8418090343475342, + "num_tokens": 111950671.0, + "step": 2931 + }, + { + "epoch": 0.37298053682737564, + "ewc_loss": 0.025499597191810608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2749798770528287e-05, + "grad_norm": 22.071861267089844, + "learning_rate": 1e-06, + "loss": 0.5291, + "mean_token_accuracy": 0.8330053091049194, + "num_tokens": 111988846.0, + "step": 2932 + }, + { + "epoch": 0.37310774710596617, + "ewc_loss": 0.02558683604001999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2793418136425316e-05, + "grad_norm": 22.08254051208496, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.858710765838623, + "num_tokens": 112022853.0, + "step": 2933 + }, + { + "epoch": 0.3732349573845567, + "ewc_loss": 0.02555161528289318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2775807590514887e-05, + "grad_norm": 21.98550033569336, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8458335399627686, + "num_tokens": 112065631.0, + "step": 2934 + }, + { + "epoch": 
0.37336216766314717, + "ewc_loss": 0.025543540716171265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2771770343533717e-05, + "grad_norm": 22.00960350036621, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8395088911056519, + "num_tokens": 112107236.0, + "step": 2935 + }, + { + "epoch": 0.3734893779417377, + "ewc_loss": 0.025612251833081245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2806125596398488e-05, + "grad_norm": 22.03276824951172, + "learning_rate": 1e-06, + "loss": 0.543, + "mean_token_accuracy": 0.8277080059051514, + "num_tokens": 112153914.0, + "step": 2936 + }, + { + "epoch": 0.3736165882203282, + "ewc_loss": 0.02559293992817402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2796470400644466e-05, + "grad_norm": 22.031953811645508, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8524438738822937, + "num_tokens": 112187596.0, + "step": 2937 + }, + { + "epoch": 0.3737437984989187, + "ewc_loss": 0.025661760941147804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2830880223191343e-05, + "grad_norm": 22.11697769165039, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8465248346328735, + "num_tokens": 112222409.0, + "step": 2938 + }, + { + "epoch": 0.37387100877750923, + "ewc_loss": 0.025643210858106613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2821605196222663e-05, + "grad_norm": 22.081825256347656, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8371974229812622, + "num_tokens": 112257699.0, + "step": 2939 + }, + { + "epoch": 0.37399821905609976, + "ewc_loss": 0.025621458888053894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2810729458578862e-05, + "grad_norm": 22.110445022583008, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8586423993110657, + "num_tokens": 112292431.0, + "step": 2940 + }, + { + "epoch": 0.37412542933469023, + "ewc_loss": 0.02560843713581562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.280421838600887e-05, + "grad_norm": 22.073869705200195, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8512877225875854, + "num_tokens": 112331241.0, + "step": 2941 + }, + { + "epoch": 0.37425263961328076, + "ewc_loss": 0.025643927976489067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2821963537135161e-05, + "grad_norm": 22.051597595214844, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8463195562362671, + "num_tokens": 112368486.0, + "step": 2942 + }, + { + "epoch": 0.3743798498918713, + "ewc_loss": 0.025662682950496674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2831341337005142e-05, + "grad_norm": 22.023649215698242, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8404780030250549, + "num_tokens": 112409638.0, + "step": 2943 + }, + { + "epoch": 0.37450706017046176, + "ewc_loss": 0.02563803642988205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.281901859329082e-05, + "grad_norm": 21.9744873046875, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8458681106567383, + "num_tokens": 112448257.0, + "step": 2944 + }, + { + "epoch": 0.3746342704490523, + "ewc_loss": 0.02568533644080162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2842668184021022e-05, + "grad_norm": 22.01130485534668, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8643467426300049, + "num_tokens": 112488739.0, + "step": 2945 + }, + { + "epoch": 0.3747614807276428, + "ewc_loss": 0.025703707709908485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2851854080508929e-05, + "grad_norm": 22.080440521240234, + "learning_rate": 1e-06, + "loss": 0.5253, + "mean_token_accuracy": 0.8355099558830261, + "num_tokens": 112528621.0, + "step": 2946 + }, + { + "epoch": 0.3748886910062333, + "ewc_loss": 0.02569105476140976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2845527635363396e-05, + "grad_norm": 21.984106063842773, + "learning_rate": 1e-06, + "loss": 0.504, + 
"mean_token_accuracy": 0.8440123796463013, + "num_tokens": 112564615.0, + "step": 2947 + }, + { + "epoch": 0.3750159012848238, + "ewc_loss": 0.0257217139005661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2860857168561779e-05, + "grad_norm": 22.16638946533203, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8382300138473511, + "num_tokens": 112598190.0, + "step": 2948 + }, + { + "epoch": 0.37514311156341434, + "ewc_loss": 0.025700215250253677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2850107850681525e-05, + "grad_norm": 22.055450439453125, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.849345326423645, + "num_tokens": 112632078.0, + "step": 2949 + }, + { + "epoch": 0.3752703218420048, + "ewc_loss": 0.025659378618001938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2829689694626722e-05, + "grad_norm": 22.03160858154297, + "learning_rate": 1e-06, + "loss": 0.5511, + "mean_token_accuracy": 0.8273180723190308, + "num_tokens": 112667848.0, + "step": 2950 + }, + { + "epoch": 0.37539753212059535, + "ewc_loss": 0.025694217532873154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2847108337155078e-05, + "grad_norm": 22.045238494873047, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8473905920982361, + "num_tokens": 112707037.0, + "step": 2951 + }, + { + "epoch": 0.3755247423991859, + "ewc_loss": 0.025767533108592033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2883766430604737e-05, + "grad_norm": 22.04151153564453, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8588786125183105, + "num_tokens": 112751921.0, + "step": 2952 + }, + { + "epoch": 0.37565195267777635, + "ewc_loss": 0.025668609887361526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.283430538023822e-05, + "grad_norm": 22.07782554626465, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8536170721054077, + "num_tokens": 112793061.0, + "step": 2953 + }, + { + "epoch": 
0.3757791629563669, + "ewc_loss": 0.02571895718574524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2859478374593891e-05, + "grad_norm": 22.0637264251709, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8589871525764465, + "num_tokens": 112833852.0, + "step": 2954 + }, + { + "epoch": 0.3759063732349574, + "ewc_loss": 0.02565867081284523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2829334991693031e-05, + "grad_norm": 22.076162338256836, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8457934856414795, + "num_tokens": 112866718.0, + "step": 2955 + }, + { + "epoch": 0.3760335835135479, + "ewc_loss": 0.02573704719543457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2868523299403023e-05, + "grad_norm": 22.062990188598633, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8506919145584106, + "num_tokens": 112909420.0, + "step": 2956 + }, + { + "epoch": 0.3761607937921384, + "ewc_loss": 0.025720786303281784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2860393326263875e-05, + "grad_norm": 22.13241195678711, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8517311811447144, + "num_tokens": 112944570.0, + "step": 2957 + }, + { + "epoch": 0.37628800407072893, + "ewc_loss": 0.02575581520795822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2877907465735916e-05, + "grad_norm": 22.263887405395508, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8633279204368591, + "num_tokens": 112984107.0, + "step": 2958 + }, + { + "epoch": 0.3764152143493194, + "ewc_loss": 0.025710398331284523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.285519920202205e-05, + "grad_norm": 22.029279708862305, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8722994327545166, + "num_tokens": 113023066.0, + "step": 2959 + }, + { + "epoch": 0.37654242462790993, + "ewc_loss": 0.025631504133343697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2815751688322052e-05, + "grad_norm": 22.01091766357422, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.84754478931427, + "num_tokens": 113060093.0, + "step": 2960 + }, + { + "epoch": 0.37666963490650046, + "ewc_loss": 0.025736242532730103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.286812130274484e-05, + "grad_norm": 22.195852279663086, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8422850370407104, + "num_tokens": 113095940.0, + "step": 2961 + }, + { + "epoch": 0.37679684518509093, + "ewc_loss": 0.025615354999899864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2807677194359712e-05, + "grad_norm": 21.97165870666504, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8608113527297974, + "num_tokens": 113130699.0, + "step": 2962 + }, + { + "epoch": 0.37692405546368146, + "ewc_loss": 0.02565254457294941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2826272723032162e-05, + "grad_norm": 22.063541412353516, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8501605987548828, + "num_tokens": 113169145.0, + "step": 2963 + }, + { + "epoch": 0.377051265742272, + "ewc_loss": 0.02573237754404545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2866188626503572e-05, + "grad_norm": 22.04715347290039, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8648864030838013, + "num_tokens": 113201987.0, + "step": 2964 + }, + { + "epoch": 0.37717847602086246, + "ewc_loss": 0.02569478750228882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2847393918491434e-05, + "grad_norm": 22.007797241210938, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8555817008018494, + "num_tokens": 113239588.0, + "step": 2965 + }, + { + "epoch": 0.377305686299453, + "ewc_loss": 0.025733154267072678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2866576980741229e-05, + "grad_norm": 22.00494956970215, + "learning_rate": 1e-06, + "loss": 0.4827, + 
"mean_token_accuracy": 0.8512275815010071, + "num_tokens": 113277075.0, + "step": 2966 + }, + { + "epoch": 0.3774328965780435, + "ewc_loss": 0.02572561427950859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.286280712520238e-05, + "grad_norm": 21.96261978149414, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.877703845500946, + "num_tokens": 113313708.0, + "step": 2967 + }, + { + "epoch": 0.377560106856634, + "ewc_loss": 0.025769885629415512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.288494240725413e-05, + "grad_norm": 22.14144515991211, + "learning_rate": 1e-06, + "loss": 0.5371, + "mean_token_accuracy": 0.8315429091453552, + "num_tokens": 113344027.0, + "step": 2968 + }, + { + "epoch": 0.3776873171352245, + "ewc_loss": 0.025813283398747444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.290664204134373e-05, + "grad_norm": 22.051036834716797, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.85548335313797, + "num_tokens": 113388444.0, + "step": 2969 + }, + { + "epoch": 0.37781452741381505, + "ewc_loss": 0.02574356086552143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2871780199930072e-05, + "grad_norm": 22.129072189331055, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8456303477287292, + "num_tokens": 113428660.0, + "step": 2970 + }, + { + "epoch": 0.3779417376924055, + "ewc_loss": 0.025789177045226097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2894588508061133e-05, + "grad_norm": 22.161060333251953, + "learning_rate": 1e-06, + "loss": 0.5455, + "mean_token_accuracy": 0.8290736079216003, + "num_tokens": 113469160.0, + "step": 2971 + }, + { + "epoch": 0.37806894797099605, + "ewc_loss": 0.02571425773203373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2857129149779212e-05, + "grad_norm": 22.01447105407715, + "learning_rate": 1e-06, + "loss": 0.5474, + "mean_token_accuracy": 0.8254176378250122, + "num_tokens": 113504291.0, + "step": 2972 + }, + { + "epoch": 
0.3781961582495866, + "ewc_loss": 0.025781966745853424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2890983271063305e-05, + "grad_norm": 22.203968048095703, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8521966934204102, + "num_tokens": 113548085.0, + "step": 2973 + }, + { + "epoch": 0.37832336852817705, + "ewc_loss": 0.025800760835409164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2900380170322023e-05, + "grad_norm": 22.075878143310547, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8542281985282898, + "num_tokens": 113581664.0, + "step": 2974 + }, + { + "epoch": 0.3784505788067676, + "ewc_loss": 0.02571515552699566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2857577530667186e-05, + "grad_norm": 22.023962020874023, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8630028963088989, + "num_tokens": 113618552.0, + "step": 2975 + }, + { + "epoch": 0.3785777890853581, + "ewc_loss": 0.025832507759332657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2916253581352066e-05, + "grad_norm": 22.17824935913086, + "learning_rate": 1e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.8294208645820618, + "num_tokens": 113653406.0, + "step": 2976 + }, + { + "epoch": 0.3787049993639486, + "ewc_loss": 0.02584311179816723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2921555935463402e-05, + "grad_norm": 22.163246154785156, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8583254814147949, + "num_tokens": 113693803.0, + "step": 2977 + }, + { + "epoch": 0.3788322096425391, + "ewc_loss": 0.025782069191336632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2891034202766605e-05, + "grad_norm": 22.11159324645996, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8641083240509033, + "num_tokens": 113734637.0, + "step": 2978 + }, + { + "epoch": 0.37895941992112964, + "ewc_loss": 0.025803694501519203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2901847185275983e-05, + "grad_norm": 22.046443939208984, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8429874181747437, + "num_tokens": 113776579.0, + "step": 2979 + }, + { + "epoch": 0.37908663019972016, + "ewc_loss": 0.0258023664355278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2901183254143689e-05, + "grad_norm": 22.12689971923828, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8392552733421326, + "num_tokens": 113818857.0, + "step": 2980 + }, + { + "epoch": 0.37921384047831064, + "ewc_loss": 0.025803983211517334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2901991794933565e-05, + "grad_norm": 22.098819732666016, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8415802121162415, + "num_tokens": 113853248.0, + "step": 2981 + }, + { + "epoch": 0.37934105075690117, + "ewc_loss": 0.025769641622900963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2884820534964092e-05, + "grad_norm": 22.080596923828125, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8568737506866455, + "num_tokens": 113894206.0, + "step": 2982 + }, + { + "epoch": 0.3794682610354917, + "ewc_loss": 0.02584107406437397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2920537301397417e-05, + "grad_norm": 22.113325119018555, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8574398159980774, + "num_tokens": 113931309.0, + "step": 2983 + }, + { + "epoch": 0.37959547131408217, + "ewc_loss": 0.025812583044171333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2906291885883547e-05, + "grad_norm": 22.174028396606445, + "learning_rate": 1e-06, + "loss": 0.5063, + "mean_token_accuracy": 0.8401530981063843, + "num_tokens": 113976559.0, + "step": 2984 + }, + { + "epoch": 0.3797226815926727, + "ewc_loss": 0.025811025872826576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.290551335841883e-05, + "grad_norm": 22.20901107788086, + "learning_rate": 1e-06, + "loss": 0.4773, + 
"mean_token_accuracy": 0.8487658500671387, + "num_tokens": 114010814.0, + "step": 2985 + }, + { + "epoch": 0.3798498918712632, + "ewc_loss": 0.025780264288187027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2890131984022446e-05, + "grad_norm": 22.09598731994629, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8693852424621582, + "num_tokens": 114049389.0, + "step": 2986 + }, + { + "epoch": 0.3799771021498537, + "ewc_loss": 0.025820160284638405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2910079931316432e-05, + "grad_norm": 22.392061233520508, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8679488897323608, + "num_tokens": 114081463.0, + "step": 2987 + }, + { + "epoch": 0.3801043124284442, + "ewc_loss": 0.02576005645096302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.288002840738045e-05, + "grad_norm": 22.088476181030273, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8663216233253479, + "num_tokens": 114117175.0, + "step": 2988 + }, + { + "epoch": 0.38023152270703475, + "ewc_loss": 0.025708923116326332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2854461601818912e-05, + "grad_norm": 22.18195343017578, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8396470546722412, + "num_tokens": 114153764.0, + "step": 2989 + }, + { + "epoch": 0.3803587329856252, + "ewc_loss": 0.025770153850317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2885077012469992e-05, + "grad_norm": 22.2108154296875, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8503060340881348, + "num_tokens": 114191487.0, + "step": 2990 + }, + { + "epoch": 0.38048594326421575, + "ewc_loss": 0.025716830044984818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2858415175287519e-05, + "grad_norm": 22.125242233276367, + "learning_rate": 1e-06, + "loss": 0.5483, + "mean_token_accuracy": 0.8266131281852722, + "num_tokens": 114233880.0, + "step": 2991 + }, + { + "epoch": 
0.3806131535428063, + "ewc_loss": 0.02575795352458954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2878977031505201e-05, + "grad_norm": 22.0849666595459, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8490535616874695, + "num_tokens": 114269238.0, + "step": 2992 + }, + { + "epoch": 0.38074036382139675, + "ewc_loss": 0.025792984291911125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2896492080471944e-05, + "grad_norm": 22.28354263305664, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8622233867645264, + "num_tokens": 114307645.0, + "step": 2993 + }, + { + "epoch": 0.3808675740999873, + "ewc_loss": 0.02577085979282856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.288542989641428e-05, + "grad_norm": 21.929140090942383, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8548468947410583, + "num_tokens": 114341800.0, + "step": 2994 + }, + { + "epoch": 0.3809947843785778, + "ewc_loss": 0.02576305903494358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2881529073638376e-05, + "grad_norm": 22.28673553466797, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8581803441047668, + "num_tokens": 114378363.0, + "step": 2995 + }, + { + "epoch": 0.3811219946571683, + "ewc_loss": 0.025831053033471107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.291552689508535e-05, + "grad_norm": 22.043806076049805, + "learning_rate": 1e-06, + "loss": 0.514, + "mean_token_accuracy": 0.8397879600524902, + "num_tokens": 114414489.0, + "step": 2996 + }, + { + "epoch": 0.3812492049357588, + "ewc_loss": 0.025735313072800636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2867656550952233e-05, + "grad_norm": 22.16762351989746, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8454084992408752, + "num_tokens": 114449197.0, + "step": 2997 + }, + { + "epoch": 0.38137641521434934, + "ewc_loss": 0.025900322943925858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2950161362823565e-05, + "grad_norm": 22.26729393005371, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8477388620376587, + "num_tokens": 114478654.0, + "step": 2998 + }, + { + "epoch": 0.3815036254929398, + "ewc_loss": 0.025830741971731186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2915371371491347e-05, + "grad_norm": 22.17548370361328, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8589428067207336, + "num_tokens": 114521807.0, + "step": 2999 + }, + { + "epoch": 0.38163083577153034, + "ewc_loss": 0.02584308758378029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.292154411203228e-05, + "grad_norm": 22.237213134765625, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8641766309738159, + "num_tokens": 114558297.0, + "step": 3000 + }, + { + "epoch": 0.38175804605012087, + "ewc_loss": 0.02578705921769142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.289352985622827e-05, + "grad_norm": 21.985986709594727, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8565499782562256, + "num_tokens": 114599630.0, + "step": 3001 + }, + { + "epoch": 0.38188525632871134, + "ewc_loss": 0.02582498826086521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2912493730254937e-05, + "grad_norm": 22.155256271362305, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.848833441734314, + "num_tokens": 114638697.0, + "step": 3002 + }, + { + "epoch": 0.38201246660730187, + "ewc_loss": 0.02588377147912979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2941885870532133e-05, + "grad_norm": 22.06612777709961, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.851109504699707, + "num_tokens": 114676106.0, + "step": 3003 + }, + { + "epoch": 0.3821396768858924, + "ewc_loss": 0.02584727481007576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.292363776883576e-05, + "grad_norm": 22.115985870361328, + "learning_rate": 1e-06, + "loss": 0.4728, + 
"mean_token_accuracy": 0.8489433526992798, + "num_tokens": 114712026.0, + "step": 3004 + }, + { + "epoch": 0.38226688716448287, + "ewc_loss": 0.025962140411138535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2981070540263318e-05, + "grad_norm": 22.17192268371582, + "learning_rate": 1e-06, + "loss": 0.5431, + "mean_token_accuracy": 0.8281111121177673, + "num_tokens": 114752348.0, + "step": 3005 + }, + { + "epoch": 0.3823940974430734, + "ewc_loss": 0.025880562141537666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2940280612383503e-05, + "grad_norm": 22.09502601623535, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8549553155899048, + "num_tokens": 114789922.0, + "step": 3006 + }, + { + "epoch": 0.38252130772166393, + "ewc_loss": 0.025917863473296165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2958931620232761e-05, + "grad_norm": 22.13364028930664, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8450653553009033, + "num_tokens": 114838834.0, + "step": 3007 + }, + { + "epoch": 0.3826485180002544, + "ewc_loss": 0.02591145969927311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.295573019888252e-05, + "grad_norm": 22.146421432495117, + "learning_rate": 1e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8290387392044067, + "num_tokens": 114873320.0, + "step": 3008 + }, + { + "epoch": 0.38277572827884493, + "ewc_loss": 0.025900574401021004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.295028687309241e-05, + "grad_norm": 22.039873123168945, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.862113893032074, + "num_tokens": 114908293.0, + "step": 3009 + }, + { + "epoch": 0.38290293855743546, + "ewc_loss": 0.02591104432940483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2955521924595814e-05, + "grad_norm": 22.134307861328125, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8403380513191223, + "num_tokens": 114950083.0, + "step": 3010 + }, + { + "epoch": 
0.38303014883602593, + "ewc_loss": 0.025950096547603607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2975047866348177e-05, + "grad_norm": 22.12766456604004, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.854282021522522, + "num_tokens": 114982353.0, + "step": 3011 + }, + { + "epoch": 0.38315735911461646, + "ewc_loss": 0.025908606126904488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2954303201695438e-05, + "grad_norm": 22.084991455078125, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8560630679130554, + "num_tokens": 115017668.0, + "step": 3012 + }, + { + "epoch": 0.383284569393207, + "ewc_loss": 0.025934310629963875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2967155271326192e-05, + "grad_norm": 22.089275360107422, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8622429966926575, + "num_tokens": 115055266.0, + "step": 3013 + }, + { + "epoch": 0.38341177967179746, + "ewc_loss": 0.025959059596061707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2979529856238514e-05, + "grad_norm": 22.11307716369629, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8454346060752869, + "num_tokens": 115089728.0, + "step": 3014 + }, + { + "epoch": 0.383538989950388, + "ewc_loss": 0.025951245799660683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2975622666999698e-05, + "grad_norm": 22.105161666870117, + "learning_rate": 1e-06, + "loss": 0.5175, + "mean_token_accuracy": 0.8355451822280884, + "num_tokens": 115135192.0, + "step": 3015 + }, + { + "epoch": 0.3836662002289785, + "ewc_loss": 0.025950243696570396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.297512153541902e-05, + "grad_norm": 22.146312713623047, + "learning_rate": 1e-06, + "loss": 0.5362, + "mean_token_accuracy": 0.8334670066833496, + "num_tokens": 115170350.0, + "step": 3016 + }, + { + "epoch": 0.383793410507569, + "ewc_loss": 0.025958316400647163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.2979157872905489e-05, + "grad_norm": 22.15958595275879, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8577277064323425, + "num_tokens": 115202724.0, + "step": 3017 + }, + { + "epoch": 0.3839206207861595, + "ewc_loss": 0.0259542316198349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2977116057300009e-05, + "grad_norm": 22.1579647064209, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8498749732971191, + "num_tokens": 115239274.0, + "step": 3018 + }, + { + "epoch": 0.38404783106475004, + "ewc_loss": 0.02595694176852703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.297847120440565e-05, + "grad_norm": 22.140399932861328, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8593446016311646, + "num_tokens": 115278330.0, + "step": 3019 + }, + { + "epoch": 0.3841750413433405, + "ewc_loss": 0.02594676800072193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2973384400538635e-05, + "grad_norm": 22.1237850189209, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8457894325256348, + "num_tokens": 115321258.0, + "step": 3020 + }, + { + "epoch": 0.38430225162193105, + "ewc_loss": 0.025966554880142212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2983276974409819e-05, + "grad_norm": 22.090187072753906, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8521077036857605, + "num_tokens": 115362609.0, + "step": 3021 + }, + { + "epoch": 0.3844294619005216, + "ewc_loss": 0.025950590148568153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.297529524890706e-05, + "grad_norm": 22.10490608215332, + "learning_rate": 1e-06, + "loss": 0.5683, + "mean_token_accuracy": 0.8240301012992859, + "num_tokens": 115405814.0, + "step": 3022 + }, + { + "epoch": 0.38455667217911205, + "ewc_loss": 0.02598610520362854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2993052223464474e-05, + "grad_norm": 22.140207290649414, + "learning_rate": 1e-06, + "loss": 0.4934, + 
"mean_token_accuracy": 0.844534158706665, + "num_tokens": 115440856.0, + "step": 3023 + }, + { + "epoch": 0.3846838824577026, + "ewc_loss": 0.02600354515016079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3001772458665073e-05, + "grad_norm": 22.115787506103516, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8561109900474548, + "num_tokens": 115482582.0, + "step": 3024 + }, + { + "epoch": 0.3848110927362931, + "ewc_loss": 0.025996705517172813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2998352758586407e-05, + "grad_norm": 22.148393630981445, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8421617150306702, + "num_tokens": 115527268.0, + "step": 3025 + }, + { + "epoch": 0.3849383030148836, + "ewc_loss": 0.02599756047129631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.299878022109624e-05, + "grad_norm": 22.12622833251953, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8453831076622009, + "num_tokens": 115563757.0, + "step": 3026 + }, + { + "epoch": 0.3850655132934741, + "ewc_loss": 0.026007024571299553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3003512322029565e-05, + "grad_norm": 22.199047088623047, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8536689281463623, + "num_tokens": 115599629.0, + "step": 3027 + }, + { + "epoch": 0.38519272357206463, + "ewc_loss": 0.026009799912571907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.300490021094447e-05, + "grad_norm": 22.142913818359375, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8535031080245972, + "num_tokens": 115634261.0, + "step": 3028 + }, + { + "epoch": 0.3853199338506551, + "ewc_loss": 0.02597789093852043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.298894585488597e-05, + "grad_norm": 22.102705001831055, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8623180389404297, + "num_tokens": 115671150.0, + "step": 3029 + }, + { + "epoch": 
0.38544714412924563, + "ewc_loss": 0.025979401543736458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.298970073548844e-05, + "grad_norm": 22.098772048950195, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8643717765808105, + "num_tokens": 115714328.0, + "step": 3030 + }, + { + "epoch": 0.38557435440783616, + "ewc_loss": 0.02599586918950081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2997934391023591e-05, + "grad_norm": 22.2288818359375, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8662729263305664, + "num_tokens": 115747118.0, + "step": 3031 + }, + { + "epoch": 0.3857015646864267, + "ewc_loss": 0.025992564857006073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2996282748645172e-05, + "grad_norm": 22.18819808959961, + "learning_rate": 1e-06, + "loss": 0.5402, + "mean_token_accuracy": 0.8302109241485596, + "num_tokens": 115790309.0, + "step": 3032 + }, + { + "epoch": 0.38582877496501716, + "ewc_loss": 0.026027444750070572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3013722309551667e-05, + "grad_norm": 22.279802322387695, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8624821305274963, + "num_tokens": 115825630.0, + "step": 3033 + }, + { + "epoch": 0.3859559852436077, + "ewc_loss": 0.026002297177910805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3001148545299657e-05, + "grad_norm": 22.070621490478516, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8578238487243652, + "num_tokens": 115858848.0, + "step": 3034 + }, + { + "epoch": 0.3860831955221982, + "ewc_loss": 0.025990361347794533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2995180441066623e-05, + "grad_norm": 22.27900505065918, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8586224913597107, + "num_tokens": 115900024.0, + "step": 3035 + }, + { + "epoch": 0.3862104058007887, + "ewc_loss": 0.026032332330942154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3016166121815331e-05, + "grad_norm": 22.14787483215332, + "learning_rate": 1e-06, + "loss": 0.5545, + "mean_token_accuracy": 0.8257480263710022, + "num_tokens": 115938899.0, + "step": 3036 + }, + { + "epoch": 0.3863376160793792, + "ewc_loss": 0.025957008823752403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.2978504855709616e-05, + "grad_norm": 22.080812454223633, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.859128475189209, + "num_tokens": 115972199.0, + "step": 3037 + }, + { + "epoch": 0.38646482635796975, + "ewc_loss": 0.02601184882223606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3005924301978666e-05, + "grad_norm": 22.206195831298828, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8599921464920044, + "num_tokens": 116011506.0, + "step": 3038 + }, + { + "epoch": 0.3865920366365602, + "ewc_loss": 0.026014316827058792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3007158486288972e-05, + "grad_norm": 22.15908432006836, + "learning_rate": 1e-06, + "loss": 0.5527, + "mean_token_accuracy": 0.8287874460220337, + "num_tokens": 116051428.0, + "step": 3039 + }, + { + "epoch": 0.38671924691515075, + "ewc_loss": 0.026013845577836037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3006922927161213e-05, + "grad_norm": 22.150943756103516, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8596053123474121, + "num_tokens": 116090307.0, + "step": 3040 + }, + { + "epoch": 0.3868464571937413, + "ewc_loss": 0.026019709184765816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3009854228585027e-05, + "grad_norm": 22.166175842285156, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8408930897712708, + "num_tokens": 116123381.0, + "step": 3041 + }, + { + "epoch": 0.38697366747233175, + "ewc_loss": 0.026044631376862526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.302231612498872e-05, + "grad_norm": 22.236600875854492, + "learning_rate": 1e-06, + "loss": 0.476, + 
"mean_token_accuracy": 0.8532227873802185, + "num_tokens": 116157755.0, + "step": 3042 + }, + { + "epoch": 0.3871008777509223, + "ewc_loss": 0.026041964069008827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3020981896261219e-05, + "grad_norm": 22.163782119750977, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.851372241973877, + "num_tokens": 116197371.0, + "step": 3043 + }, + { + "epoch": 0.3872280880295128, + "ewc_loss": 0.02601781114935875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3008905625611078e-05, + "grad_norm": 22.144428253173828, + "learning_rate": 1e-06, + "loss": 0.5305, + "mean_token_accuracy": 0.83585524559021, + "num_tokens": 116240544.0, + "step": 3044 + }, + { + "epoch": 0.3873552983081033, + "ewc_loss": 0.026053696870803833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3026848137087654e-05, + "grad_norm": 22.217655181884766, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8475760221481323, + "num_tokens": 116272079.0, + "step": 3045 + }, + { + "epoch": 0.3874825085866938, + "ewc_loss": 0.02606048807501793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3030244190304074e-05, + "grad_norm": 22.20197296142578, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8485407829284668, + "num_tokens": 116309641.0, + "step": 3046 + }, + { + "epoch": 0.38760971886528434, + "ewc_loss": 0.02604803629219532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3024017789575737e-05, + "grad_norm": 22.134521484375, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8450549244880676, + "num_tokens": 116344470.0, + "step": 3047 + }, + { + "epoch": 0.3877369291438748, + "ewc_loss": 0.026116646826267242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.305832302023191e-05, + "grad_norm": 22.26420021057129, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8539667725563049, + "num_tokens": 116380177.0, + "step": 3048 + }, + { + "epoch": 
0.38786413942246534, + "ewc_loss": 0.026107005774974823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3053502698312514e-05, + "grad_norm": 22.169851303100586, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8613892793655396, + "num_tokens": 116416482.0, + "step": 3049 + }, + { + "epoch": 0.38799134970105587, + "ewc_loss": 0.026097238063812256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3048618711763993e-05, + "grad_norm": 22.384265899658203, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8389459848403931, + "num_tokens": 116456103.0, + "step": 3050 + }, + { + "epoch": 0.38811855997964634, + "ewc_loss": 0.02606610767543316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3033053619437851e-05, + "grad_norm": 22.14280891418457, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8570117950439453, + "num_tokens": 116496810.0, + "step": 3051 + }, + { + "epoch": 0.38824577025823687, + "ewc_loss": 0.026063276454806328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3031638445681892e-05, + "grad_norm": 22.246110916137695, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8464672565460205, + "num_tokens": 116532999.0, + "step": 3052 + }, + { + "epoch": 0.3883729805368274, + "ewc_loss": 0.02610212005674839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3051059795543551e-05, + "grad_norm": 22.100234985351562, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8594681024551392, + "num_tokens": 116565700.0, + "step": 3053 + }, + { + "epoch": 0.38850019081541787, + "ewc_loss": 0.026098981499671936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3049490917182993e-05, + "grad_norm": 22.31228256225586, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8548412919044495, + "num_tokens": 116602798.0, + "step": 3054 + }, + { + "epoch": 0.3886274010940084, + "ewc_loss": 0.026112569496035576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3056284842605237e-05, + "grad_norm": 22.12731170654297, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8615682125091553, + "num_tokens": 116638743.0, + "step": 3055 + }, + { + "epoch": 0.3887546113725989, + "ewc_loss": 0.026091909036040306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.30459548017825e-05, + "grad_norm": 22.223419189453125, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8627122640609741, + "num_tokens": 116672290.0, + "step": 3056 + }, + { + "epoch": 0.3888818216511894, + "ewc_loss": 0.026087338104844093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3043669241596945e-05, + "grad_norm": 22.10401153564453, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.853039026260376, + "num_tokens": 116712593.0, + "step": 3057 + }, + { + "epoch": 0.3890090319297799, + "ewc_loss": 0.026096973568201065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3048486835032236e-05, + "grad_norm": 22.2586612701416, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8483372330665588, + "num_tokens": 116748663.0, + "step": 3058 + }, + { + "epoch": 0.38913624220837045, + "ewc_loss": 0.026154913008213043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3077456060273107e-05, + "grad_norm": 22.091615676879883, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8475309610366821, + "num_tokens": 116796018.0, + "step": 3059 + }, + { + "epoch": 0.3892634524869609, + "ewc_loss": 0.02614983730018139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.307491857005516e-05, + "grad_norm": 22.36870002746582, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8475666046142578, + "num_tokens": 116837287.0, + "step": 3060 + }, + { + "epoch": 0.38939066276555145, + "ewc_loss": 0.026172572746872902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.308628634433262e-05, + "grad_norm": 22.14834213256836, + "learning_rate": 1e-06, + "loss": 0.5453, + 
"mean_token_accuracy": 0.8348127603530884, + "num_tokens": 116872163.0, + "step": 3061 + }, + { + "epoch": 0.389517873044142, + "ewc_loss": 0.026107413694262505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.305370642512571e-05, + "grad_norm": 22.371522903442383, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8595605492591858, + "num_tokens": 116913445.0, + "step": 3062 + }, + { + "epoch": 0.38964508332273246, + "ewc_loss": 0.026149984449148178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3074992239126004e-05, + "grad_norm": 22.211488723754883, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8416130542755127, + "num_tokens": 116954469.0, + "step": 3063 + }, + { + "epoch": 0.389772293601323, + "ewc_loss": 0.02603093348443508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3015466720389668e-05, + "grad_norm": 22.14571189880371, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8494171500205994, + "num_tokens": 116994339.0, + "step": 3064 + }, + { + "epoch": 0.3898995038799135, + "ewc_loss": 0.026103388518095016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3051694622845389e-05, + "grad_norm": 22.178874969482422, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8629857301712036, + "num_tokens": 117030867.0, + "step": 3065 + }, + { + "epoch": 0.390026714158504, + "ewc_loss": 0.026113897562026978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3056948773737531e-05, + "grad_norm": 22.12555694580078, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8593554496765137, + "num_tokens": 117068309.0, + "step": 3066 + }, + { + "epoch": 0.3901539244370945, + "ewc_loss": 0.026123082265257835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3061540812486783e-05, + "grad_norm": 22.268512725830078, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8470600843429565, + "num_tokens": 117105278.0, + "step": 3067 + }, + { + "epoch": 
0.39028113471568504, + "ewc_loss": 0.02612183429300785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3060916899121366e-05, + "grad_norm": 22.167238235473633, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8435113430023193, + "num_tokens": 117143096.0, + "step": 3068 + }, + { + "epoch": 0.3904083449942755, + "ewc_loss": 0.02611188404262066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3055941963102669e-05, + "grad_norm": 22.1951847076416, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8520739674568176, + "num_tokens": 117179638.0, + "step": 3069 + }, + { + "epoch": 0.39053555527286604, + "ewc_loss": 0.026194509118795395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3097254850436002e-05, + "grad_norm": 22.313493728637695, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.864456295967102, + "num_tokens": 117211560.0, + "step": 3070 + }, + { + "epoch": 0.39066276555145657, + "ewc_loss": 0.026100870221853256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3050434972683433e-05, + "grad_norm": 22.18375587463379, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8427918553352356, + "num_tokens": 117250622.0, + "step": 3071 + }, + { + "epoch": 0.39078997583004704, + "ewc_loss": 0.02610839530825615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3054197552264668e-05, + "grad_norm": 22.180744171142578, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8711156845092773, + "num_tokens": 117287600.0, + "step": 3072 + }, + { + "epoch": 0.39091718610863757, + "ewc_loss": 0.02615673653781414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3078368283458985e-05, + "grad_norm": 22.197628021240234, + "learning_rate": 1e-06, + "loss": 0.5236, + "mean_token_accuracy": 0.8360422849655151, + "num_tokens": 117328188.0, + "step": 3073 + }, + { + "epoch": 0.3910443963872281, + "ewc_loss": 0.026123039424419403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3061519894108642e-05, + "grad_norm": 22.218257904052734, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8589290976524353, + "num_tokens": 117367282.0, + "step": 3074 + }, + { + "epoch": 0.39117160666581857, + "ewc_loss": 0.02617247775197029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3086239050608128e-05, + "grad_norm": 22.370403289794922, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8615910410881042, + "num_tokens": 117403376.0, + "step": 3075 + }, + { + "epoch": 0.3912988169444091, + "ewc_loss": 0.026141023263335228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.307051206822507e-05, + "grad_norm": 22.130067825317383, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8411612510681152, + "num_tokens": 117438661.0, + "step": 3076 + }, + { + "epoch": 0.39142602722299963, + "ewc_loss": 0.026116449385881424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3058224794804119e-05, + "grad_norm": 22.21354866027832, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8555017709732056, + "num_tokens": 117477351.0, + "step": 3077 + }, + { + "epoch": 0.3915532375015901, + "ewc_loss": 0.026154588907957077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3077294170216192e-05, + "grad_norm": 22.215930938720703, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8483197689056396, + "num_tokens": 117512773.0, + "step": 3078 + }, + { + "epoch": 0.39168044778018063, + "ewc_loss": 0.02616061270236969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3080306416668463e-05, + "grad_norm": 22.387008666992188, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.835946798324585, + "num_tokens": 117555370.0, + "step": 3079 + }, + { + "epoch": 0.39180765805877116, + "ewc_loss": 0.026197513565421104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3098756426188629e-05, + "grad_norm": 22.38801383972168, + "learning_rate": 1e-06, + "loss": 0.4475, + 
"mean_token_accuracy": 0.8588166236877441, + "num_tokens": 117594208.0, + "step": 3080 + }, + { + "epoch": 0.3919348683373617, + "ewc_loss": 0.026077784597873688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3038892575423233e-05, + "grad_norm": 22.17466163635254, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8537266254425049, + "num_tokens": 117632715.0, + "step": 3081 + }, + { + "epoch": 0.39206207861595216, + "ewc_loss": 0.026092922315001488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3046461390331388e-05, + "grad_norm": 22.213611602783203, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.847299337387085, + "num_tokens": 117669221.0, + "step": 3082 + }, + { + "epoch": 0.3921892888945427, + "ewc_loss": 0.026164738461375237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3082369150652084e-05, + "grad_norm": 22.403106689453125, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8454997539520264, + "num_tokens": 117709559.0, + "step": 3083 + }, + { + "epoch": 0.3923164991731332, + "ewc_loss": 0.02612885646522045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3064428458164912e-05, + "grad_norm": 22.24633026123047, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8580952882766724, + "num_tokens": 117745240.0, + "step": 3084 + }, + { + "epoch": 0.3924437094517237, + "ewc_loss": 0.026113973930478096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3056986972515006e-05, + "grad_norm": 22.396852493286133, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8518515825271606, + "num_tokens": 117787897.0, + "step": 3085 + }, + { + "epoch": 0.3925709197303142, + "ewc_loss": 0.026135016232728958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3067508007225115e-05, + "grad_norm": 22.225841522216797, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8479088544845581, + "num_tokens": 117826862.0, + "step": 3086 + }, + { + "epoch": 
0.39269813000890474, + "ewc_loss": 0.02608417719602585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3042088539805263e-05, + "grad_norm": 22.296802520751953, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8464390635490417, + "num_tokens": 117866932.0, + "step": 3087 + }, + { + "epoch": 0.3928253402874952, + "ewc_loss": 0.026138177141547203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3069088709016796e-05, + "grad_norm": 22.250110626220703, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.852956235408783, + "num_tokens": 117905538.0, + "step": 3088 + }, + { + "epoch": 0.39295255056608575, + "ewc_loss": 0.026069408282637596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3034704352321569e-05, + "grad_norm": 22.1684627532959, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8398300409317017, + "num_tokens": 117945123.0, + "step": 3089 + }, + { + "epoch": 0.3930797608446763, + "ewc_loss": 0.026143940165638924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3071969988232013e-05, + "grad_norm": 22.299793243408203, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8544037342071533, + "num_tokens": 117988743.0, + "step": 3090 + }, + { + "epoch": 0.39320697112326675, + "ewc_loss": 0.026085689663887024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3042845239397138e-05, + "grad_norm": 22.184228897094727, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8656815886497498, + "num_tokens": 118026078.0, + "step": 3091 + }, + { + "epoch": 0.3933341814018573, + "ewc_loss": 0.02611013874411583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3055069757683668e-05, + "grad_norm": 22.317106246948242, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8630176782608032, + "num_tokens": 118060105.0, + "step": 3092 + }, + { + "epoch": 0.3934613916804478, + "ewc_loss": 0.026151379570364952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3075689821562264e-05, + "grad_norm": 22.213783264160156, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8784983158111572, + "num_tokens": 118089677.0, + "step": 3093 + }, + { + "epoch": 0.3935886019590383, + "ewc_loss": 0.026076771318912506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3038385986874346e-05, + "grad_norm": 22.325668334960938, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8645744323730469, + "num_tokens": 118128449.0, + "step": 3094 + }, + { + "epoch": 0.3937158122376288, + "ewc_loss": 0.026142871007323265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.307143520534737e-05, + "grad_norm": 22.219881057739258, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8601986169815063, + "num_tokens": 118168411.0, + "step": 3095 + }, + { + "epoch": 0.39384302251621933, + "ewc_loss": 0.026073452085256577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.303672615904361e-05, + "grad_norm": 22.22916603088379, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8618252873420715, + "num_tokens": 118206097.0, + "step": 3096 + }, + { + "epoch": 0.3939702327948098, + "ewc_loss": 0.02611592970788479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3057964679319412e-05, + "grad_norm": 22.187162399291992, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8649666905403137, + "num_tokens": 118244429.0, + "step": 3097 + }, + { + "epoch": 0.39409744307340033, + "ewc_loss": 0.026140866801142693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3070433851680718e-05, + "grad_norm": 22.33381462097168, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8573145866394043, + "num_tokens": 118279125.0, + "step": 3098 + }, + { + "epoch": 0.39422465335199086, + "ewc_loss": 0.026138344779610634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.306917238252936e-05, + "grad_norm": 22.153215408325195, + "learning_rate": 1e-06, + "loss": 0.4826, + 
"mean_token_accuracy": 0.8486409187316895, + "num_tokens": 118313156.0, + "step": 3099 + }, + { + "epoch": 0.39435186363058133, + "ewc_loss": 0.026146190240979195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3073095033178106e-05, + "grad_norm": 22.307891845703125, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8367051482200623, + "num_tokens": 118349868.0, + "step": 3100 + }, + { + "epoch": 0.39447907390917186, + "ewc_loss": 0.02619243413209915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3096217116981279e-05, + "grad_norm": 22.240983963012695, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8425084352493286, + "num_tokens": 118394998.0, + "step": 3101 + }, + { + "epoch": 0.3946062841877624, + "ewc_loss": 0.02614450640976429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3072252841084264e-05, + "grad_norm": 22.196857452392578, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8627960085868835, + "num_tokens": 118436002.0, + "step": 3102 + }, + { + "epoch": 0.39473349446635286, + "ewc_loss": 0.026197224855422974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3098612726025749e-05, + "grad_norm": 22.270965576171875, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8572166562080383, + "num_tokens": 118475094.0, + "step": 3103 + }, + { + "epoch": 0.3948607047449434, + "ewc_loss": 0.026163900271058083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3081949873594567e-05, + "grad_norm": 22.299861907958984, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8495069742202759, + "num_tokens": 118514164.0, + "step": 3104 + }, + { + "epoch": 0.3949879150235339, + "ewc_loss": 0.02616499550640583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3082497389405034e-05, + "grad_norm": 22.191137313842773, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8466765880584717, + "num_tokens": 118554271.0, + "step": 3105 + }, + { + "epoch": 
0.3951151253021244, + "ewc_loss": 0.026205716654658318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3102858247293625e-05, + "grad_norm": 22.27975845336914, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8527948260307312, + "num_tokens": 118597123.0, + "step": 3106 + }, + { + "epoch": 0.3952423355807149, + "ewc_loss": 0.02617284469306469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3086422768537886e-05, + "grad_norm": 22.23094940185547, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8465381264686584, + "num_tokens": 118636683.0, + "step": 3107 + }, + { + "epoch": 0.39536954585930545, + "ewc_loss": 0.02619130350649357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3095651411276776e-05, + "grad_norm": 22.2211971282959, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8539304733276367, + "num_tokens": 118665770.0, + "step": 3108 + }, + { + "epoch": 0.3954967561378959, + "ewc_loss": 0.026236340403556824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3118170500092674e-05, + "grad_norm": 22.217098236083984, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.862488329410553, + "num_tokens": 118701652.0, + "step": 3109 + }, + { + "epoch": 0.39562396641648645, + "ewc_loss": 0.02618952840566635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3094764653942548e-05, + "grad_norm": 22.143875122070312, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8586357235908508, + "num_tokens": 118741756.0, + "step": 3110 + }, + { + "epoch": 0.395751176695077, + "ewc_loss": 0.026194864884018898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3097432201902848e-05, + "grad_norm": 22.2030029296875, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8520792722702026, + "num_tokens": 118784320.0, + "step": 3111 + }, + { + "epoch": 0.39587838697366745, + "ewc_loss": 0.02621009573340416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3105047401040792e-05, + "grad_norm": 22.198131561279297, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8460965156555176, + "num_tokens": 118821456.0, + "step": 3112 + }, + { + "epoch": 0.396005597252258, + "ewc_loss": 0.026211107149720192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.310555398958968e-05, + "grad_norm": 22.247814178466797, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8502829074859619, + "num_tokens": 118861746.0, + "step": 3113 + }, + { + "epoch": 0.3961328075308485, + "ewc_loss": 0.026243997737765312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3121998563292436e-05, + "grad_norm": 22.185155868530273, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8576315641403198, + "num_tokens": 118900346.0, + "step": 3114 + }, + { + "epoch": 0.396260017809439, + "ewc_loss": 0.026270661503076553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3135330846125726e-05, + "grad_norm": 22.218544006347656, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8523856401443481, + "num_tokens": 118939707.0, + "step": 3115 + }, + { + "epoch": 0.3963872280880295, + "ewc_loss": 0.026243582367897034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3121791198500432e-05, + "grad_norm": 22.27701187133789, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8508167266845703, + "num_tokens": 118971154.0, + "step": 3116 + }, + { + "epoch": 0.39651443836662004, + "ewc_loss": 0.026233728975057602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3116864465700928e-05, + "grad_norm": 22.159210205078125, + "learning_rate": 1e-06, + "loss": 0.5318, + "mean_token_accuracy": 0.8367301225662231, + "num_tokens": 119012344.0, + "step": 3117 + }, + { + "epoch": 0.3966416486452105, + "ewc_loss": 0.02626543678343296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3132718777342234e-05, + "grad_norm": 22.37660026550293, + "learning_rate": 1e-06, + "loss": 0.4754, + 
"mean_token_accuracy": 0.8518491983413696, + "num_tokens": 119047677.0, + "step": 3118 + }, + { + "epoch": 0.39676885892380104, + "ewc_loss": 0.02626822516322136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3134113032720052e-05, + "grad_norm": 22.18683433532715, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8670560121536255, + "num_tokens": 119086357.0, + "step": 3119 + }, + { + "epoch": 0.39689606920239157, + "ewc_loss": 0.026211222633719444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3105611287755892e-05, + "grad_norm": 22.192893981933594, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8588531017303467, + "num_tokens": 119122854.0, + "step": 3120 + }, + { + "epoch": 0.39702327948098204, + "ewc_loss": 0.026290273293852806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3145137018000241e-05, + "grad_norm": 22.27163314819336, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8591035604476929, + "num_tokens": 119155449.0, + "step": 3121 + }, + { + "epoch": 0.39715048975957257, + "ewc_loss": 0.026318617165088654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3159308764443267e-05, + "grad_norm": 22.324562072753906, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.847435712814331, + "num_tokens": 119197287.0, + "step": 3122 + }, + { + "epoch": 0.3972777000381631, + "ewc_loss": 0.026297936215996742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3148967809684109e-05, + "grad_norm": 22.237403869628906, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8403639793395996, + "num_tokens": 119235749.0, + "step": 3123 + }, + { + "epoch": 0.39740491031675357, + "ewc_loss": 0.026275726035237312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3137862879375461e-05, + "grad_norm": 22.188602447509766, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.847378671169281, + "num_tokens": 119267282.0, + "step": 3124 + }, + { + "epoch": 
0.3975321205953441, + "ewc_loss": 0.02633945271372795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3169726116757374e-05, + "grad_norm": 22.32300567626953, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8436453938484192, + "num_tokens": 119304387.0, + "step": 3125 + }, + { + "epoch": 0.3976593308739346, + "ewc_loss": 0.026320261880755424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.316013094765367e-05, + "grad_norm": 22.24256134033203, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8420414924621582, + "num_tokens": 119341470.0, + "step": 3126 + }, + { + "epoch": 0.3977865411525251, + "ewc_loss": 0.026284461840987206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3142231182428077e-05, + "grad_norm": 22.25188636779785, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8410981893539429, + "num_tokens": 119381067.0, + "step": 3127 + }, + { + "epoch": 0.3979137514311156, + "ewc_loss": 0.026323867961764336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3161933566152584e-05, + "grad_norm": 22.22738265991211, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8518615961074829, + "num_tokens": 119415680.0, + "step": 3128 + }, + { + "epoch": 0.39804096170970615, + "ewc_loss": 0.026318859308958054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3159429727238603e-05, + "grad_norm": 22.313304901123047, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8427190780639648, + "num_tokens": 119454300.0, + "step": 3129 + }, + { + "epoch": 0.3981681719882967, + "ewc_loss": 0.02634565159678459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3172825674701016e-05, + "grad_norm": 22.30162811279297, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8651775121688843, + "num_tokens": 119485599.0, + "step": 3130 + }, + { + "epoch": 0.39829538226688715, + "ewc_loss": 0.026338867843151093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3169434168958105e-05, + "grad_norm": 22.235061645507812, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8378081321716309, + "num_tokens": 119525820.0, + "step": 3131 + }, + { + "epoch": 0.3984225925454777, + "ewc_loss": 0.026352405548095703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.31762026285287e-05, + "grad_norm": 22.38248634338379, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.858723521232605, + "num_tokens": 119568216.0, + "step": 3132 + }, + { + "epoch": 0.3985498028240682, + "ewc_loss": 0.026374638080596924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.318731938226847e-05, + "grad_norm": 22.284814834594727, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8398048877716064, + "num_tokens": 119602914.0, + "step": 3133 + }, + { + "epoch": 0.3986770131026587, + "ewc_loss": 0.026321373879909515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3160686648916453e-05, + "grad_norm": 22.30451202392578, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8475204706192017, + "num_tokens": 119637024.0, + "step": 3134 + }, + { + "epoch": 0.3988042233812492, + "ewc_loss": 0.026405127719044685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3202563422964886e-05, + "grad_norm": 22.3319091796875, + "learning_rate": 1e-06, + "loss": 0.5031, + "mean_token_accuracy": 0.8426144123077393, + "num_tokens": 119679265.0, + "step": 3135 + }, + { + "epoch": 0.39893143365983974, + "ewc_loss": 0.02634149044752121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3170745660318062e-05, + "grad_norm": 22.273441314697266, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8429672718048096, + "num_tokens": 119716254.0, + "step": 3136 + }, + { + "epoch": 0.3990586439384302, + "ewc_loss": 0.026363231241703033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3181615940993652e-05, + "grad_norm": 22.194656372070312, + "learning_rate": 1e-06, + "loss": 0.4868, + 
"mean_token_accuracy": 0.8434275388717651, + "num_tokens": 119761325.0, + "step": 3137 + }, + { + "epoch": 0.39918585421702074, + "ewc_loss": 0.026306714862585068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3153357031114865e-05, + "grad_norm": 22.277568817138672, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.875296413898468, + "num_tokens": 119795468.0, + "step": 3138 + }, + { + "epoch": 0.39931306449561127, + "ewc_loss": 0.026417193934321404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3208597010816447e-05, + "grad_norm": 22.242528915405273, + "learning_rate": 1e-06, + "loss": 0.5184, + "mean_token_accuracy": 0.8418580293655396, + "num_tokens": 119833804.0, + "step": 3139 + }, + { + "epoch": 0.39944027477420174, + "ewc_loss": 0.026368411257863045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.31842052724096e-05, + "grad_norm": 22.294328689575195, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8469659090042114, + "num_tokens": 119870852.0, + "step": 3140 + }, + { + "epoch": 0.39956748505279227, + "ewc_loss": 0.026356352493166924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3178176232031547e-05, + "grad_norm": 22.29606819152832, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8544137477874756, + "num_tokens": 119909258.0, + "step": 3141 + }, + { + "epoch": 0.3996946953313828, + "ewc_loss": 0.026312170550227165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3156085515220184e-05, + "grad_norm": 22.340599060058594, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8402799367904663, + "num_tokens": 119940701.0, + "step": 3142 + }, + { + "epoch": 0.39982190560997327, + "ewc_loss": 0.026366567239165306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3183283954276703e-05, + "grad_norm": 22.29954719543457, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8515865802764893, + "num_tokens": 119976780.0, + "step": 3143 + }, + { + "epoch": 
0.3999491158885638, + "ewc_loss": 0.0263860821723938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3193041013437323e-05, + "grad_norm": 22.321332931518555, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8436855673789978, + "num_tokens": 120017543.0, + "step": 3144 + }, + { + "epoch": 0.40007632616715433, + "ewc_loss": 0.026402179151773453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3201089132053312e-05, + "grad_norm": 22.387609481811523, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8490332961082458, + "num_tokens": 120050036.0, + "step": 3145 + }, + { + "epoch": 0.4002035364457448, + "ewc_loss": 0.02640751376748085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3203756680013612e-05, + "grad_norm": 22.40528678894043, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8654464483261108, + "num_tokens": 120084582.0, + "step": 3146 + }, + { + "epoch": 0.40033074672433533, + "ewc_loss": 0.026375673711299896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3187836884753779e-05, + "grad_norm": 22.30901336669922, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8475124835968018, + "num_tokens": 120120218.0, + "step": 3147 + }, + { + "epoch": 0.40045795700292586, + "ewc_loss": 0.02636447362601757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3182237125874963e-05, + "grad_norm": 22.321332931518555, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8727799654006958, + "num_tokens": 120161650.0, + "step": 3148 + }, + { + "epoch": 0.40058516728151633, + "ewc_loss": 0.026377277448773384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3188639059080742e-05, + "grad_norm": 22.30322265625, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8607538342475891, + "num_tokens": 120197409.0, + "step": 3149 + }, + { + "epoch": 0.40071237756010686, + "ewc_loss": 0.026387769728899002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3193885024520569e-05, + "grad_norm": 22.263267517089844, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8397416472434998, + "num_tokens": 120240700.0, + "step": 3150 + }, + { + "epoch": 0.4008395878386974, + "ewc_loss": 0.026360906660556793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3180453606764786e-05, + "grad_norm": 22.249231338500977, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8412320613861084, + "num_tokens": 120286741.0, + "step": 3151 + }, + { + "epoch": 0.40096679811728786, + "ewc_loss": 0.026412546634674072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3206273251853418e-05, + "grad_norm": 22.323204040527344, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8542274236679077, + "num_tokens": 120323236.0, + "step": 3152 + }, + { + "epoch": 0.4010940083958784, + "ewc_loss": 0.026389872655272484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3194936400395818e-05, + "grad_norm": 22.169225692749023, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8499407172203064, + "num_tokens": 120362185.0, + "step": 3153 + }, + { + "epoch": 0.4012212186744689, + "ewc_loss": 0.026413390412926674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.320669525739504e-05, + "grad_norm": 22.413066864013672, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8427009582519531, + "num_tokens": 120397524.0, + "step": 3154 + }, + { + "epoch": 0.4013484289530594, + "ewc_loss": 0.0264970064163208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3248502909846138e-05, + "grad_norm": 22.347740173339844, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8550631999969482, + "num_tokens": 120439723.0, + "step": 3155 + }, + { + "epoch": 0.4014756392316499, + "ewc_loss": 0.026358330622315407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3179164852772374e-05, + "grad_norm": 22.282556533813477, + "learning_rate": 1e-06, + "loss": 0.4534, + 
"mean_token_accuracy": 0.8561081886291504, + "num_tokens": 120483693.0, + "step": 3156 + }, + { + "epoch": 0.40160284951024044, + "ewc_loss": 0.02639741823077202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3198708984418772e-05, + "grad_norm": 22.349952697753906, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8601486682891846, + "num_tokens": 120520558.0, + "step": 3157 + }, + { + "epoch": 0.4017300597888309, + "ewc_loss": 0.026382653042674065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3191326615924481e-05, + "grad_norm": 22.21158790588379, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8612107634544373, + "num_tokens": 120557901.0, + "step": 3158 + }, + { + "epoch": 0.40185727006742145, + "ewc_loss": 0.02630242519080639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3151212442608085e-05, + "grad_norm": 22.30829429626465, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8570836186408997, + "num_tokens": 120593851.0, + "step": 3159 + }, + { + "epoch": 0.401984480346012, + "ewc_loss": 0.02645762823522091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3228814168542158e-05, + "grad_norm": 22.315500259399414, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8453863263130188, + "num_tokens": 120624533.0, + "step": 3160 + }, + { + "epoch": 0.40211169062460245, + "ewc_loss": 0.026348551735281944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3174276318750344e-05, + "grad_norm": 22.237159729003906, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8463602066040039, + "num_tokens": 120660820.0, + "step": 3161 + }, + { + "epoch": 0.402238900903193, + "ewc_loss": 0.026426630094647408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3213315469329245e-05, + "grad_norm": 22.291847229003906, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8627667427062988, + "num_tokens": 120701485.0, + "step": 3162 + }, + { + "epoch": 
0.4023661111817835, + "ewc_loss": 0.026404578238725662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.320228875556495e-05, + "grad_norm": 22.255043029785156, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8606282472610474, + "num_tokens": 120736677.0, + "step": 3163 + }, + { + "epoch": 0.402493321460374, + "ewc_loss": 0.026415476575493813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3207738447817974e-05, + "grad_norm": 22.252944946289062, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8484612703323364, + "num_tokens": 120773933.0, + "step": 3164 + }, + { + "epoch": 0.4026205317389645, + "ewc_loss": 0.026392323896288872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3196162399253808e-05, + "grad_norm": 22.277664184570312, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.856168270111084, + "num_tokens": 120804671.0, + "step": 3165 + }, + { + "epoch": 0.40274774201755503, + "ewc_loss": 0.026447882875800133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3223941095930059e-05, + "grad_norm": 22.375059127807617, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8553165197372437, + "num_tokens": 120837042.0, + "step": 3166 + }, + { + "epoch": 0.4028749522961455, + "ewc_loss": 0.026497257873415947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3248629329609685e-05, + "grad_norm": 22.39871597290039, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8485857248306274, + "num_tokens": 120875127.0, + "step": 3167 + }, + { + "epoch": 0.40300216257473603, + "ewc_loss": 0.026417575776576996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.320878800470382e-05, + "grad_norm": 22.36739158630371, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8508868217468262, + "num_tokens": 120915937.0, + "step": 3168 + }, + { + "epoch": 0.40312937285332656, + "ewc_loss": 0.02645067684352398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3225338079791982e-05, + "grad_norm": 22.222654342651367, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8364360332489014, + "num_tokens": 120953334.0, + "step": 3169 + }, + { + "epoch": 0.40325658313191703, + "ewc_loss": 0.026463275775313377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3231638149591163e-05, + "grad_norm": 22.3445987701416, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.851546585559845, + "num_tokens": 120986674.0, + "step": 3170 + }, + { + "epoch": 0.40338379341050756, + "ewc_loss": 0.02650781162083149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3253906217869371e-05, + "grad_norm": 22.309961318969727, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8578535318374634, + "num_tokens": 121026305.0, + "step": 3171 + }, + { + "epoch": 0.4035110036890981, + "ewc_loss": 0.026493296027183533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3246648450149223e-05, + "grad_norm": 22.283761978149414, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8658217191696167, + "num_tokens": 121061644.0, + "step": 3172 + }, + { + "epoch": 0.40363821396768856, + "ewc_loss": 0.026508141309022903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3254070836410392e-05, + "grad_norm": 22.29512596130371, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8473854064941406, + "num_tokens": 121102860.0, + "step": 3173 + }, + { + "epoch": 0.4037654242462791, + "ewc_loss": 0.026475608348846436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3237804523669183e-05, + "grad_norm": 22.209566116333008, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8654351234436035, + "num_tokens": 121136566.0, + "step": 3174 + }, + { + "epoch": 0.4038926345248696, + "ewc_loss": 0.02656196430325508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3280981875141151e-05, + "grad_norm": 22.415555953979492, + "learning_rate": 1e-06, + "loss": 0.4317, + 
"mean_token_accuracy": 0.8610657453536987, + "num_tokens": 121173759.0, + "step": 3175 + }, + { + "epoch": 0.4040198448034601, + "ewc_loss": 0.026556795462965965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3278398000693414e-05, + "grad_norm": 22.271163940429688, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.851597249507904, + "num_tokens": 121219881.0, + "step": 3176 + }, + { + "epoch": 0.4041470550820506, + "ewc_loss": 0.026493266224861145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3246632988739293e-05, + "grad_norm": 22.352081298828125, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8506425023078918, + "num_tokens": 121258719.0, + "step": 3177 + }, + { + "epoch": 0.40427426536064115, + "ewc_loss": 0.026575328782200813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3287664842209779e-05, + "grad_norm": 22.22222900390625, + "learning_rate": 1e-06, + "loss": 0.5403, + "mean_token_accuracy": 0.8294177651405334, + "num_tokens": 121298013.0, + "step": 3178 + }, + { + "epoch": 0.4044014756392316, + "ewc_loss": 0.026499371975660324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3249686162453145e-05, + "grad_norm": 22.367149353027344, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8631395101547241, + "num_tokens": 121329615.0, + "step": 3179 + }, + { + "epoch": 0.40452868591782215, + "ewc_loss": 0.026579199358820915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.328959933744045e-05, + "grad_norm": 22.220104217529297, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8614515662193298, + "num_tokens": 121365199.0, + "step": 3180 + }, + { + "epoch": 0.4046558961964127, + "ewc_loss": 0.026526760309934616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.326338042417774e-05, + "grad_norm": 22.387948989868164, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8472785949707031, + "num_tokens": 121402393.0, + "step": 3181 + }, + { + "epoch": 
0.4047831064750032, + "ewc_loss": 0.02660142257809639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3300711543706711e-05, + "grad_norm": 22.395116806030273, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8511905074119568, + "num_tokens": 121439678.0, + "step": 3182 + }, + { + "epoch": 0.4049103167535937, + "ewc_loss": 0.02654970996081829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3274854609335307e-05, + "grad_norm": 22.361814498901367, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.846882164478302, + "num_tokens": 121480781.0, + "step": 3183 + }, + { + "epoch": 0.4050375270321842, + "ewc_loss": 0.026535166427493095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3267583199194632e-05, + "grad_norm": 22.23737335205078, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8595560789108276, + "num_tokens": 121517614.0, + "step": 3184 + }, + { + "epoch": 0.40516473731077474, + "ewc_loss": 0.02648797817528248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.324398908764124e-05, + "grad_norm": 22.393661499023438, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8490264415740967, + "num_tokens": 121552225.0, + "step": 3185 + }, + { + "epoch": 0.4052919475893652, + "ewc_loss": 0.026565317064523697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3282658073876519e-05, + "grad_norm": 22.403244018554688, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8458995819091797, + "num_tokens": 121587177.0, + "step": 3186 + }, + { + "epoch": 0.40541915786795574, + "ewc_loss": 0.026483450084924698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3241725355328526e-05, + "grad_norm": 22.347654342651367, + "learning_rate": 1e-06, + "loss": 0.5245, + "mean_token_accuracy": 0.8339320421218872, + "num_tokens": 121619547.0, + "step": 3187 + }, + { + "epoch": 0.40554636814654627, + "ewc_loss": 0.026492994278669357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3246497474028729e-05, + "grad_norm": 22.33492088317871, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8588648438453674, + "num_tokens": 121657396.0, + "step": 3188 + }, + { + "epoch": 0.40567357842513674, + "ewc_loss": 0.026507165282964706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.325358243775554e-05, + "grad_norm": 22.34786605834961, + "learning_rate": 1e-06, + "loss": 0.5189, + "mean_token_accuracy": 0.8340916037559509, + "num_tokens": 121691867.0, + "step": 3189 + }, + { + "epoch": 0.40580078870372727, + "ewc_loss": 0.026483187451958656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.324159347859677e-05, + "grad_norm": 22.299365997314453, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8455018401145935, + "num_tokens": 121723546.0, + "step": 3190 + }, + { + "epoch": 0.4059279989823178, + "ewc_loss": 0.026529280468821526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3264640074339695e-05, + "grad_norm": 22.426992416381836, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.851260781288147, + "num_tokens": 121764589.0, + "step": 3191 + }, + { + "epoch": 0.40605520926090827, + "ewc_loss": 0.026571091264486313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3285545719554648e-05, + "grad_norm": 22.290964126586914, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8414408564567566, + "num_tokens": 121803281.0, + "step": 3192 + }, + { + "epoch": 0.4061824195394988, + "ewc_loss": 0.026531292125582695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3265645975479856e-05, + "grad_norm": 22.330690383911133, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8387854099273682, + "num_tokens": 121841710.0, + "step": 3193 + }, + { + "epoch": 0.4063096298180893, + "ewc_loss": 0.026602841913700104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3301420949574094e-05, + "grad_norm": 22.296646118164062, + "learning_rate": 1e-06, + "loss": 0.4908, + 
"mean_token_accuracy": 0.8505747318267822, + "num_tokens": 121881979.0, + "step": 3194 + }, + { + "epoch": 0.4064368400966798, + "ewc_loss": 0.026537828147411346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3268913789943326e-05, + "grad_norm": 22.278854370117188, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.856545090675354, + "num_tokens": 121921599.0, + "step": 3195 + }, + { + "epoch": 0.4065640503752703, + "ewc_loss": 0.026665959507226944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3332979506230913e-05, + "grad_norm": 22.433002471923828, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8621615767478943, + "num_tokens": 121960545.0, + "step": 3196 + }, + { + "epoch": 0.40669126065386085, + "ewc_loss": 0.0266000647097826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3300032151164487e-05, + "grad_norm": 22.302719116210938, + "learning_rate": 1e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8329654335975647, + "num_tokens": 121999259.0, + "step": 3197 + }, + { + "epoch": 0.4068184709324513, + "ewc_loss": 0.026630224660038948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3315112482814584e-05, + "grad_norm": 22.4415225982666, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8653004169464111, + "num_tokens": 122039066.0, + "step": 3198 + }, + { + "epoch": 0.40694568121104185, + "ewc_loss": 0.02657925710082054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3289628441270906e-05, + "grad_norm": 22.314329147338867, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8604928255081177, + "num_tokens": 122085035.0, + "step": 3199 + }, + { + "epoch": 0.4070728914896324, + "ewc_loss": 0.026544906198978424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3272452633827925e-05, + "grad_norm": 22.457298278808594, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8661439418792725, + "num_tokens": 122128007.0, + "step": 3200 + }, + { + "epoch": 
0.40720010176822286, + "ewc_loss": 0.02658051624894142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3290257811604533e-05, + "grad_norm": 22.34346580505371, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8499101400375366, + "num_tokens": 122169623.0, + "step": 3201 + }, + { + "epoch": 0.4073273120468134, + "ewc_loss": 0.026514509692788124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3257254977361299e-05, + "grad_norm": 22.41459846496582, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8464847803115845, + "num_tokens": 122201755.0, + "step": 3202 + }, + { + "epoch": 0.4074545223254039, + "ewc_loss": 0.026544051244854927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3272025171318091e-05, + "grad_norm": 22.44495964050293, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.862517774105072, + "num_tokens": 122233680.0, + "step": 3203 + }, + { + "epoch": 0.4075817326039944, + "ewc_loss": 0.026525171473622322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.326258552580839e-05, + "grad_norm": 22.42578887939453, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8494340181350708, + "num_tokens": 122265107.0, + "step": 3204 + }, + { + "epoch": 0.4077089428825849, + "ewc_loss": 0.026530183851718903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3265092093206476e-05, + "grad_norm": 22.414352416992188, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8560024499893188, + "num_tokens": 122302750.0, + "step": 3205 + }, + { + "epoch": 0.40783615316117544, + "ewc_loss": 0.02651897631585598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3259487786854152e-05, + "grad_norm": 22.35844612121582, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8422417640686035, + "num_tokens": 122346820.0, + "step": 3206 + }, + { + "epoch": 0.4079633634397659, + "ewc_loss": 0.026518773287534714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3259386832942255e-05, + "grad_norm": 22.379777908325195, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8569512367248535, + "num_tokens": 122383254.0, + "step": 3207 + }, + { + "epoch": 0.40809057371835644, + "ewc_loss": 0.02651587687432766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.325793800788233e-05, + "grad_norm": 22.388050079345703, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8641195297241211, + "num_tokens": 122416693.0, + "step": 3208 + }, + { + "epoch": 0.40821778399694697, + "ewc_loss": 0.026541665196418762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3270832823764067e-05, + "grad_norm": 22.435012817382812, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.856913685798645, + "num_tokens": 122452403.0, + "step": 3209 + }, + { + "epoch": 0.40834499427553744, + "ewc_loss": 0.026567162945866585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3283581210998818e-05, + "grad_norm": 22.43374252319336, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8411434888839722, + "num_tokens": 122489685.0, + "step": 3210 + }, + { + "epoch": 0.40847220455412797, + "ewc_loss": 0.02651825360953808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3259126717457548e-05, + "grad_norm": 22.3333683013916, + "learning_rate": 1e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8345679640769958, + "num_tokens": 122529079.0, + "step": 3211 + }, + { + "epoch": 0.4085994148327185, + "ewc_loss": 0.026584560051560402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3292279618326575e-05, + "grad_norm": 22.40618133544922, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8458054661750793, + "num_tokens": 122569933.0, + "step": 3212 + }, + { + "epoch": 0.40872662511130897, + "ewc_loss": 0.026558561250567436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3279280210554134e-05, + "grad_norm": 22.4215087890625, + "learning_rate": 1e-06, + "loss": 0.5345, + 
"mean_token_accuracy": 0.8333827257156372, + "num_tokens": 122607884.0, + "step": 3213 + }, + { + "epoch": 0.4088538353898995, + "ewc_loss": 0.02656778134405613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3283890439197421e-05, + "grad_norm": 22.389551162719727, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8534235954284668, + "num_tokens": 122651968.0, + "step": 3214 + }, + { + "epoch": 0.40898104566849003, + "ewc_loss": 0.026540854945778847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3270427189127076e-05, + "grad_norm": 22.31914710998535, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8557088971138, + "num_tokens": 122688680.0, + "step": 3215 + }, + { + "epoch": 0.4091082559470805, + "ewc_loss": 0.026606854051351547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3303427294886205e-05, + "grad_norm": 22.444927215576172, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8580217361450195, + "num_tokens": 122728271.0, + "step": 3216 + }, + { + "epoch": 0.40923546622567103, + "ewc_loss": 0.02660035528242588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.330017767031677e-05, + "grad_norm": 22.39010238647461, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8504759073257446, + "num_tokens": 122759691.0, + "step": 3217 + }, + { + "epoch": 0.40936267650426156, + "ewc_loss": 0.026564253494143486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3282126928970683e-05, + "grad_norm": 22.43507194519043, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8505252003669739, + "num_tokens": 122800185.0, + "step": 3218 + }, + { + "epoch": 0.40948988678285203, + "ewc_loss": 0.026542887091636658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3271443094708957e-05, + "grad_norm": 22.30940055847168, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8608562350273132, + "num_tokens": 122839548.0, + "step": 3219 + }, + { + "epoch": 
0.40961709706144256, + "ewc_loss": 0.02650958113372326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3254790246719494e-05, + "grad_norm": 22.291534423828125, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8375805020332336, + "num_tokens": 122879621.0, + "step": 3220 + }, + { + "epoch": 0.4097443073400331, + "ewc_loss": 0.02656521461904049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.328260714217322e-05, + "grad_norm": 22.327993392944336, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8460019826889038, + "num_tokens": 122921495.0, + "step": 3221 + }, + { + "epoch": 0.40987151761862356, + "ewc_loss": 0.026565108448266983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3282554391480517e-05, + "grad_norm": 22.285114288330078, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.858063817024231, + "num_tokens": 122957197.0, + "step": 3222 + }, + { + "epoch": 0.4099987278972141, + "ewc_loss": 0.026583749800920486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3291874893184286e-05, + "grad_norm": 22.33705711364746, + "learning_rate": 1e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8326553702354431, + "num_tokens": 122999999.0, + "step": 3223 + }, + { + "epoch": 0.4101259381758046, + "ewc_loss": 0.026646148413419724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3323074199433904e-05, + "grad_norm": 22.41731071472168, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8566271066665649, + "num_tokens": 123040402.0, + "step": 3224 + }, + { + "epoch": 0.4102531484543951, + "ewc_loss": 0.026619793847203255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3309896530699916e-05, + "grad_norm": 22.317214965820312, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8560546636581421, + "num_tokens": 123080086.0, + "step": 3225 + }, + { + "epoch": 0.4103803587329856, + "ewc_loss": 0.02662237361073494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3311187103681732e-05, + "grad_norm": 22.403467178344727, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8545932769775391, + "num_tokens": 123120058.0, + "step": 3226 + }, + { + "epoch": 0.41050756901157615, + "ewc_loss": 0.02662084624171257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3310423128132243e-05, + "grad_norm": 22.38013458251953, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8529195785522461, + "num_tokens": 123162003.0, + "step": 3227 + }, + { + "epoch": 0.4106347792901666, + "ewc_loss": 0.026582280173897743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3291140021465253e-05, + "grad_norm": 22.46107292175293, + "learning_rate": 1e-06, + "loss": 0.5669, + "mean_token_accuracy": 0.8196500539779663, + "num_tokens": 123202249.0, + "step": 3228 + }, + { + "epoch": 0.41076198956875715, + "ewc_loss": 0.026595955714583397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3297977602633182e-05, + "grad_norm": 22.367156982421875, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8468826413154602, + "num_tokens": 123237401.0, + "step": 3229 + }, + { + "epoch": 0.4108891998473477, + "ewc_loss": 0.02655652165412903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3278260666993447e-05, + "grad_norm": 22.363754272460938, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8681610822677612, + "num_tokens": 123269364.0, + "step": 3230 + }, + { + "epoch": 0.4110164101259382, + "ewc_loss": 0.02657996490597725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3289982234709896e-05, + "grad_norm": 22.40802574157715, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8544301986694336, + "num_tokens": 123307743.0, + "step": 3231 + }, + { + "epoch": 0.4111436204045287, + "ewc_loss": 0.02658085525035858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3290427887113765e-05, + "grad_norm": 22.466537475585938, + "learning_rate": 1e-06, + "loss": 0.4532, + 
"mean_token_accuracy": 0.8569105863571167, + "num_tokens": 123340102.0, + "step": 3232 + }, + { + "epoch": 0.4112708306831192, + "ewc_loss": 0.02657889947295189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3289450180309359e-05, + "grad_norm": 22.349950790405273, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8473958969116211, + "num_tokens": 123374956.0, + "step": 3233 + }, + { + "epoch": 0.41139804096170973, + "ewc_loss": 0.026571840047836304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3285920431371778e-05, + "grad_norm": 22.429689407348633, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8621187210083008, + "num_tokens": 123418070.0, + "step": 3234 + }, + { + "epoch": 0.4115252512403002, + "ewc_loss": 0.026659023016691208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3329511602933053e-05, + "grad_norm": 22.489612579345703, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8500367999076843, + "num_tokens": 123453183.0, + "step": 3235 + }, + { + "epoch": 0.41165246151889073, + "ewc_loss": 0.026598753407597542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3299376405484509e-05, + "grad_norm": 22.387243270874023, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.851824164390564, + "num_tokens": 123491526.0, + "step": 3236 + }, + { + "epoch": 0.41177967179748126, + "ewc_loss": 0.02657579630613327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3287898582348134e-05, + "grad_norm": 22.432865142822266, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8538752198219299, + "num_tokens": 123529743.0, + "step": 3237 + }, + { + "epoch": 0.41190688207607173, + "ewc_loss": 0.0266036968678236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3301848412083928e-05, + "grad_norm": 22.412200927734375, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8554577827453613, + "num_tokens": 123573402.0, + "step": 3238 + }, + { + "epoch": 
0.41203409235466226, + "ewc_loss": 0.02659996598958969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3299983038450591e-05, + "grad_norm": 22.409385681152344, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8484557271003723, + "num_tokens": 123604167.0, + "step": 3239 + }, + { + "epoch": 0.4121613026332528, + "ewc_loss": 0.026648027822375298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3324013707460836e-05, + "grad_norm": 22.394689559936523, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8491601943969727, + "num_tokens": 123648104.0, + "step": 3240 + }, + { + "epoch": 0.41228851291184326, + "ewc_loss": 0.02661219611763954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3306098480825312e-05, + "grad_norm": 22.45808219909668, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8412752151489258, + "num_tokens": 123689678.0, + "step": 3241 + }, + { + "epoch": 0.4124157231904338, + "ewc_loss": 0.026665834710001945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3332917660591193e-05, + "grad_norm": 22.40481185913086, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8598387241363525, + "num_tokens": 123729257.0, + "step": 3242 + }, + { + "epoch": 0.4125429334690243, + "ewc_loss": 0.026652062311768532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3326030966709368e-05, + "grad_norm": 22.491703033447266, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8544018864631653, + "num_tokens": 123772251.0, + "step": 3243 + }, + { + "epoch": 0.4126701437476148, + "ewc_loss": 0.026634298264980316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3317148841451854e-05, + "grad_norm": 22.44344711303711, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8657494187355042, + "num_tokens": 123810345.0, + "step": 3244 + }, + { + "epoch": 0.4127973540262053, + "ewc_loss": 0.026593757793307304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.329687893303344e-05, + "grad_norm": 22.418550491333008, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8488892912864685, + "num_tokens": 123855276.0, + "step": 3245 + }, + { + "epoch": 0.41292456430479585, + "ewc_loss": 0.026663681492209435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3331840818864293e-05, + "grad_norm": 22.54033088684082, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8554895520210266, + "num_tokens": 123896613.0, + "step": 3246 + }, + { + "epoch": 0.4130517745833863, + "ewc_loss": 0.026643969118595123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.332198462478118e-05, + "grad_norm": 22.29503631591797, + "learning_rate": 1e-06, + "loss": 0.5129, + "mean_token_accuracy": 0.8363194465637207, + "num_tokens": 123940713.0, + "step": 3247 + }, + { + "epoch": 0.41317898486197685, + "ewc_loss": 0.026575038209557533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3287519323057495e-05, + "grad_norm": 22.42802619934082, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8414850234985352, + "num_tokens": 123981485.0, + "step": 3248 + }, + { + "epoch": 0.4133061951405674, + "ewc_loss": 0.02667964994907379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3339825272851158e-05, + "grad_norm": 22.387672424316406, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8524039387702942, + "num_tokens": 124018159.0, + "step": 3249 + }, + { + "epoch": 0.41343340541915785, + "ewc_loss": 0.026610659435391426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3305329957802314e-05, + "grad_norm": 22.432857513427734, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8618102073669434, + "num_tokens": 124049911.0, + "step": 3250 + }, + { + "epoch": 0.4135606156977484, + "ewc_loss": 0.02667449787259102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3337248674361035e-05, + "grad_norm": 22.354047775268555, + "learning_rate": 1e-06, + "loss": 0.413, + 
"mean_token_accuracy": 0.8730697631835938, + "num_tokens": 124084260.0, + "step": 3251 + }, + { + "epoch": 0.4136878259763389, + "ewc_loss": 0.0266430526971817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3321526239451487e-05, + "grad_norm": 22.54786491394043, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8557860255241394, + "num_tokens": 124122741.0, + "step": 3252 + }, + { + "epoch": 0.4138150362549294, + "ewc_loss": 0.0266917385160923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3345868865144439e-05, + "grad_norm": 22.451244354248047, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8486539125442505, + "num_tokens": 124162793.0, + "step": 3253 + }, + { + "epoch": 0.4139422465335199, + "ewc_loss": 0.026589766144752502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.329488350165775e-05, + "grad_norm": 22.393535614013672, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8443983793258667, + "num_tokens": 124206889.0, + "step": 3254 + }, + { + "epoch": 0.41406945681211044, + "ewc_loss": 0.026681886985898018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3340943951334339e-05, + "grad_norm": 22.489768981933594, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8633995652198792, + "num_tokens": 124244551.0, + "step": 3255 + }, + { + "epoch": 0.4141966670907009, + "ewc_loss": 0.026580754667520523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3290376955410466e-05, + "grad_norm": 22.364599227905273, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8558932542800903, + "num_tokens": 124290485.0, + "step": 3256 + }, + { + "epoch": 0.41432387736929144, + "ewc_loss": 0.0266428105533123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.332140527665615e-05, + "grad_norm": 22.48136329650879, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8593929409980774, + "num_tokens": 124323763.0, + "step": 3257 + }, + { + "epoch": 
0.41445108764788197, + "ewc_loss": 0.026577334851026535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3288667105371132e-05, + "grad_norm": 22.38291358947754, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8532243371009827, + "num_tokens": 124363318.0, + "step": 3258 + }, + { + "epoch": 0.41457829792647244, + "ewc_loss": 0.02663506753742695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3317533557710703e-05, + "grad_norm": 22.413352966308594, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8760657906532288, + "num_tokens": 124400845.0, + "step": 3259 + }, + { + "epoch": 0.41470550820506297, + "ewc_loss": 0.026617035269737244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3308517736732028e-05, + "grad_norm": 22.37312126159668, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8586993217468262, + "num_tokens": 124442546.0, + "step": 3260 + }, + { + "epoch": 0.4148327184836535, + "ewc_loss": 0.02660685032606125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3303425475896802e-05, + "grad_norm": 22.39409065246582, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8480432629585266, + "num_tokens": 124477737.0, + "step": 3261 + }, + { + "epoch": 0.41495992876224397, + "ewc_loss": 0.02657731994986534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3288659829413518e-05, + "grad_norm": 22.274965286254883, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8389087915420532, + "num_tokens": 124518081.0, + "step": 3262 + }, + { + "epoch": 0.4150871390408345, + "ewc_loss": 0.026650363579392433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3325181498657912e-05, + "grad_norm": 22.427637100219727, + "learning_rate": 1e-06, + "loss": 0.498, + "mean_token_accuracy": 0.8423992991447449, + "num_tokens": 124556361.0, + "step": 3263 + }, + { + "epoch": 0.415214349319425, + "ewc_loss": 0.026745175942778587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3372588000493124e-05, + "grad_norm": 22.460880279541016, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8511105179786682, + "num_tokens": 124588490.0, + "step": 3264 + }, + { + "epoch": 0.4153415595980155, + "ewc_loss": 0.026667088270187378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3333544302440714e-05, + "grad_norm": 22.279247283935547, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8503362536430359, + "num_tokens": 124624656.0, + "step": 3265 + }, + { + "epoch": 0.415468769876606, + "ewc_loss": 0.026696713641285896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.334835724264849e-05, + "grad_norm": 22.424373626708984, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8388002514839172, + "num_tokens": 124660681.0, + "step": 3266 + }, + { + "epoch": 0.41559598015519655, + "ewc_loss": 0.026780402287840843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3390201274887659e-05, + "grad_norm": 22.30809211730957, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8532885313034058, + "num_tokens": 124703448.0, + "step": 3267 + }, + { + "epoch": 0.415723190433787, + "ewc_loss": 0.026694854721426964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3347427739063278e-05, + "grad_norm": 22.440324783325195, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8494951128959656, + "num_tokens": 124742078.0, + "step": 3268 + }, + { + "epoch": 0.41585040071237755, + "ewc_loss": 0.02674173377454281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.337086723651737e-05, + "grad_norm": 22.34703254699707, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8398677110671997, + "num_tokens": 124779894.0, + "step": 3269 + }, + { + "epoch": 0.4159776109909681, + "ewc_loss": 0.026726430281996727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3363214748096652e-05, + "grad_norm": 22.407672882080078, + "learning_rate": 1e-06, + "loss": 0.4878, + 
"mean_token_accuracy": 0.8475520014762878, + "num_tokens": 124817918.0, + "step": 3270 + }, + { + "epoch": 0.41610482126955856, + "ewc_loss": 0.02683200314640999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3416001820587553e-05, + "grad_norm": 22.413190841674805, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.868729293346405, + "num_tokens": 124854839.0, + "step": 3271 + }, + { + "epoch": 0.4162320315481491, + "ewc_loss": 0.02677854895591736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3389274499786552e-05, + "grad_norm": 22.4411678314209, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.836039662361145, + "num_tokens": 124886779.0, + "step": 3272 + }, + { + "epoch": 0.4163592418267396, + "ewc_loss": 0.026835652068257332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.341782626695931e-05, + "grad_norm": 22.45020866394043, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8609741926193237, + "num_tokens": 124927919.0, + "step": 3273 + }, + { + "epoch": 0.4164864521053301, + "ewc_loss": 0.026778077706694603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3389038940658793e-05, + "grad_norm": 22.38945770263672, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.850304126739502, + "num_tokens": 124966475.0, + "step": 3274 + }, + { + "epoch": 0.4166136623839206, + "ewc_loss": 0.026827339082956314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3413669876172207e-05, + "grad_norm": 22.526351928710938, + "learning_rate": 1e-06, + "loss": 0.5535, + "mean_token_accuracy": 0.8279210925102234, + "num_tokens": 125004110.0, + "step": 3275 + }, + { + "epoch": 0.41674087266251114, + "ewc_loss": 0.02681858465075493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3409292478172574e-05, + "grad_norm": 22.353473663330078, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8666523098945618, + "num_tokens": 125043164.0, + "step": 3276 + }, + { + "epoch": 
0.4168680829411016, + "ewc_loss": 0.02682582102715969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3412910448096227e-05, + "grad_norm": 22.47598648071289, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8554338216781616, + "num_tokens": 125084255.0, + "step": 3277 + }, + { + "epoch": 0.41699529321969214, + "ewc_loss": 0.026845620945096016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3422810297925025e-05, + "grad_norm": 22.311046600341797, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.865453839302063, + "num_tokens": 125124971.0, + "step": 3278 + }, + { + "epoch": 0.41712250349828267, + "ewc_loss": 0.02680530957877636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3402654985839035e-05, + "grad_norm": 22.610782623291016, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8483679294586182, + "num_tokens": 125161470.0, + "step": 3279 + }, + { + "epoch": 0.4172497137768732, + "ewc_loss": 0.02687656693160534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3438283531286288e-05, + "grad_norm": 22.508996963500977, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8363857865333557, + "num_tokens": 125199117.0, + "step": 3280 + }, + { + "epoch": 0.41737692405546367, + "ewc_loss": 0.026750262826681137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3375130947679281e-05, + "grad_norm": 22.43423080444336, + "learning_rate": 1e-06, + "loss": 0.5315, + "mean_token_accuracy": 0.8334833979606628, + "num_tokens": 125238136.0, + "step": 3281 + }, + { + "epoch": 0.4175041343340542, + "ewc_loss": 0.02680898830294609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.340449398412602e-05, + "grad_norm": 22.466552734375, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8497413396835327, + "num_tokens": 125281041.0, + "step": 3282 + }, + { + "epoch": 0.41763134461264473, + "ewc_loss": 0.026798386126756668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3399193449004088e-05, + "grad_norm": 22.552522659301758, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.854303240776062, + "num_tokens": 125314739.0, + "step": 3283 + }, + { + "epoch": 0.4177585548912352, + "ewc_loss": 0.026796288788318634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3398144801612943e-05, + "grad_norm": 22.456098556518555, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8451114296913147, + "num_tokens": 125353129.0, + "step": 3284 + }, + { + "epoch": 0.41788576516982573, + "ewc_loss": 0.026776207610964775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.338810398010537e-05, + "grad_norm": 22.353809356689453, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8511750102043152, + "num_tokens": 125394092.0, + "step": 3285 + }, + { + "epoch": 0.41801297544841626, + "ewc_loss": 0.026767486706376076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3383742953010369e-05, + "grad_norm": 22.530553817749023, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8450349569320679, + "num_tokens": 125438262.0, + "step": 3286 + }, + { + "epoch": 0.41814018572700673, + "ewc_loss": 0.02683800272643566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3419001334114e-05, + "grad_norm": 22.358142852783203, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8503563404083252, + "num_tokens": 125477842.0, + "step": 3287 + }, + { + "epoch": 0.41826739600559726, + "ewc_loss": 0.026769591495394707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3384796147875022e-05, + "grad_norm": 22.38859748840332, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8638830184936523, + "num_tokens": 125517236.0, + "step": 3288 + }, + { + "epoch": 0.4183946062841878, + "ewc_loss": 0.026837868615984917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3418934031506069e-05, + "grad_norm": 22.461204528808594, + "learning_rate": 1e-06, + "loss": 0.494, + 
"mean_token_accuracy": 0.8452624082565308, + "num_tokens": 125561598.0, + "step": 3289 + }, + { + "epoch": 0.41852181656277826, + "ewc_loss": 0.02678830362856388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.339415211987216e-05, + "grad_norm": 22.455759048461914, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8504990339279175, + "num_tokens": 125599104.0, + "step": 3290 + }, + { + "epoch": 0.4186490268413688, + "ewc_loss": 0.026785889640450478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3392945220402908e-05, + "grad_norm": 22.447599411010742, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.852258563041687, + "num_tokens": 125635286.0, + "step": 3291 + }, + { + "epoch": 0.4187762371199593, + "ewc_loss": 0.026785951107740402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3392975233728066e-05, + "grad_norm": 22.491207122802734, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8561923503875732, + "num_tokens": 125669534.0, + "step": 3292 + }, + { + "epoch": 0.4189034473985498, + "ewc_loss": 0.026790661737322807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3395330825005658e-05, + "grad_norm": 22.534364700317383, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8509820103645325, + "num_tokens": 125707966.0, + "step": 3293 + }, + { + "epoch": 0.4190306576771403, + "ewc_loss": 0.026735447347164154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3367724022828043e-05, + "grad_norm": 22.429845809936523, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8462613821029663, + "num_tokens": 125741448.0, + "step": 3294 + }, + { + "epoch": 0.41915786795573085, + "ewc_loss": 0.02670571580529213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3352858331927564e-05, + "grad_norm": 22.43688201904297, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.845696210861206, + "num_tokens": 125776887.0, + "step": 3295 + }, + { + "epoch": 
0.4192850782343213, + "ewc_loss": 0.026822475716471672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3411237887339666e-05, + "grad_norm": 22.41569709777832, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8569695353507996, + "num_tokens": 125807899.0, + "step": 3296 + }, + { + "epoch": 0.41941228851291185, + "ewc_loss": 0.026766931638121605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3383465557126328e-05, + "grad_norm": 22.347593307495117, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8602094650268555, + "num_tokens": 125847711.0, + "step": 3297 + }, + { + "epoch": 0.4195394987915024, + "ewc_loss": 0.02685460075736046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3427300473267678e-05, + "grad_norm": 22.5319766998291, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8646149039268494, + "num_tokens": 125883248.0, + "step": 3298 + }, + { + "epoch": 0.41966670907009285, + "ewc_loss": 0.026833031326532364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3416515685094055e-05, + "grad_norm": 22.349794387817383, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8393373489379883, + "num_tokens": 125925771.0, + "step": 3299 + }, + { + "epoch": 0.4197939193486834, + "ewc_loss": 0.02681552991271019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3407764527073596e-05, + "grad_norm": 22.413166046142578, + "learning_rate": 1e-06, + "loss": 0.509, + "mean_token_accuracy": 0.8381751179695129, + "num_tokens": 125968815.0, + "step": 3300 + }, + { + "epoch": 0.4199211296272739, + "ewc_loss": 0.02689727395772934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3448637218971271e-05, + "grad_norm": 22.515703201293945, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8463792204856873, + "num_tokens": 126007761.0, + "step": 3301 + }, + { + "epoch": 0.4200483399058644, + "ewc_loss": 0.026894306764006615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3447152923617978e-05, + "grad_norm": 22.471532821655273, + "learning_rate": 1e-06, + "loss": 0.5266, + "mean_token_accuracy": 0.8327826261520386, + "num_tokens": 126047872.0, + "step": 3302 + }, + { + "epoch": 0.4201755501844549, + "ewc_loss": 0.02684921585023403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.342460836895043e-05, + "grad_norm": 22.50484848022461, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8582628965377808, + "num_tokens": 126081273.0, + "step": 3303 + }, + { + "epoch": 0.42030276046304543, + "ewc_loss": 0.026882315054535866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.344115753454389e-05, + "grad_norm": 22.3754940032959, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8373126983642578, + "num_tokens": 126121590.0, + "step": 3304 + }, + { + "epoch": 0.4204299707416359, + "ewc_loss": 0.026828886941075325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3414443856163416e-05, + "grad_norm": 22.562246322631836, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8357744216918945, + "num_tokens": 126168396.0, + "step": 3305 + }, + { + "epoch": 0.42055718102022643, + "ewc_loss": 0.026877084746956825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3438542737276293e-05, + "grad_norm": 22.48528480529785, + "learning_rate": 1e-06, + "loss": 0.5017, + "mean_token_accuracy": 0.8479323387145996, + "num_tokens": 126205220.0, + "step": 3306 + }, + { + "epoch": 0.42068439129881696, + "ewc_loss": 0.026842908933758736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3421454241324682e-05, + "grad_norm": 22.51001739501953, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.845022439956665, + "num_tokens": 126239131.0, + "step": 3307 + }, + { + "epoch": 0.42081160157740743, + "ewc_loss": 0.026833970099687576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.341698498436017e-05, + "grad_norm": 22.4519100189209, + "learning_rate": 1e-06, + "loss": 0.4647, + 
"mean_token_accuracy": 0.8537706136703491, + "num_tokens": 126271931.0, + "step": 3308 + }, + { + "epoch": 0.42093881185599796, + "ewc_loss": 0.026833776384592056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3416888577921782e-05, + "grad_norm": 22.485639572143555, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8603694438934326, + "num_tokens": 126307730.0, + "step": 3309 + }, + { + "epoch": 0.4210660221345885, + "ewc_loss": 0.026889966800808907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.344498377875425e-05, + "grad_norm": 22.6145076751709, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8552321791648865, + "num_tokens": 126344528.0, + "step": 3310 + }, + { + "epoch": 0.42119323241317896, + "ewc_loss": 0.02682195045053959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3410975043370854e-05, + "grad_norm": 22.411401748657227, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8599854707717896, + "num_tokens": 126377260.0, + "step": 3311 + }, + { + "epoch": 0.4213204426917695, + "ewc_loss": 0.02686181105673313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3430905710265506e-05, + "grad_norm": 22.486221313476562, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8479588627815247, + "num_tokens": 126409723.0, + "step": 3312 + }, + { + "epoch": 0.42144765297036, + "ewc_loss": 0.026906898245215416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3453449355438352e-05, + "grad_norm": 22.577861785888672, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8551157712936401, + "num_tokens": 126450929.0, + "step": 3313 + }, + { + "epoch": 0.4215748632489505, + "ewc_loss": 0.026878219097852707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3439109352475498e-05, + "grad_norm": 22.491640090942383, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.861172080039978, + "num_tokens": 126486878.0, + "step": 3314 + }, + { + "epoch": 
0.421702073527541, + "ewc_loss": 0.026917774230241776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3458887224260252e-05, + "grad_norm": 22.614761352539062, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8397104740142822, + "num_tokens": 126522884.0, + "step": 3315 + }, + { + "epoch": 0.42182928380613155, + "ewc_loss": 0.026869120076298714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.343456005997723e-05, + "grad_norm": 22.50341033935547, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8539347648620605, + "num_tokens": 126560848.0, + "step": 3316 + }, + { + "epoch": 0.421956494084722, + "ewc_loss": 0.02685847319662571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3429236787487753e-05, + "grad_norm": 22.51819610595703, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8537729978561401, + "num_tokens": 126592947.0, + "step": 3317 + }, + { + "epoch": 0.42208370436331255, + "ewc_loss": 0.02694575861096382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3472878890752327e-05, + "grad_norm": 22.805465698242188, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8627091646194458, + "num_tokens": 126630631.0, + "step": 3318 + }, + { + "epoch": 0.4222109146419031, + "ewc_loss": 0.02687031216919422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3435155779006891e-05, + "grad_norm": 22.556535720825195, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8478468656539917, + "num_tokens": 126673395.0, + "step": 3319 + }, + { + "epoch": 0.42233812492049355, + "ewc_loss": 0.02684626914560795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3423134987533558e-05, + "grad_norm": 22.6224308013916, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8687200546264648, + "num_tokens": 126711504.0, + "step": 3320 + }, + { + "epoch": 0.4224653351990841, + "ewc_loss": 0.026904696598649025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3452347957354505e-05, + "grad_norm": 22.656696319580078, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8652362823486328, + "num_tokens": 126753217.0, + "step": 3321 + }, + { + "epoch": 0.4225925454776746, + "ewc_loss": 0.026794476434588432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3397238035395276e-05, + "grad_norm": 22.412843704223633, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8580490350723267, + "num_tokens": 126792982.0, + "step": 3322 + }, + { + "epoch": 0.4227197557562651, + "ewc_loss": 0.026852285489439964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3426142686512321e-05, + "grad_norm": 22.586111068725586, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8457423448562622, + "num_tokens": 126823771.0, + "step": 3323 + }, + { + "epoch": 0.4228469660348556, + "ewc_loss": 0.026905840262770653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.345292002952192e-05, + "grad_norm": 22.620481491088867, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8554147481918335, + "num_tokens": 126867137.0, + "step": 3324 + }, + { + "epoch": 0.42297417631344614, + "ewc_loss": 0.02683279849588871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3416399269772228e-05, + "grad_norm": 22.43891143798828, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.857437014579773, + "num_tokens": 126902680.0, + "step": 3325 + }, + { + "epoch": 0.4231013865920366, + "ewc_loss": 0.026850145310163498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3425073120743036e-05, + "grad_norm": 22.53150177001953, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8643144369125366, + "num_tokens": 126941037.0, + "step": 3326 + }, + { + "epoch": 0.42322859687062714, + "ewc_loss": 0.02685024030506611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3425120414467528e-05, + "grad_norm": 22.511812210083008, + "learning_rate": 1e-06, + "loss": 0.4577, + 
"mean_token_accuracy": 0.8562021255493164, + "num_tokens": 126980009.0, + "step": 3327 + }, + { + "epoch": 0.42335580714921767, + "ewc_loss": 0.026860086247324944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3430043509288225e-05, + "grad_norm": 22.560102462768555, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8646734356880188, + "num_tokens": 127020022.0, + "step": 3328 + }, + { + "epoch": 0.42348301742780814, + "ewc_loss": 0.02685725688934326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3428628335532267e-05, + "grad_norm": 22.537677764892578, + "learning_rate": 1e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.8328236937522888, + "num_tokens": 127060500.0, + "step": 3329 + }, + { + "epoch": 0.42361022770639867, + "ewc_loss": 0.026792509481310844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.339625487162266e-05, + "grad_norm": 22.50803565979004, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8489634394645691, + "num_tokens": 127095886.0, + "step": 3330 + }, + { + "epoch": 0.4237374379849892, + "ewc_loss": 0.026849299669265747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3424650205706712e-05, + "grad_norm": 22.559507369995117, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8440532088279724, + "num_tokens": 127134334.0, + "step": 3331 + }, + { + "epoch": 0.4238646482635797, + "ewc_loss": 0.0268546249717474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3427312296698801e-05, + "grad_norm": 22.458663940429688, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8446515798568726, + "num_tokens": 127174288.0, + "step": 3332 + }, + { + "epoch": 0.4239918585421702, + "ewc_loss": 0.026870442554354668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3435221262625419e-05, + "grad_norm": 22.538362503051758, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8498419523239136, + "num_tokens": 127210311.0, + "step": 3333 + }, + { + "epoch": 
0.4241190688207607, + "ewc_loss": 0.02684852108359337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3424260941974353e-05, + "grad_norm": 22.44795799255371, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8739594221115112, + "num_tokens": 127252813.0, + "step": 3334 + }, + { + "epoch": 0.42424627909935125, + "ewc_loss": 0.026859896257519722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.342994801234454e-05, + "grad_norm": 22.559988021850586, + "learning_rate": 1e-06, + "loss": 0.5073, + "mean_token_accuracy": 0.8398153781890869, + "num_tokens": 127285847.0, + "step": 3335 + }, + { + "epoch": 0.4243734893779417, + "ewc_loss": 0.026902562007308006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3451281120069325e-05, + "grad_norm": 22.564205169677734, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8670889735221863, + "num_tokens": 127325360.0, + "step": 3336 + }, + { + "epoch": 0.42450069965653225, + "ewc_loss": 0.026827776804566383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3413888154900633e-05, + "grad_norm": 22.45983123779297, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8569696545600891, + "num_tokens": 127366446.0, + "step": 3337 + }, + { + "epoch": 0.4246279099351228, + "ewc_loss": 0.02692185714840889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3460929039865732e-05, + "grad_norm": 22.62541389465332, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8621782064437866, + "num_tokens": 127402608.0, + "step": 3338 + }, + { + "epoch": 0.42475512021371326, + "ewc_loss": 0.026904569938778877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3452285202220082e-05, + "grad_norm": 22.520721435546875, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8579672574996948, + "num_tokens": 127445355.0, + "step": 3339 + }, + { + "epoch": 0.4248823304923038, + "ewc_loss": 0.02684544213116169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3422721167444251e-05, + "grad_norm": 22.532865524291992, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8485093116760254, + "num_tokens": 127480523.0, + "step": 3340 + }, + { + "epoch": 0.4250095407708943, + "ewc_loss": 0.02687721513211727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.343860731140012e-05, + "grad_norm": 22.44033432006836, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8726043105125427, + "num_tokens": 127520787.0, + "step": 3341 + }, + { + "epoch": 0.4251367510494848, + "ewc_loss": 0.02692367695271969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3461838534567505e-05, + "grad_norm": 22.477142333984375, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8481409549713135, + "num_tokens": 127565576.0, + "step": 3342 + }, + { + "epoch": 0.4252639613280753, + "ewc_loss": 0.026936780661344528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3468390534399077e-05, + "grad_norm": 22.49221420288086, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8631700873374939, + "num_tokens": 127599144.0, + "step": 3343 + }, + { + "epoch": 0.42539117160666584, + "ewc_loss": 0.026942890137434006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3471445527102333e-05, + "grad_norm": 22.459884643554688, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8721466064453125, + "num_tokens": 127637361.0, + "step": 3344 + }, + { + "epoch": 0.4255183818852563, + "ewc_loss": 0.026914672926068306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3457336535793729e-05, + "grad_norm": 22.501922607421875, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8432719111442566, + "num_tokens": 127675139.0, + "step": 3345 + }, + { + "epoch": 0.42564559216384684, + "ewc_loss": 0.02696416527032852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3482082977134269e-05, + "grad_norm": 22.53424835205078, + "learning_rate": 1e-06, + "loss": 0.4497, + 
"mean_token_accuracy": 0.8585691452026367, + "num_tokens": 127718822.0, + "step": 3346 + }, + { + "epoch": 0.42577280244243737, + "ewc_loss": 0.02695578709244728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3477893844537903e-05, + "grad_norm": 22.563474655151367, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8576676249504089, + "num_tokens": 127763332.0, + "step": 3347 + }, + { + "epoch": 0.42590001272102784, + "ewc_loss": 0.02697564661502838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.348782370769186e-05, + "grad_norm": 22.514217376708984, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8644071817398071, + "num_tokens": 127800346.0, + "step": 3348 + }, + { + "epoch": 0.42602722299961837, + "ewc_loss": 0.02690458670258522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3452293387672398e-05, + "grad_norm": 22.547075271606445, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8499231338500977, + "num_tokens": 127845142.0, + "step": 3349 + }, + { + "epoch": 0.4261544332782089, + "ewc_loss": 0.026971254497766495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3485627277987078e-05, + "grad_norm": 22.557241439819336, + "learning_rate": 1e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8354924917221069, + "num_tokens": 127881131.0, + "step": 3350 + }, + { + "epoch": 0.4262816435567994, + "ewc_loss": 0.026936164125800133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3468082215695176e-05, + "grad_norm": 22.754135131835938, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8670487403869629, + "num_tokens": 127917941.0, + "step": 3351 + }, + { + "epoch": 0.4264088538353899, + "ewc_loss": 0.026958031579852104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.347901616099989e-05, + "grad_norm": 22.593236923217773, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8389708399772644, + "num_tokens": 127963345.0, + "step": 3352 + }, + { + "epoch": 
0.42653606411398043, + "ewc_loss": 0.0268559567630291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3427978046820499e-05, + "grad_norm": 22.541675567626953, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.861640214920044, + "num_tokens": 128004839.0, + "step": 3353 + }, + { + "epoch": 0.4266632743925709, + "ewc_loss": 0.02691171132028103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3455855878419243e-05, + "grad_norm": 22.644596099853516, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8456588983535767, + "num_tokens": 128042709.0, + "step": 3354 + }, + { + "epoch": 0.42679048467116143, + "ewc_loss": 0.026860054582357407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3430027138383593e-05, + "grad_norm": 22.377992630004883, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.8383488059043884, + "num_tokens": 128085069.0, + "step": 3355 + }, + { + "epoch": 0.42691769494975196, + "ewc_loss": 0.026857007294893265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3428503734758124e-05, + "grad_norm": 22.560935974121094, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8472963571548462, + "num_tokens": 128124300.0, + "step": 3356 + }, + { + "epoch": 0.42704490522834243, + "ewc_loss": 0.02693825028836727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.346912540611811e-05, + "grad_norm": 22.512496948242188, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8507645130157471, + "num_tokens": 128164410.0, + "step": 3357 + }, + { + "epoch": 0.42717211550693296, + "ewc_loss": 0.026909155771136284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3454578038363252e-05, + "grad_norm": 22.515316009521484, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8634226322174072, + "num_tokens": 128206524.0, + "step": 3358 + }, + { + "epoch": 0.4272993257855235, + "ewc_loss": 0.026915881782770157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3457941349770408e-05, + "grad_norm": 22.69303321838379, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8444647192955017, + "num_tokens": 128246387.0, + "step": 3359 + }, + { + "epoch": 0.42742653606411396, + "ewc_loss": 0.026866678148508072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.343333951808745e-05, + "grad_norm": 22.434398651123047, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8634504079818726, + "num_tokens": 128287794.0, + "step": 3360 + }, + { + "epoch": 0.4275537463427045, + "ewc_loss": 0.02687956392765045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.343978237855481e-05, + "grad_norm": 22.622102737426758, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8500189781188965, + "num_tokens": 128328467.0, + "step": 3361 + }, + { + "epoch": 0.427680956621295, + "ewc_loss": 0.02689979039132595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3449895050143823e-05, + "grad_norm": 22.624929428100586, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8618137240409851, + "num_tokens": 128364319.0, + "step": 3362 + }, + { + "epoch": 0.4278081668998855, + "ewc_loss": 0.02688228152692318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3441141163639259e-05, + "grad_norm": 22.522531509399414, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8474589586257935, + "num_tokens": 128403892.0, + "step": 3363 + }, + { + "epoch": 0.427935377178476, + "ewc_loss": 0.026881663128733635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3440831935440656e-05, + "grad_norm": 22.467208862304688, + "learning_rate": 1e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8330491781234741, + "num_tokens": 128446760.0, + "step": 3364 + }, + { + "epoch": 0.42806258745706655, + "ewc_loss": 0.02695699781179428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3478498658514582e-05, + "grad_norm": 22.562902450561523, + "learning_rate": 1e-06, + "loss": 0.5447, + 
"mean_token_accuracy": 0.8287475109100342, + "num_tokens": 128485498.0, + "step": 3365 + }, + { + "epoch": 0.428189797735657, + "ewc_loss": 0.026917416602373123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3458708053804003e-05, + "grad_norm": 22.546401977539062, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8414825201034546, + "num_tokens": 128527871.0, + "step": 3366 + }, + { + "epoch": 0.42831700801424755, + "ewc_loss": 0.02684934437274933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3424672033579554e-05, + "grad_norm": 22.536457061767578, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8388738036155701, + "num_tokens": 128567792.0, + "step": 3367 + }, + { + "epoch": 0.4284442182928381, + "ewc_loss": 0.026937609538435936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3468804354488384e-05, + "grad_norm": 22.498027801513672, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8433960676193237, + "num_tokens": 128607217.0, + "step": 3368 + }, + { + "epoch": 0.42857142857142855, + "ewc_loss": 0.02693891152739525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3469455552694853e-05, + "grad_norm": 22.556827545166016, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8582415580749512, + "num_tokens": 128642787.0, + "step": 3369 + }, + { + "epoch": 0.4286986388500191, + "ewc_loss": 0.026924312114715576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3462155948218424e-05, + "grad_norm": 22.465978622436523, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8638606667518616, + "num_tokens": 128680462.0, + "step": 3370 + }, + { + "epoch": 0.4288258491286096, + "ewc_loss": 0.026942795142531395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3471397323883139e-05, + "grad_norm": 22.766014099121094, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8491179943084717, + "num_tokens": 128715101.0, + "step": 3371 + }, + { + "epoch": 
0.4289530594072001, + "ewc_loss": 0.027006207033991814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3503103218681645e-05, + "grad_norm": 22.660263061523438, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8445864915847778, + "num_tokens": 128749601.0, + "step": 3372 + }, + { + "epoch": 0.4290802696857906, + "ewc_loss": 0.026958439499139786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3479219887813088e-05, + "grad_norm": 22.640764236450195, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8481057286262512, + "num_tokens": 128788744.0, + "step": 3373 + }, + { + "epoch": 0.42920747996438113, + "ewc_loss": 0.026916608214378357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3458304238156416e-05, + "grad_norm": 22.332225799560547, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8480709195137024, + "num_tokens": 128828102.0, + "step": 3374 + }, + { + "epoch": 0.4293346902429716, + "ewc_loss": 0.026969674974679947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3484837836585939e-05, + "grad_norm": 22.95816993713379, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8472387790679932, + "num_tokens": 128863610.0, + "step": 3375 + }, + { + "epoch": 0.42946190052156213, + "ewc_loss": 0.02705635130405426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3528175259125419e-05, + "grad_norm": 22.572872161865234, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8662040829658508, + "num_tokens": 128898836.0, + "step": 3376 + }, + { + "epoch": 0.42958911080015266, + "ewc_loss": 0.026846595108509064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3423297787085176e-05, + "grad_norm": 22.47837257385254, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.853897750377655, + "num_tokens": 128934821.0, + "step": 3377 + }, + { + "epoch": 0.42971632107874314, + "ewc_loss": 0.027029331773519516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3514665624825284e-05, + "grad_norm": 22.814990997314453, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8507965207099915, + "num_tokens": 128966914.0, + "step": 3378 + }, + { + "epoch": 0.42984353135733366, + "ewc_loss": 0.026946498081088066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3473249055095948e-05, + "grad_norm": 22.36823272705078, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8586180210113525, + "num_tokens": 129008305.0, + "step": 3379 + }, + { + "epoch": 0.4299707416359242, + "ewc_loss": 0.026889115571975708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.344455813523382e-05, + "grad_norm": 22.48173713684082, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8728814125061035, + "num_tokens": 129043331.0, + "step": 3380 + }, + { + "epoch": 0.4300979519145147, + "ewc_loss": 0.02710023894906044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3550119547289796e-05, + "grad_norm": 22.625228881835938, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8579446077346802, + "num_tokens": 129083513.0, + "step": 3381 + }, + { + "epoch": 0.4302251621931052, + "ewc_loss": 0.027030901983380318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3515451428247616e-05, + "grad_norm": 22.563417434692383, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8514046669006348, + "num_tokens": 129113463.0, + "step": 3382 + }, + { + "epoch": 0.4303523724716957, + "ewc_loss": 0.027054540812969208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3527270311897155e-05, + "grad_norm": 22.515213012695312, + "learning_rate": 1e-06, + "loss": 0.5334, + "mean_token_accuracy": 0.8284589052200317, + "num_tokens": 129153080.0, + "step": 3383 + }, + { + "epoch": 0.43047958275028625, + "ewc_loss": 0.027121996507048607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3560998013417702e-05, + "grad_norm": 22.578292846679688, + "learning_rate": 1e-06, + "loss": 0.4679, + 
"mean_token_accuracy": 0.8487343192100525, + "num_tokens": 129193338.0, + "step": 3384 + }, + { + "epoch": 0.4306067930288767, + "ewc_loss": 0.027073826640844345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3536913684220053e-05, + "grad_norm": 22.48371124267578, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8431857824325562, + "num_tokens": 129234816.0, + "step": 3385 + }, + { + "epoch": 0.43073400330746725, + "ewc_loss": 0.02706853114068508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.353426523564849e-05, + "grad_norm": 22.597026824951172, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.854489803314209, + "num_tokens": 129275228.0, + "step": 3386 + }, + { + "epoch": 0.4308612135860578, + "ewc_loss": 0.027126649394631386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3563324500864837e-05, + "grad_norm": 22.5118465423584, + "learning_rate": 1e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.8351907730102539, + "num_tokens": 129309747.0, + "step": 3387 + }, + { + "epoch": 0.43098842386464825, + "ewc_loss": 0.027071518823504448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3535759535443503e-05, + "grad_norm": 22.574125289916992, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.854568362236023, + "num_tokens": 129342580.0, + "step": 3388 + }, + { + "epoch": 0.4311156341432388, + "ewc_loss": 0.027127347886562347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3563673746830318e-05, + "grad_norm": 22.469173431396484, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8470464944839478, + "num_tokens": 129376599.0, + "step": 3389 + }, + { + "epoch": 0.4312428444218293, + "ewc_loss": 0.02712962031364441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3564809705712833e-05, + "grad_norm": 22.625608444213867, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8510191440582275, + "num_tokens": 129414783.0, + "step": 3390 + }, + { + "epoch": 
0.4313700547004198, + "ewc_loss": 0.02710689790546894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3553449207392987e-05, + "grad_norm": 22.471139907836914, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8515698909759521, + "num_tokens": 129450287.0, + "step": 3391 + }, + { + "epoch": 0.4314972649790103, + "ewc_loss": 0.02711821161210537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3559105354943313e-05, + "grad_norm": 22.590106964111328, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8348679542541504, + "num_tokens": 129486733.0, + "step": 3392 + }, + { + "epoch": 0.43162447525760084, + "ewc_loss": 0.027181869372725487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.359093494102126e-05, + "grad_norm": 22.586915969848633, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8527283072471619, + "num_tokens": 129524392.0, + "step": 3393 + }, + { + "epoch": 0.4317516855361913, + "ewc_loss": 0.02717340923845768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3586704881163314e-05, + "grad_norm": 22.575777053833008, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8395531177520752, + "num_tokens": 129558452.0, + "step": 3394 + }, + { + "epoch": 0.43187889581478184, + "ewc_loss": 0.027186742052435875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.359337147732731e-05, + "grad_norm": 22.552684783935547, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8507169485092163, + "num_tokens": 129597076.0, + "step": 3395 + }, + { + "epoch": 0.43200610609337237, + "ewc_loss": 0.027161842212080956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.358092140435474e-05, + "grad_norm": 22.51546287536621, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8446522951126099, + "num_tokens": 129635861.0, + "step": 3396 + }, + { + "epoch": 0.43213331637196284, + "ewc_loss": 0.02724144421517849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3620721801999025e-05, + "grad_norm": 22.713481903076172, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8472731113433838, + "num_tokens": 129673760.0, + "step": 3397 + }, + { + "epoch": 0.43226052665055337, + "ewc_loss": 0.02715279720723629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3576398487202823e-05, + "grad_norm": 22.440114974975586, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8534584045410156, + "num_tokens": 129710730.0, + "step": 3398 + }, + { + "epoch": 0.4323877369291439, + "ewc_loss": 0.027214137837290764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.360706846753601e-05, + "grad_norm": 22.63673210144043, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8483842015266418, + "num_tokens": 129748724.0, + "step": 3399 + }, + { + "epoch": 0.43251494720773437, + "ewc_loss": 0.027243265882134438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3621633115690202e-05, + "grad_norm": 22.578012466430664, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8643542528152466, + "num_tokens": 129787478.0, + "step": 3400 + }, + { + "epoch": 0.4326421574863249, + "ewc_loss": 0.027207138016819954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3603568731923588e-05, + "grad_norm": 22.578428268432617, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8619245290756226, + "num_tokens": 129829852.0, + "step": 3401 + }, + { + "epoch": 0.4327693677649154, + "ewc_loss": 0.02719874121248722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3599370504380204e-05, + "grad_norm": 22.577550888061523, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8514067530632019, + "num_tokens": 129869279.0, + "step": 3402 + }, + { + "epoch": 0.4328965780435059, + "ewc_loss": 0.02716381847858429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3581909115600865e-05, + "grad_norm": 22.684141159057617, + "learning_rate": 1e-06, + "loss": 0.4855, + 
"mean_token_accuracy": 0.8468924760818481, + "num_tokens": 129903347.0, + "step": 3403 + }, + { + "epoch": 0.4330237883220964, + "ewc_loss": 0.027172362431883812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3586181012215093e-05, + "grad_norm": 22.61931610107422, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8625500202178955, + "num_tokens": 129940028.0, + "step": 3404 + }, + { + "epoch": 0.43315099860068695, + "ewc_loss": 0.027205271646380424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3602635590359569e-05, + "grad_norm": 22.54978370666504, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8615729212760925, + "num_tokens": 129976438.0, + "step": 3405 + }, + { + "epoch": 0.4332782088792774, + "ewc_loss": 0.027184324339032173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3592161849373952e-05, + "grad_norm": 22.589792251586914, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8574837446212769, + "num_tokens": 130015698.0, + "step": 3406 + }, + { + "epoch": 0.43340541915786795, + "ewc_loss": 0.027198877185583115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3599438716482837e-05, + "grad_norm": 22.691110610961914, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8667590618133545, + "num_tokens": 130056000.0, + "step": 3407 + }, + { + "epoch": 0.4335326294364585, + "ewc_loss": 0.027188267558813095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3594133633887395e-05, + "grad_norm": 22.489442825317383, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8495900630950928, + "num_tokens": 130096990.0, + "step": 3408 + }, + { + "epoch": 0.43365983971504896, + "ewc_loss": 0.02717343345284462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3586716704594437e-05, + "grad_norm": 22.64965057373047, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8618911504745483, + "num_tokens": 130128556.0, + "step": 3409 + }, + { + "epoch": 
0.4337870499936395, + "ewc_loss": 0.02720525674521923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3602628314401954e-05, + "grad_norm": 22.533601760864258, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8582756519317627, + "num_tokens": 130170607.0, + "step": 3410 + }, + { + "epoch": 0.43391426027223, + "ewc_loss": 0.02718420885503292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.359210455120774e-05, + "grad_norm": 22.63987159729004, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8473669290542603, + "num_tokens": 130212511.0, + "step": 3411 + }, + { + "epoch": 0.4340414705508205, + "ewc_loss": 0.027188647538423538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3594323718280066e-05, + "grad_norm": 22.71796226501465, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8442861437797546, + "num_tokens": 130249246.0, + "step": 3412 + }, + { + "epoch": 0.434168680829411, + "ewc_loss": 0.02716650627553463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3583253348770086e-05, + "grad_norm": 22.638513565063477, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.872981071472168, + "num_tokens": 130291250.0, + "step": 3413 + }, + { + "epoch": 0.43429589110800154, + "ewc_loss": 0.027116797864437103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3558398677560035e-05, + "grad_norm": 22.63021469116211, + "learning_rate": 1e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8318477272987366, + "num_tokens": 130327094.0, + "step": 3414 + }, + { + "epoch": 0.434423101386592, + "ewc_loss": 0.0271418746560812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3570936971518677e-05, + "grad_norm": 22.48743438720703, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8444192409515381, + "num_tokens": 130366482.0, + "step": 3415 + }, + { + "epoch": 0.43455031166518254, + "ewc_loss": 0.027143696323037148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3571848285209853e-05, + "grad_norm": 22.676002502441406, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8525441884994507, + "num_tokens": 130407416.0, + "step": 3416 + }, + { + "epoch": 0.43467752194377307, + "ewc_loss": 0.027194594964385033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3597297765954863e-05, + "grad_norm": 22.632173538208008, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8583123683929443, + "num_tokens": 130447125.0, + "step": 3417 + }, + { + "epoch": 0.43480473222236354, + "ewc_loss": 0.027104364708065987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3552182281273417e-05, + "grad_norm": 22.537660598754883, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8580529689788818, + "num_tokens": 130480955.0, + "step": 3418 + }, + { + "epoch": 0.43493194250095407, + "ewc_loss": 0.02714974246919155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3574871445598546e-05, + "grad_norm": 22.644582748413086, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.864252507686615, + "num_tokens": 130511771.0, + "step": 3419 + }, + { + "epoch": 0.4350591527795446, + "ewc_loss": 0.02716945670545101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3584728549176361e-05, + "grad_norm": 22.68471908569336, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8512462377548218, + "num_tokens": 130550080.0, + "step": 3420 + }, + { + "epoch": 0.4351863630581351, + "ewc_loss": 0.027164043858647346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3582021892943885e-05, + "grad_norm": 22.693809509277344, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8413456678390503, + "num_tokens": 130588109.0, + "step": 3421 + }, + { + "epoch": 0.4353135733367256, + "ewc_loss": 0.027184082195162773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3592040886578616e-05, + "grad_norm": 22.55568504333496, + "learning_rate": 1e-06, + "loss": 0.5268, + 
"mean_token_accuracy": 0.8377912640571594, + "num_tokens": 130626832.0, + "step": 3422 + }, + { + "epoch": 0.43544078361531613, + "ewc_loss": 0.027183735743165016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3591868082585279e-05, + "grad_norm": 22.730125427246094, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8569183349609375, + "num_tokens": 130662202.0, + "step": 3423 + }, + { + "epoch": 0.4355679938939066, + "ewc_loss": 0.027241146191954613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3620573554362636e-05, + "grad_norm": 22.662002563476562, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8472227454185486, + "num_tokens": 130700747.0, + "step": 3424 + }, + { + "epoch": 0.43569520417249713, + "ewc_loss": 0.027137689292430878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3568844224209897e-05, + "grad_norm": 22.6018123626709, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8501932621002197, + "num_tokens": 130742849.0, + "step": 3425 + }, + { + "epoch": 0.43582241445108766, + "ewc_loss": 0.027229083701968193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3614541785500478e-05, + "grad_norm": 22.633113861083984, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8660778403282166, + "num_tokens": 130780156.0, + "step": 3426 + }, + { + "epoch": 0.43594962472967813, + "ewc_loss": 0.02716490998864174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3582454812421929e-05, + "grad_norm": 22.61375617980957, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8524261713027954, + "num_tokens": 130825824.0, + "step": 3427 + }, + { + "epoch": 0.43607683500826866, + "ewc_loss": 0.027205435559153557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3602717444882728e-05, + "grad_norm": 22.61766242980957, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8611421585083008, + "num_tokens": 130863603.0, + "step": 3428 + }, + { + "epoch": 
0.4362040452868592, + "ewc_loss": 0.027210576459765434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.360528858640464e-05, + "grad_norm": 22.68413734436035, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8547743558883667, + "num_tokens": 130906689.0, + "step": 3429 + }, + { + "epoch": 0.4363312555654497, + "ewc_loss": 0.027178766205906868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3589383343060035e-05, + "grad_norm": 22.68773078918457, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8493258357048035, + "num_tokens": 130947785.0, + "step": 3430 + }, + { + "epoch": 0.4364584658440402, + "ewc_loss": 0.027148183435201645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3574092008639127e-05, + "grad_norm": 22.56182098388672, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8635390400886536, + "num_tokens": 130985489.0, + "step": 3431 + }, + { + "epoch": 0.4365856761226307, + "ewc_loss": 0.027158012613654137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3579006008512806e-05, + "grad_norm": 22.671722412109375, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8624663352966309, + "num_tokens": 131025274.0, + "step": 3432 + }, + { + "epoch": 0.43671288640122125, + "ewc_loss": 0.027185222133994102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3592611139756627e-05, + "grad_norm": 22.59327507019043, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8519155979156494, + "num_tokens": 131060132.0, + "step": 3433 + }, + { + "epoch": 0.4368400966798117, + "ewc_loss": 0.027124473825097084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3562236745201517e-05, + "grad_norm": 22.68365478515625, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8422233462333679, + "num_tokens": 131105109.0, + "step": 3434 + }, + { + "epoch": 0.43696730695840225, + "ewc_loss": 0.027169885113835335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3584942280431278e-05, + "grad_norm": 22.66034507751465, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8669091463088989, + "num_tokens": 131142094.0, + "step": 3435 + }, + { + "epoch": 0.4370945172369928, + "ewc_loss": 0.027131827548146248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3565913832280785e-05, + "grad_norm": 22.644309997558594, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8520771861076355, + "num_tokens": 131174648.0, + "step": 3436 + }, + { + "epoch": 0.43722172751558325, + "ewc_loss": 0.027140671387314796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3570335795520805e-05, + "grad_norm": 22.61746597290039, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8689206838607788, + "num_tokens": 131213447.0, + "step": 3437 + }, + { + "epoch": 0.4373489377941738, + "ewc_loss": 0.02716347947716713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3581739949586336e-05, + "grad_norm": 22.69280242919922, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8498279452323914, + "num_tokens": 131249622.0, + "step": 3438 + }, + { + "epoch": 0.4374761480727643, + "ewc_loss": 0.027140555903315544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3570277587859891e-05, + "grad_norm": 22.613052368164062, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.869404673576355, + "num_tokens": 131285883.0, + "step": 3439 + }, + { + "epoch": 0.4376033583513548, + "ewc_loss": 0.027162030339241028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3581015082309023e-05, + "grad_norm": 22.55985450744629, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8574863076210022, + "num_tokens": 131322774.0, + "step": 3440 + }, + { + "epoch": 0.4377305686299453, + "ewc_loss": 0.02712295576930046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3561478226620238e-05, + "grad_norm": 22.472570419311523, + "learning_rate": 1e-06, + "loss": 0.4297, + 
"mean_token_accuracy": 0.8653565645217896, + "num_tokens": 131364513.0, + "step": 3441 + }, + { + "epoch": 0.43785777890853583, + "ewc_loss": 0.027210082858800888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3605041203845758e-05, + "grad_norm": 22.702590942382812, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8442221879959106, + "num_tokens": 131402299.0, + "step": 3442 + }, + { + "epoch": 0.4379849891871263, + "ewc_loss": 0.027243971824645996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.362198599963449e-05, + "grad_norm": 22.479095458984375, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.860663652420044, + "num_tokens": 131447830.0, + "step": 3443 + }, + { + "epoch": 0.43811219946571683, + "ewc_loss": 0.027229974046349525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3614987437904347e-05, + "grad_norm": 22.731027603149414, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.846872866153717, + "num_tokens": 131481058.0, + "step": 3444 + }, + { + "epoch": 0.43823940974430736, + "ewc_loss": 0.02732696197926998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3663480785908177e-05, + "grad_norm": 22.63129997253418, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8490749597549438, + "num_tokens": 131518376.0, + "step": 3445 + }, + { + "epoch": 0.43836662002289783, + "ewc_loss": 0.027220305055379868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3610152564069722e-05, + "grad_norm": 22.712121963500977, + "learning_rate": 1e-06, + "loss": 0.5323, + "mean_token_accuracy": 0.8380008339881897, + "num_tokens": 131553702.0, + "step": 3446 + }, + { + "epoch": 0.43849383030148836, + "ewc_loss": 0.027289623394608498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3644811588164885e-05, + "grad_norm": 22.721200942993164, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8622718453407288, + "num_tokens": 131587026.0, + "step": 3447 + }, + { + "epoch": 
0.4386210405800789, + "ewc_loss": 0.027154898270964622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3577448953583371e-05, + "grad_norm": 22.471033096313477, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8591514229774475, + "num_tokens": 131621145.0, + "step": 3448 + }, + { + "epoch": 0.43874825085866936, + "ewc_loss": 0.027271030470728874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3635515642818063e-05, + "grad_norm": 22.753408432006836, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8595797419548035, + "num_tokens": 131661618.0, + "step": 3449 + }, + { + "epoch": 0.4388754611372599, + "ewc_loss": 0.027241164818406105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3620582649309654e-05, + "grad_norm": 22.603715896606445, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.846470832824707, + "num_tokens": 131693840.0, + "step": 3450 + }, + { + "epoch": 0.4390026714158504, + "ewc_loss": 0.027276642620563507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3638321433973033e-05, + "grad_norm": 22.76517105102539, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8559792637825012, + "num_tokens": 131730856.0, + "step": 3451 + }, + { + "epoch": 0.4391298816944409, + "ewc_loss": 0.027256418019533157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.362820876238402e-05, + "grad_norm": 22.6699275970459, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8302383422851562, + "num_tokens": 131769172.0, + "step": 3452 + }, + { + "epoch": 0.4392570919730314, + "ewc_loss": 0.027251502498984337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.362575130769983e-05, + "grad_norm": 22.68304443359375, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8560203313827515, + "num_tokens": 131810481.0, + "step": 3453 + }, + { + "epoch": 0.43938430225162195, + "ewc_loss": 0.02727249264717102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3636245967063587e-05, + "grad_norm": 22.647287368774414, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8548033833503723, + "num_tokens": 131850742.0, + "step": 3454 + }, + { + "epoch": 0.4395115125302124, + "ewc_loss": 0.027276407927274704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3638204109156504e-05, + "grad_norm": 22.671653747558594, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.852800190448761, + "num_tokens": 131886675.0, + "step": 3455 + }, + { + "epoch": 0.43963872280880295, + "ewc_loss": 0.0272898580878973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3644928912981413e-05, + "grad_norm": 22.547334671020508, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8584032654762268, + "num_tokens": 131926831.0, + "step": 3456 + }, + { + "epoch": 0.4397659330873935, + "ewc_loss": 0.027238421142101288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3619210221804678e-05, + "grad_norm": 22.631067276000977, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8522378206253052, + "num_tokens": 131962265.0, + "step": 3457 + }, + { + "epoch": 0.43989314336598395, + "ewc_loss": 0.02734353207051754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3671766282641329e-05, + "grad_norm": 22.7181339263916, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.847588300704956, + "num_tokens": 131999906.0, + "step": 3458 + }, + { + "epoch": 0.4400203536445745, + "ewc_loss": 0.0273310337215662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3665517144545447e-05, + "grad_norm": 22.60114097595215, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8686631917953491, + "num_tokens": 132031330.0, + "step": 3459 + }, + { + "epoch": 0.440147563923165, + "ewc_loss": 0.027285223826766014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3642611520481296e-05, + "grad_norm": 22.631324768066406, + "learning_rate": 1e-06, + "loss": 0.4717, + 
"mean_token_accuracy": 0.8529613018035889, + "num_tokens": 132065049.0, + "step": 3460 + }, + { + "epoch": 0.4402747742017555, + "ewc_loss": 0.02732851915061474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3664259313372895e-05, + "grad_norm": 22.659923553466797, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8567991852760315, + "num_tokens": 132100993.0, + "step": 3461 + }, + { + "epoch": 0.440401984480346, + "ewc_loss": 0.027324514463543892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3662257515534293e-05, + "grad_norm": 22.695852279663086, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8518645763397217, + "num_tokens": 132138169.0, + "step": 3462 + }, + { + "epoch": 0.44052919475893654, + "ewc_loss": 0.027339892461895943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3669946383743081e-05, + "grad_norm": 22.65635108947754, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8612441420555115, + "num_tokens": 132170498.0, + "step": 3463 + }, + { + "epoch": 0.440656405037527, + "ewc_loss": 0.027337679639458656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3668839528691024e-05, + "grad_norm": 22.70207405090332, + "learning_rate": 1e-06, + "loss": 0.5255, + "mean_token_accuracy": 0.8349165916442871, + "num_tokens": 132213583.0, + "step": 3464 + }, + { + "epoch": 0.44078361531611754, + "ewc_loss": 0.027332374826073647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3666187442140654e-05, + "grad_norm": 22.62296485900879, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8489141464233398, + "num_tokens": 132255427.0, + "step": 3465 + }, + { + "epoch": 0.44091082559470807, + "ewc_loss": 0.02731207385659218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3656036571774166e-05, + "grad_norm": 22.634531021118164, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8519220948219299, + "num_tokens": 132295795.0, + "step": 3466 + }, + { + "epoch": 
0.44103803587329854, + "ewc_loss": 0.027357323095202446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.367866116197547e-05, + "grad_norm": 22.677881240844727, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8357294797897339, + "num_tokens": 132336881.0, + "step": 3467 + }, + { + "epoch": 0.44116524615188907, + "ewc_loss": 0.027339493855834007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3669747204403393e-05, + "grad_norm": 22.70232582092285, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8542087078094482, + "num_tokens": 132375136.0, + "step": 3468 + }, + { + "epoch": 0.4412924564304796, + "ewc_loss": 0.02730691432952881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3653457244799938e-05, + "grad_norm": 22.517974853515625, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8516684770584106, + "num_tokens": 132412893.0, + "step": 3469 + }, + { + "epoch": 0.44141966670907007, + "ewc_loss": 0.027401577681303024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3700789168069605e-05, + "grad_norm": 22.80364418029785, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8460309505462646, + "num_tokens": 132444360.0, + "step": 3470 + }, + { + "epoch": 0.4415468769876606, + "ewc_loss": 0.02736249379813671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.368124685541261e-05, + "grad_norm": 22.547744750976562, + "learning_rate": 1e-06, + "loss": 0.5668, + "mean_token_accuracy": 0.8255815505981445, + "num_tokens": 132485022.0, + "step": 3471 + }, + { + "epoch": 0.4416740872662511, + "ewc_loss": 0.02734648622572422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3673243302037008e-05, + "grad_norm": 22.715131759643555, + "learning_rate": 1e-06, + "loss": 0.5364, + "mean_token_accuracy": 0.838870644569397, + "num_tokens": 132522800.0, + "step": 3472 + }, + { + "epoch": 0.4418012975448416, + "ewc_loss": 0.02738775499165058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3693877917830832e-05, + "grad_norm": 22.620014190673828, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8554500341415405, + "num_tokens": 132560527.0, + "step": 3473 + }, + { + "epoch": 0.4419285078234321, + "ewc_loss": 0.027383262291550636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3691631465917453e-05, + "grad_norm": 22.596641540527344, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8507587909698486, + "num_tokens": 132601075.0, + "step": 3474 + }, + { + "epoch": 0.44205571810202265, + "ewc_loss": 0.02741226926445961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3706134268431924e-05, + "grad_norm": 22.632566452026367, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8436256647109985, + "num_tokens": 132639851.0, + "step": 3475 + }, + { + "epoch": 0.4421829283806131, + "ewc_loss": 0.027390198782086372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3695099369215313e-05, + "grad_norm": 22.59105110168457, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8548314571380615, + "num_tokens": 132679849.0, + "step": 3476 + }, + { + "epoch": 0.44231013865920366, + "ewc_loss": 0.027453426271677017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.372671340504894e-05, + "grad_norm": 22.70851707458496, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8569810390472412, + "num_tokens": 132721020.0, + "step": 3477 + }, + { + "epoch": 0.4424373489377942, + "ewc_loss": 0.02742244116961956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3711220162804238e-05, + "grad_norm": 22.58124542236328, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8396933078765869, + "num_tokens": 132760933.0, + "step": 3478 + }, + { + "epoch": 0.44256455921638466, + "ewc_loss": 0.027406120672822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3703060176339932e-05, + "grad_norm": 22.65452003479004, + "learning_rate": 1e-06, + "loss": 0.4277, + 
"mean_token_accuracy": 0.8633326888084412, + "num_tokens": 132800627.0, + "step": 3479 + }, + { + "epoch": 0.4426917694949752, + "ewc_loss": 0.027459707111120224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3729853890254162e-05, + "grad_norm": 22.604719161987305, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8507567644119263, + "num_tokens": 132833232.0, + "step": 3480 + }, + { + "epoch": 0.4428189797735657, + "ewc_loss": 0.027463139966130257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3731570106756408e-05, + "grad_norm": 22.689281463623047, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.85111403465271, + "num_tokens": 132874670.0, + "step": 3481 + }, + { + "epoch": 0.44294619005215624, + "ewc_loss": 0.027477489784359932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3738745110458694e-05, + "grad_norm": 22.674053192138672, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8548333644866943, + "num_tokens": 132916187.0, + "step": 3482 + }, + { + "epoch": 0.4430734003307467, + "ewc_loss": 0.02740330621600151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3701653188036289e-05, + "grad_norm": 22.650333404541016, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8558310270309448, + "num_tokens": 132956948.0, + "step": 3483 + }, + { + "epoch": 0.44320061060933724, + "ewc_loss": 0.027391614392399788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3695806956093293e-05, + "grad_norm": 22.558300018310547, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8587204217910767, + "num_tokens": 132995213.0, + "step": 3484 + }, + { + "epoch": 0.44332782088792777, + "ewc_loss": 0.027423648163676262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3711824067286216e-05, + "grad_norm": 22.648963928222656, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8497995138168335, + "num_tokens": 133038850.0, + "step": 3485 + }, + { + "epoch": 
0.44345503116651824, + "ewc_loss": 0.02746521681547165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3732608749705832e-05, + "grad_norm": 22.67340087890625, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8549647331237793, + "num_tokens": 133080766.0, + "step": 3486 + }, + { + "epoch": 0.44358224144510877, + "ewc_loss": 0.027418000623583794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.370900008623721e-05, + "grad_norm": 22.606534957885742, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8494254946708679, + "num_tokens": 133120845.0, + "step": 3487 + }, + { + "epoch": 0.4437094517236993, + "ewc_loss": 0.027381371706724167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.369068559142761e-05, + "grad_norm": 22.56361198425293, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.850954532623291, + "num_tokens": 133166011.0, + "step": 3488 + }, + { + "epoch": 0.4438366620022898, + "ewc_loss": 0.027404271066188812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3702135220228229e-05, + "grad_norm": 22.624820709228516, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8519060611724854, + "num_tokens": 133203507.0, + "step": 3489 + }, + { + "epoch": 0.4439638722808803, + "ewc_loss": 0.027439018711447716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3719509297516197e-05, + "grad_norm": 22.694473266601562, + "learning_rate": 1e-06, + "loss": 0.5399, + "mean_token_accuracy": 0.8296937346458435, + "num_tokens": 133240554.0, + "step": 3490 + }, + { + "epoch": 0.44409108255947083, + "ewc_loss": 0.027432525530457497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3716262401430868e-05, + "grad_norm": 22.69003677368164, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8365890383720398, + "num_tokens": 133274987.0, + "step": 3491 + }, + { + "epoch": 0.4442182928380613, + "ewc_loss": 0.02743622660636902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3718113223148976e-05, + "grad_norm": 22.626510620117188, + "learning_rate": 1e-06, + "loss": 0.5518, + "mean_token_accuracy": 0.8251607418060303, + "num_tokens": 133319287.0, + "step": 3492 + }, + { + "epoch": 0.44434550311665183, + "ewc_loss": 0.027407843619585037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.370392146782251e-05, + "grad_norm": 22.703344345092773, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.857931911945343, + "num_tokens": 133353670.0, + "step": 3493 + }, + { + "epoch": 0.44447271339524236, + "ewc_loss": 0.027470922097563744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3735460925090592e-05, + "grad_norm": 22.586807250976562, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8542605638504028, + "num_tokens": 133391950.0, + "step": 3494 + }, + { + "epoch": 0.44459992367383283, + "ewc_loss": 0.027441933751106262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3720967217523139e-05, + "grad_norm": 22.711824417114258, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8572642803192139, + "num_tokens": 133424758.0, + "step": 3495 + }, + { + "epoch": 0.44472713395242336, + "ewc_loss": 0.02745303139090538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3726516044698656e-05, + "grad_norm": 22.596891403198242, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8811401128768921, + "num_tokens": 133459869.0, + "step": 3496 + }, + { + "epoch": 0.4448543442310139, + "ewc_loss": 0.02743314392864704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.371657162962947e-05, + "grad_norm": 22.688926696777344, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8510133028030396, + "num_tokens": 133494598.0, + "step": 3497 + }, + { + "epoch": 0.44498155450960436, + "ewc_loss": 0.02745669148862362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3728345948038623e-05, + "grad_norm": 22.536376953125, + "learning_rate": 1e-06, + "loss": 0.4411, + 
"mean_token_accuracy": 0.862045407295227, + "num_tokens": 133536403.0, + "step": 3498 + }, + { + "epoch": 0.4451087647881949, + "ewc_loss": 0.02748309262096882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3741546354140155e-05, + "grad_norm": 22.721664428710938, + "learning_rate": 1e-06, + "loss": 0.538, + "mean_token_accuracy": 0.8312819004058838, + "num_tokens": 133567319.0, + "step": 3499 + }, + { + "epoch": 0.4452359750667854, + "ewc_loss": 0.02752622961997986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.37631150209927e-05, + "grad_norm": 22.668729782104492, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8451657891273499, + "num_tokens": 133604829.0, + "step": 3500 + }, + { + "epoch": 0.4453631853453759, + "ewc_loss": 0.02746753580868244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3733768355450593e-05, + "grad_norm": 22.649930953979492, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8521904945373535, + "num_tokens": 133638373.0, + "step": 3501 + }, + { + "epoch": 0.4454903956239664, + "ewc_loss": 0.02752908505499363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3764542927674484e-05, + "grad_norm": 22.66168212890625, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8635550141334534, + "num_tokens": 133674110.0, + "step": 3502 + }, + { + "epoch": 0.44561760590255695, + "ewc_loss": 0.027524277567863464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3762139133177698e-05, + "grad_norm": 22.657499313354492, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8541772365570068, + "num_tokens": 133711005.0, + "step": 3503 + }, + { + "epoch": 0.4457448161811474, + "ewc_loss": 0.027538064867258072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3769032193522435e-05, + "grad_norm": 22.650114059448242, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8640580177307129, + "num_tokens": 133746258.0, + "step": 3504 + }, + { + "epoch": 
0.44587202645973795, + "ewc_loss": 0.02746947668492794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3734738786297385e-05, + "grad_norm": 22.57150650024414, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8610851764678955, + "num_tokens": 133790695.0, + "step": 3505 + }, + { + "epoch": 0.4459992367383285, + "ewc_loss": 0.027542652562260628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3771325939160306e-05, + "grad_norm": 22.56316375732422, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8491247296333313, + "num_tokens": 133830181.0, + "step": 3506 + }, + { + "epoch": 0.44612644701691895, + "ewc_loss": 0.027577269822359085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3788634532829747e-05, + "grad_norm": 22.666749954223633, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8562681078910828, + "num_tokens": 133866849.0, + "step": 3507 + }, + { + "epoch": 0.4462536572955095, + "ewc_loss": 0.027637049555778503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3818525076203514e-05, + "grad_norm": 22.81833839416504, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8519251346588135, + "num_tokens": 133906812.0, + "step": 3508 + }, + { + "epoch": 0.4463808675741, + "ewc_loss": 0.027512257918715477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3756129192188382e-05, + "grad_norm": 22.461181640625, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8464322090148926, + "num_tokens": 133948956.0, + "step": 3509 + }, + { + "epoch": 0.4465080778526905, + "ewc_loss": 0.027506403625011444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3753201528743375e-05, + "grad_norm": 22.73627281188965, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8352425694465637, + "num_tokens": 133988764.0, + "step": 3510 + }, + { + "epoch": 0.446635288131281, + "ewc_loss": 0.027597971260547638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3798985492030624e-05, + "grad_norm": 22.65870475769043, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8581529259681702, + "num_tokens": 134028148.0, + "step": 3511 + }, + { + "epoch": 0.44676249840987153, + "ewc_loss": 0.027478789910674095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3739395399170462e-05, + "grad_norm": 22.72880744934082, + "learning_rate": 1e-06, + "loss": 0.5484, + "mean_token_accuracy": 0.8287286758422852, + "num_tokens": 134065909.0, + "step": 3512 + }, + { + "epoch": 0.446889708688462, + "ewc_loss": 0.027556855231523514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3778427273791749e-05, + "grad_norm": 22.71779441833496, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8598710298538208, + "num_tokens": 134108342.0, + "step": 3513 + }, + { + "epoch": 0.44701691896705253, + "ewc_loss": 0.027463681995868683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3731841136177536e-05, + "grad_norm": 22.586063385009766, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8515619039535522, + "num_tokens": 134145928.0, + "step": 3514 + }, + { + "epoch": 0.44714412924564306, + "ewc_loss": 0.027547679841518402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3773839782516006e-05, + "grad_norm": 22.75090980529785, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8483555912971497, + "num_tokens": 134185955.0, + "step": 3515 + }, + { + "epoch": 0.44727133952423354, + "ewc_loss": 0.027521450072526932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.376072486891644e-05, + "grad_norm": 22.599794387817383, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8595483899116516, + "num_tokens": 134225829.0, + "step": 3516 + }, + { + "epoch": 0.44739854980282406, + "ewc_loss": 0.027496935799717903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3748467608820647e-05, + "grad_norm": 22.689912796020508, + "learning_rate": 1e-06, + "loss": 0.4837, + 
"mean_token_accuracy": 0.842082142829895, + "num_tokens": 134261381.0, + "step": 3517 + }, + { + "epoch": 0.4475257600814146, + "ewc_loss": 0.027541980147361755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3770990335615352e-05, + "grad_norm": 22.618257522583008, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8597980737686157, + "num_tokens": 134294697.0, + "step": 3518 + }, + { + "epoch": 0.44765297036000506, + "ewc_loss": 0.02752993069589138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3764964933216106e-05, + "grad_norm": 22.674238204956055, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8727241158485413, + "num_tokens": 134334349.0, + "step": 3519 + }, + { + "epoch": 0.4477801806385956, + "ewc_loss": 0.027491986751556396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3745993783231825e-05, + "grad_norm": 22.651409149169922, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8413921594619751, + "num_tokens": 134373381.0, + "step": 3520 + }, + { + "epoch": 0.4479073909171861, + "ewc_loss": 0.027505816891789436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3752908671449404e-05, + "grad_norm": 22.722244262695312, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8581002354621887, + "num_tokens": 134407952.0, + "step": 3521 + }, + { + "epoch": 0.4480346011957766, + "ewc_loss": 0.027536582201719284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.376829095534049e-05, + "grad_norm": 22.631256103515625, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.842767596244812, + "num_tokens": 134458862.0, + "step": 3522 + }, + { + "epoch": 0.4481618114743671, + "ewc_loss": 0.02749205380678177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3746026525041088e-05, + "grad_norm": 22.705514907836914, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8503082990646362, + "num_tokens": 134494007.0, + "step": 3523 + }, + { + "epoch": 
0.44828902175295765, + "ewc_loss": 0.027519790455698967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3759895409748424e-05, + "grad_norm": 22.684123992919922, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8383603096008301, + "num_tokens": 134534195.0, + "step": 3524 + }, + { + "epoch": 0.4484162320315481, + "ewc_loss": 0.02750897966325283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3754489373241086e-05, + "grad_norm": 22.662574768066406, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8605190515518188, + "num_tokens": 134570241.0, + "step": 3525 + }, + { + "epoch": 0.44854344231013865, + "ewc_loss": 0.027498668059706688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3749334357271437e-05, + "grad_norm": 22.71085548400879, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8514148592948914, + "num_tokens": 134611904.0, + "step": 3526 + }, + { + "epoch": 0.4486706525887292, + "ewc_loss": 0.027549732476472855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3774866602034308e-05, + "grad_norm": 22.669456481933594, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8437185287475586, + "num_tokens": 134644426.0, + "step": 3527 + }, + { + "epoch": 0.44879786286731965, + "ewc_loss": 0.027480727061629295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3740363101533148e-05, + "grad_norm": 22.724010467529297, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8612402677536011, + "num_tokens": 134678979.0, + "step": 3528 + }, + { + "epoch": 0.4489250731459102, + "ewc_loss": 0.027577906847000122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3788953765470069e-05, + "grad_norm": 22.691509246826172, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8553133010864258, + "num_tokens": 134717270.0, + "step": 3529 + }, + { + "epoch": 0.4490522834245007, + "ewc_loss": 0.02755402773618698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3777013919025194e-05, + "grad_norm": 22.721134185791016, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8535330295562744, + "num_tokens": 134755464.0, + "step": 3530 + }, + { + "epoch": 0.44917949370309124, + "ewc_loss": 0.027511490508913994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3755745385424234e-05, + "grad_norm": 22.659896850585938, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8453234434127808, + "num_tokens": 134793737.0, + "step": 3531 + }, + { + "epoch": 0.4493067039816817, + "ewc_loss": 0.027542632073163986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3771315934718587e-05, + "grad_norm": 22.67648696899414, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8475639820098877, + "num_tokens": 134834332.0, + "step": 3532 + }, + { + "epoch": 0.44943391426027224, + "ewc_loss": 0.027503835037350655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3751917322224472e-05, + "grad_norm": 22.65376091003418, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8624242544174194, + "num_tokens": 134871027.0, + "step": 3533 + }, + { + "epoch": 0.44956112453886277, + "ewc_loss": 0.027594897896051407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.379744935547933e-05, + "grad_norm": 22.719266891479492, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8521906137466431, + "num_tokens": 134908529.0, + "step": 3534 + }, + { + "epoch": 0.44968833481745324, + "ewc_loss": 0.027485733851790428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.374286694044713e-05, + "grad_norm": 22.587854385375977, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.861884355545044, + "num_tokens": 134944177.0, + "step": 3535 + }, + { + "epoch": 0.44981554509604377, + "ewc_loss": 0.0275628212839365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3781410416413564e-05, + "grad_norm": 22.731342315673828, + "learning_rate": 1e-06, + "loss": 0.4872, + 
"mean_token_accuracy": 0.8521585464477539, + "num_tokens": 134977928.0, + "step": 3536 + }, + { + "epoch": 0.4499427553746343, + "ewc_loss": 0.02755201794207096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3776008927379735e-05, + "grad_norm": 22.64674949645996, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8524237275123596, + "num_tokens": 135017554.0, + "step": 3537 + }, + { + "epoch": 0.45006996565322477, + "ewc_loss": 0.027555329725146294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3777665117231663e-05, + "grad_norm": 22.63780975341797, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8589496612548828, + "num_tokens": 135058694.0, + "step": 3538 + }, + { + "epoch": 0.4501971759318153, + "ewc_loss": 0.027592748403549194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3796374332741834e-05, + "grad_norm": 22.73380470275879, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8544391989707947, + "num_tokens": 135094037.0, + "step": 3539 + }, + { + "epoch": 0.4503243862104058, + "ewc_loss": 0.02758187986910343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3790940101898741e-05, + "grad_norm": 22.689851760864258, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.847259521484375, + "num_tokens": 135132758.0, + "step": 3540 + }, + { + "epoch": 0.4504515964889963, + "ewc_loss": 0.027526749297976494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3763374226982705e-05, + "grad_norm": 22.65456771850586, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8660773038864136, + "num_tokens": 135169398.0, + "step": 3541 + }, + { + "epoch": 0.4505788067675868, + "ewc_loss": 0.027564356103539467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.378217802994186e-05, + "grad_norm": 22.685993194580078, + "learning_rate": 1e-06, + "loss": 0.5409, + "mean_token_accuracy": 0.8389940857887268, + "num_tokens": 135204312.0, + "step": 3542 + }, + { + "epoch": 
0.45070601704617735, + "ewc_loss": 0.027621304616332054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3810652490064967e-05, + "grad_norm": 22.68768882751465, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8503459692001343, + "num_tokens": 135246287.0, + "step": 3543 + }, + { + "epoch": 0.4508332273247678, + "ewc_loss": 0.02755197510123253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3775987099506892e-05, + "grad_norm": 22.665422439575195, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8566317558288574, + "num_tokens": 135280970.0, + "step": 3544 + }, + { + "epoch": 0.45096043760335836, + "ewc_loss": 0.02762739546597004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3813697478326503e-05, + "grad_norm": 22.723302841186523, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8745602965354919, + "num_tokens": 135317873.0, + "step": 3545 + }, + { + "epoch": 0.4510876478819489, + "ewc_loss": 0.027643870562314987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3821935681335162e-05, + "grad_norm": 22.695507049560547, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8607078194618225, + "num_tokens": 135354994.0, + "step": 3546 + }, + { + "epoch": 0.45121485816053936, + "ewc_loss": 0.027600962668657303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.380048161081504e-05, + "grad_norm": 22.665740966796875, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8474504351615906, + "num_tokens": 135393050.0, + "step": 3547 + }, + { + "epoch": 0.4513420684391299, + "ewc_loss": 0.027684669941663742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3842334738001227e-05, + "grad_norm": 22.727140426635742, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8578636646270752, + "num_tokens": 135431704.0, + "step": 3548 + }, + { + "epoch": 0.4514692787177204, + "ewc_loss": 0.027639193460345268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3819596460962202e-05, + "grad_norm": 22.69579315185547, + "learning_rate": 1e-06, + "loss": 0.5349, + "mean_token_accuracy": 0.8307340741157532, + "num_tokens": 135471306.0, + "step": 3549 + }, + { + "epoch": 0.4515964889963109, + "ewc_loss": 0.02767265774309635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3836328434990719e-05, + "grad_norm": 22.824125289916992, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8403240442276001, + "num_tokens": 135509647.0, + "step": 3550 + }, + { + "epoch": 0.4517236992749014, + "ewc_loss": 0.027682170271873474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3841085092280991e-05, + "grad_norm": 22.67803192138672, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8604075908660889, + "num_tokens": 135550469.0, + "step": 3551 + }, + { + "epoch": 0.45185090955349194, + "ewc_loss": 0.027643557637929916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3821779248246457e-05, + "grad_norm": 22.760488510131836, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8527098894119263, + "num_tokens": 135589290.0, + "step": 3552 + }, + { + "epoch": 0.4519781198320824, + "ewc_loss": 0.02761765941977501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3808829862682614e-05, + "grad_norm": 22.608999252319336, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8471167087554932, + "num_tokens": 135624416.0, + "step": 3553 + }, + { + "epoch": 0.45210533011067294, + "ewc_loss": 0.027626903727650642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3813451914757024e-05, + "grad_norm": 22.71125602722168, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8572304248809814, + "num_tokens": 135662316.0, + "step": 3554 + }, + { + "epoch": 0.45223254038926347, + "ewc_loss": 0.027660010382533073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3830005627823994e-05, + "grad_norm": 22.717336654663086, + "learning_rate": 1e-06, + "loss": 0.4604, + 
"mean_token_accuracy": 0.8554653525352478, + "num_tokens": 135701739.0, + "step": 3555 + }, + { + "epoch": 0.45235975066785394, + "ewc_loss": 0.02765568345785141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3827841939928476e-05, + "grad_norm": 22.636451721191406, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.850056529045105, + "num_tokens": 135741372.0, + "step": 3556 + }, + { + "epoch": 0.45248696094644447, + "ewc_loss": 0.027651960030198097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3825980204273947e-05, + "grad_norm": 22.840927124023438, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8762049674987793, + "num_tokens": 135779945.0, + "step": 3557 + }, + { + "epoch": 0.452614171225035, + "ewc_loss": 0.027679964900016785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3839982784702443e-05, + "grad_norm": 22.79350471496582, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8479355573654175, + "num_tokens": 135813179.0, + "step": 3558 + }, + { + "epoch": 0.4527413815036255, + "ewc_loss": 0.027615122497081757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3807561117573641e-05, + "grad_norm": 22.789291381835938, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8549755811691284, + "num_tokens": 135855147.0, + "step": 3559 + }, + { + "epoch": 0.452868591782216, + "ewc_loss": 0.02764185518026352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3820927961205598e-05, + "grad_norm": 22.837894439697266, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8613203167915344, + "num_tokens": 135892990.0, + "step": 3560 + }, + { + "epoch": 0.45299580206080653, + "ewc_loss": 0.027526594698429108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3763296919933055e-05, + "grad_norm": 22.777587890625, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8723843097686768, + "num_tokens": 135926911.0, + "step": 3561 + }, + { + "epoch": 
0.453123012339397, + "ewc_loss": 0.027598440647125244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3799220141663682e-05, + "grad_norm": 22.859420776367188, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8613202571868896, + "num_tokens": 135963530.0, + "step": 3562 + }, + { + "epoch": 0.45325022261798753, + "ewc_loss": 0.0275703314691782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3785165720037185e-05, + "grad_norm": 22.77623748779297, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8428616523742676, + "num_tokens": 136000677.0, + "step": 3563 + }, + { + "epoch": 0.45337743289657806, + "ewc_loss": 0.027576366439461708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3788183423457667e-05, + "grad_norm": 22.80229377746582, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8432563543319702, + "num_tokens": 136037283.0, + "step": 3564 + }, + { + "epoch": 0.45350464317516853, + "ewc_loss": 0.02757720835506916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3788604519504588e-05, + "grad_norm": 22.805461883544922, + "learning_rate": 1e-06, + "loss": 0.5276, + "mean_token_accuracy": 0.8349307775497437, + "num_tokens": 136071454.0, + "step": 3565 + }, + { + "epoch": 0.45363185345375906, + "ewc_loss": 0.027557728812098503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3778864740743302e-05, + "grad_norm": 22.775238037109375, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8425521850585938, + "num_tokens": 136105638.0, + "step": 3566 + }, + { + "epoch": 0.4537590637323496, + "ewc_loss": 0.02760162018239498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.380080993840238e-05, + "grad_norm": 22.86014747619629, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8563665747642517, + "num_tokens": 136147968.0, + "step": 3567 + }, + { + "epoch": 0.45388627401094006, + "ewc_loss": 0.027612797915935516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3806398783344775e-05, + "grad_norm": 22.850547790527344, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.853381335735321, + "num_tokens": 136190468.0, + "step": 3568 + }, + { + "epoch": 0.4540134842895306, + "ewc_loss": 0.02758372388780117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3791862329526339e-05, + "grad_norm": 22.681663513183594, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8574472665786743, + "num_tokens": 136229363.0, + "step": 3569 + }, + { + "epoch": 0.4541406945681211, + "ewc_loss": 0.027590563520789146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3795282029605005e-05, + "grad_norm": 22.819459915161133, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8468223214149475, + "num_tokens": 136268834.0, + "step": 3570 + }, + { + "epoch": 0.4542679048467116, + "ewc_loss": 0.027611957862973213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3805978596792556e-05, + "grad_norm": 22.795631408691406, + "learning_rate": 1e-06, + "loss": 0.5528, + "mean_token_accuracy": 0.8304978609085083, + "num_tokens": 136306761.0, + "step": 3571 + }, + { + "epoch": 0.4543951151253021, + "ewc_loss": 0.02758241817355156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3791209312330466e-05, + "grad_norm": 22.726991653442383, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8531115651130676, + "num_tokens": 136347736.0, + "step": 3572 + }, + { + "epoch": 0.45452232540389265, + "ewc_loss": 0.027615554630756378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3807777577312663e-05, + "grad_norm": 22.79697608947754, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8554184436798096, + "num_tokens": 136381009.0, + "step": 3573 + }, + { + "epoch": 0.4546495356824831, + "ewc_loss": 0.02763054147362709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.381527090416057e-05, + "grad_norm": 22.741886138916016, + "learning_rate": 1e-06, + "loss": 0.4219, + 
"mean_token_accuracy": 0.8636071681976318, + "num_tokens": 136418050.0, + "step": 3574 + }, + { + "epoch": 0.45477674596107365, + "ewc_loss": 0.027633192017674446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3816596037941054e-05, + "grad_norm": 22.906219482421875, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.835990309715271, + "num_tokens": 136459109.0, + "step": 3575 + }, + { + "epoch": 0.4549039562396642, + "ewc_loss": 0.027671242132782936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.383562084811274e-05, + "grad_norm": 22.802186965942383, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8503771424293518, + "num_tokens": 136497993.0, + "step": 3576 + }, + { + "epoch": 0.45503116651825465, + "ewc_loss": 0.02757987752556801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3789938748232089e-05, + "grad_norm": 22.81908416748047, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8554161787033081, + "num_tokens": 136534446.0, + "step": 3577 + }, + { + "epoch": 0.4551583767968452, + "ewc_loss": 0.027660410851240158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3830205716658384e-05, + "grad_norm": 22.810617446899414, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8374521732330322, + "num_tokens": 136571117.0, + "step": 3578 + }, + { + "epoch": 0.4552855870754357, + "ewc_loss": 0.027614399790763855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3807200048177037e-05, + "grad_norm": 22.73199462890625, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8604927062988281, + "num_tokens": 136601706.0, + "step": 3579 + }, + { + "epoch": 0.4554127973540262, + "ewc_loss": 0.027668388560414314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3834193850925658e-05, + "grad_norm": 22.779443740844727, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8541070818901062, + "num_tokens": 136639735.0, + "step": 3580 + }, + { + "epoch": 
0.4555400076326167, + "ewc_loss": 0.027678167447447777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.383908329444239e-05, + "grad_norm": 22.80815887451172, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8464936017990112, + "num_tokens": 136675647.0, + "step": 3581 + }, + { + "epoch": 0.45566721791120723, + "ewc_loss": 0.02767934277653694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3839671737514436e-05, + "grad_norm": 22.74336051940918, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8602488040924072, + "num_tokens": 136713361.0, + "step": 3582 + }, + { + "epoch": 0.45579442818979776, + "ewc_loss": 0.027687054127454758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3843527085555252e-05, + "grad_norm": 22.782737731933594, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8521944880485535, + "num_tokens": 136751440.0, + "step": 3583 + }, + { + "epoch": 0.45592163846838824, + "ewc_loss": 0.02768600732088089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3843004126101732e-05, + "grad_norm": 22.700180053710938, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8490003347396851, + "num_tokens": 136786078.0, + "step": 3584 + }, + { + "epoch": 0.45604884874697876, + "ewc_loss": 0.027711467817425728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3855734323442448e-05, + "grad_norm": 22.803640365600586, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8611207604408264, + "num_tokens": 136828313.0, + "step": 3585 + }, + { + "epoch": 0.4561760590255693, + "ewc_loss": 0.027718594297766685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3859296814189292e-05, + "grad_norm": 22.65624237060547, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8519482612609863, + "num_tokens": 136862549.0, + "step": 3586 + }, + { + "epoch": 0.45630326930415976, + "ewc_loss": 0.027678247541189194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3839124221703969e-05, + "grad_norm": 22.74748420715332, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.856961727142334, + "num_tokens": 136906423.0, + "step": 3587 + }, + { + "epoch": 0.4564304795827503, + "ewc_loss": 0.027747947722673416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3873974239686504e-05, + "grad_norm": 22.758480072021484, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8414365649223328, + "num_tokens": 136941550.0, + "step": 3588 + }, + { + "epoch": 0.4565576898613408, + "ewc_loss": 0.027719825506210327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3859912542102393e-05, + "grad_norm": 22.758209228515625, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.856408417224884, + "num_tokens": 136980234.0, + "step": 3589 + }, + { + "epoch": 0.4566849001399313, + "ewc_loss": 0.027758628129959106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3879313883080613e-05, + "grad_norm": 22.734758377075195, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8567928671836853, + "num_tokens": 137023991.0, + "step": 3590 + }, + { + "epoch": 0.4568121104185218, + "ewc_loss": 0.02771293930709362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3856470104656182e-05, + "grad_norm": 22.742828369140625, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8439493179321289, + "num_tokens": 137063261.0, + "step": 3591 + }, + { + "epoch": 0.45693932069711235, + "ewc_loss": 0.027755962684750557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3877981473342516e-05, + "grad_norm": 22.783578872680664, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8623322248458862, + "num_tokens": 137103935.0, + "step": 3592 + }, + { + "epoch": 0.4570665309757028, + "ewc_loss": 0.02772073820233345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3860369108442683e-05, + "grad_norm": 22.694602966308594, + "learning_rate": 1e-06, + "loss": 0.4664, + 
"mean_token_accuracy": 0.8541532754898071, + "num_tokens": 137140194.0, + "step": 3593 + }, + { + "epoch": 0.45719374125429335, + "ewc_loss": 0.02779117412865162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3895587471779436e-05, + "grad_norm": 22.84282875061035, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8436416387557983, + "num_tokens": 137179086.0, + "step": 3594 + }, + { + "epoch": 0.4573209515328839, + "ewc_loss": 0.02784220688045025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3921103345637675e-05, + "grad_norm": 22.899810791015625, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8588842749595642, + "num_tokens": 137216887.0, + "step": 3595 + }, + { + "epoch": 0.45744816181147435, + "ewc_loss": 0.027698403224349022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3849201422999613e-05, + "grad_norm": 22.763277053833008, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8689017295837402, + "num_tokens": 137255029.0, + "step": 3596 + }, + { + "epoch": 0.4575753720900649, + "ewc_loss": 0.027724530547857285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.386226540489588e-05, + "grad_norm": 22.799283981323242, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8566360473632812, + "num_tokens": 137291399.0, + "step": 3597 + }, + { + "epoch": 0.4577025823686554, + "ewc_loss": 0.027722079306840897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.386103940603789e-05, + "grad_norm": 22.781579971313477, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8570432662963867, + "num_tokens": 137326569.0, + "step": 3598 + }, + { + "epoch": 0.4578297926472459, + "ewc_loss": 0.027739010751247406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3869504982721992e-05, + "grad_norm": 22.779722213745117, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8499991297721863, + "num_tokens": 137370035.0, + "step": 3599 + }, + { + "epoch": 
0.4579570029258364, + "ewc_loss": 0.02772262878715992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3861314073437825e-05, + "grad_norm": 22.85689926147461, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8494724035263062, + "num_tokens": 137407969.0, + "step": 3600 + }, + { + "epoch": 0.45808421320442694, + "ewc_loss": 0.02774849906563759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3874249816581141e-05, + "grad_norm": 22.98731231689453, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8510143756866455, + "num_tokens": 137447256.0, + "step": 3601 + }, + { + "epoch": 0.4582114234830174, + "ewc_loss": 0.027689088135957718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3844543900631834e-05, + "grad_norm": 22.713523864746094, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.83750319480896, + "num_tokens": 137479762.0, + "step": 3602 + }, + { + "epoch": 0.45833863376160794, + "ewc_loss": 0.02770930714905262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3854653843736742e-05, + "grad_norm": 22.947935104370117, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8459131717681885, + "num_tokens": 137521823.0, + "step": 3603 + }, + { + "epoch": 0.45846584404019847, + "ewc_loss": 0.02772512473165989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3862562809663359e-05, + "grad_norm": 22.67833709716797, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8509380221366882, + "num_tokens": 137558174.0, + "step": 3604 + }, + { + "epoch": 0.45859305431878894, + "ewc_loss": 0.027693506330251694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.384675306326244e-05, + "grad_norm": 22.954082489013672, + "learning_rate": 1e-06, + "loss": 0.5529, + "mean_token_accuracy": 0.828101396560669, + "num_tokens": 137597265.0, + "step": 3605 + }, + { + "epoch": 0.45872026459737947, + "ewc_loss": 0.027725357562303543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3862678315490484e-05, + "grad_norm": 22.71257209777832, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8605470657348633, + "num_tokens": 137635954.0, + "step": 3606 + }, + { + "epoch": 0.45884747487597, + "ewc_loss": 0.027685705572366714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3842853149981238e-05, + "grad_norm": 22.84055519104004, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8563703298568726, + "num_tokens": 137676137.0, + "step": 3607 + }, + { + "epoch": 0.45897468515456047, + "ewc_loss": 0.02773231454193592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3866157132724766e-05, + "grad_norm": 22.75458335876465, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8562524318695068, + "num_tokens": 137716723.0, + "step": 3608 + }, + { + "epoch": 0.459101895433151, + "ewc_loss": 0.027729038149118423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3864519132766873e-05, + "grad_norm": 22.799772262573242, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8351048231124878, + "num_tokens": 137752435.0, + "step": 3609 + }, + { + "epoch": 0.4592291057117415, + "ewc_loss": 0.027752522379159927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3876261618861463e-05, + "grad_norm": 22.717342376708984, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.844443678855896, + "num_tokens": 137786684.0, + "step": 3610 + }, + { + "epoch": 0.459356315990332, + "ewc_loss": 0.027762163430452347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.388108194078086e-05, + "grad_norm": 22.839122772216797, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8449399471282959, + "num_tokens": 137830668.0, + "step": 3611 + }, + { + "epoch": 0.4594835262689225, + "ewc_loss": 0.027726437896490097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3863218555343337e-05, + "grad_norm": 22.688968658447266, + "learning_rate": 1e-06, + "loss": 0.4395, + 
"mean_token_accuracy": 0.8585596084594727, + "num_tokens": 137865262.0, + "step": 3612 + }, + { + "epoch": 0.45961073654751305, + "ewc_loss": 0.027735969051718712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3867984307580628e-05, + "grad_norm": 22.798358917236328, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8592520952224731, + "num_tokens": 137898867.0, + "step": 3613 + }, + { + "epoch": 0.4597379468261035, + "ewc_loss": 0.027781857177615166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3890928130422253e-05, + "grad_norm": 22.789432525634766, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8541443943977356, + "num_tokens": 137935108.0, + "step": 3614 + }, + { + "epoch": 0.45986515710469406, + "ewc_loss": 0.027811944484710693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.390597208228428e-05, + "grad_norm": 22.930492401123047, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8650151491165161, + "num_tokens": 137977714.0, + "step": 3615 + }, + { + "epoch": 0.4599923673832846, + "ewc_loss": 0.027804885059595108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3902442333346698e-05, + "grad_norm": 22.795120239257812, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8657428026199341, + "num_tokens": 138006345.0, + "step": 3616 + }, + { + "epoch": 0.46011957766187506, + "ewc_loss": 0.02773359790444374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3866799235984217e-05, + "grad_norm": 22.796045303344727, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8535959124565125, + "num_tokens": 138045106.0, + "step": 3617 + }, + { + "epoch": 0.4602467879404656, + "ewc_loss": 0.027787121012806892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3893560208089184e-05, + "grad_norm": 22.868122100830078, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8468323349952698, + "num_tokens": 138079993.0, + "step": 3618 + }, + { + "epoch": 
0.4603739982190561, + "ewc_loss": 0.027770787477493286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3885393855161965e-05, + "grad_norm": 22.781476974487305, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8559931516647339, + "num_tokens": 138118555.0, + "step": 3619 + }, + { + "epoch": 0.4605012084976466, + "ewc_loss": 0.027818778529763222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.390938905387884e-05, + "grad_norm": 22.828153610229492, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8617357015609741, + "num_tokens": 138155096.0, + "step": 3620 + }, + { + "epoch": 0.4606284187762371, + "ewc_loss": 0.027809837833046913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3904918887419626e-05, + "grad_norm": 22.779193878173828, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8550311923027039, + "num_tokens": 138194529.0, + "step": 3621 + }, + { + "epoch": 0.46075562905482764, + "ewc_loss": 0.02782198041677475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3910990674048662e-05, + "grad_norm": 22.866397857666016, + "learning_rate": 1e-06, + "loss": 0.5422, + "mean_token_accuracy": 0.8282867670059204, + "num_tokens": 138235520.0, + "step": 3622 + }, + { + "epoch": 0.4608828393334181, + "ewc_loss": 0.027889035642147064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3944517377240118e-05, + "grad_norm": 23.14927101135254, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8628348708152771, + "num_tokens": 138271391.0, + "step": 3623 + }, + { + "epoch": 0.46101004961200864, + "ewc_loss": 0.02777450904250145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3887254681321792e-05, + "grad_norm": 22.7356014251709, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8483791351318359, + "num_tokens": 138313923.0, + "step": 3624 + }, + { + "epoch": 0.46113725989059917, + "ewc_loss": 0.027748113498091698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3874057003704365e-05, + "grad_norm": 22.852418899536133, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8457499742507935, + "num_tokens": 138355992.0, + "step": 3625 + }, + { + "epoch": 0.46126447016918964, + "ewc_loss": 0.027798691764473915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3899345503887162e-05, + "grad_norm": 22.74583625793457, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8730184435844421, + "num_tokens": 138393074.0, + "step": 3626 + }, + { + "epoch": 0.4613916804477802, + "ewc_loss": 0.02779691107571125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3898455108574126e-05, + "grad_norm": 22.815587997436523, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8606439828872681, + "num_tokens": 138427854.0, + "step": 3627 + }, + { + "epoch": 0.4615188907263707, + "ewc_loss": 0.027797194197773933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3898596989747602e-05, + "grad_norm": 22.693756103515625, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8630906343460083, + "num_tokens": 138467359.0, + "step": 3628 + }, + { + "epoch": 0.4616461010049612, + "ewc_loss": 0.02784135192632675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3920675883127842e-05, + "grad_norm": 22.835622787475586, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8414978981018066, + "num_tokens": 138505355.0, + "step": 3629 + }, + { + "epoch": 0.4617733112835517, + "ewc_loss": 0.0278761126101017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3938056326878723e-05, + "grad_norm": 22.849966049194336, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.857784628868103, + "num_tokens": 138543703.0, + "step": 3630 + }, + { + "epoch": 0.46190052156214223, + "ewc_loss": 0.027833791449666023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3916896023147274e-05, + "grad_norm": 22.687767028808594, + "learning_rate": 1e-06, + "loss": 0.4784, + 
"mean_token_accuracy": 0.8460503816604614, + "num_tokens": 138579860.0, + "step": 3631 + }, + { + "epoch": 0.46202773184073276, + "ewc_loss": 0.027858318760991096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.392915964970598e-05, + "grad_norm": 22.78327178955078, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8624452352523804, + "num_tokens": 138615755.0, + "step": 3632 + }, + { + "epoch": 0.46215494211932323, + "ewc_loss": 0.027912935242056847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3956467228126712e-05, + "grad_norm": 22.757909774780273, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8483845591545105, + "num_tokens": 138647607.0, + "step": 3633 + }, + { + "epoch": 0.46228215239791376, + "ewc_loss": 0.027883799746632576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3941899851488415e-05, + "grad_norm": 22.899028778076172, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.859332799911499, + "num_tokens": 138679284.0, + "step": 3634 + }, + { + "epoch": 0.4624093626765043, + "ewc_loss": 0.02793201617896557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.396600782754831e-05, + "grad_norm": 22.74884796142578, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8629063963890076, + "num_tokens": 138719690.0, + "step": 3635 + }, + { + "epoch": 0.46253657295509476, + "ewc_loss": 0.02786262333393097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3931311514170375e-05, + "grad_norm": 22.84868812561035, + "learning_rate": 1e-06, + "loss": 0.5076, + "mean_token_accuracy": 0.8412137031555176, + "num_tokens": 138758031.0, + "step": 3636 + }, + { + "epoch": 0.4626637832336853, + "ewc_loss": 0.027920790016651154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.396039533574367e-05, + "grad_norm": 22.84649085998535, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8532922863960266, + "num_tokens": 138797283.0, + "step": 3637 + }, + { + "epoch": 
0.4627909935122758, + "ewc_loss": 0.027893448248505592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3946723811386619e-05, + "grad_norm": 22.814638137817383, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8494569659233093, + "num_tokens": 138834672.0, + "step": 3638 + }, + { + "epoch": 0.4629182037908663, + "ewc_loss": 0.02785894274711609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3929471606388688e-05, + "grad_norm": 22.847566604614258, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8642655611038208, + "num_tokens": 138871105.0, + "step": 3639 + }, + { + "epoch": 0.4630454140694568, + "ewc_loss": 0.027894631028175354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3947315892437473e-05, + "grad_norm": 22.789987564086914, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8544602990150452, + "num_tokens": 138903516.0, + "step": 3640 + }, + { + "epoch": 0.46317262434804735, + "ewc_loss": 0.027893172577023506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3946586477686651e-05, + "grad_norm": 22.85865592956543, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8524558544158936, + "num_tokens": 138942021.0, + "step": 3641 + }, + { + "epoch": 0.4632998346266378, + "ewc_loss": 0.027920864522457123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3960432625026442e-05, + "grad_norm": 22.75469970703125, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8623867034912109, + "num_tokens": 138978210.0, + "step": 3642 + }, + { + "epoch": 0.46342704490522835, + "ewc_loss": 0.02792299911379814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3961499462311622e-05, + "grad_norm": 22.75240135192871, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8563010692596436, + "num_tokens": 139019122.0, + "step": 3643 + }, + { + "epoch": 0.4635542551838189, + "ewc_loss": 0.027916623279452324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3958311683381908e-05, + "grad_norm": 22.93536376953125, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8644779920578003, + "num_tokens": 139062187.0, + "step": 3644 + }, + { + "epoch": 0.46368146546240935, + "ewc_loss": 0.027928398922085762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3964199752081186e-05, + "grad_norm": 22.730093002319336, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8724726438522339, + "num_tokens": 139102960.0, + "step": 3645 + }, + { + "epoch": 0.4638086757409999, + "ewc_loss": 0.027929911389946938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3964955542178359e-05, + "grad_norm": 22.91277503967285, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8545898795127869, + "num_tokens": 139137302.0, + "step": 3646 + }, + { + "epoch": 0.4639358860195904, + "ewc_loss": 0.027964742854237556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3982371456222609e-05, + "grad_norm": 22.75797462463379, + "learning_rate": 1e-06, + "loss": 0.5325, + "mean_token_accuracy": 0.8348017930984497, + "num_tokens": 139177195.0, + "step": 3647 + }, + { + "epoch": 0.4640630962981809, + "ewc_loss": 0.02788270264863968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3941351426183246e-05, + "grad_norm": 22.861637115478516, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8504695892333984, + "num_tokens": 139214722.0, + "step": 3648 + }, + { + "epoch": 0.4641903065767714, + "ewc_loss": 0.027960672974586487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3980336916574743e-05, + "grad_norm": 22.7795467376709, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8527957201004028, + "num_tokens": 139248569.0, + "step": 3649 + }, + { + "epoch": 0.46431751685536193, + "ewc_loss": 0.027926679700613022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.396334027958801e-05, + "grad_norm": 22.802406311035156, + "learning_rate": 1e-06, + "loss": 0.4436, + 
"mean_token_accuracy": 0.8601338267326355, + "num_tokens": 139289141.0, + "step": 3650 + }, + { + "epoch": 0.4644447271339524, + "ewc_loss": 0.02799394726753235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3996973393659573e-05, + "grad_norm": 22.81319808959961, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8704351782798767, + "num_tokens": 139330791.0, + "step": 3651 + }, + { + "epoch": 0.46457193741254293, + "ewc_loss": 0.02794337458908558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3971687621960882e-05, + "grad_norm": 22.873964309692383, + "learning_rate": 1e-06, + "loss": 0.5644, + "mean_token_accuracy": 0.8217210173606873, + "num_tokens": 139373009.0, + "step": 3652 + }, + { + "epoch": 0.46469914769113346, + "ewc_loss": 0.02793000638484955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3965002835902851e-05, + "grad_norm": 22.72427749633789, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8522831797599792, + "num_tokens": 139408894.0, + "step": 3653 + }, + { + "epoch": 0.46482635796972394, + "ewc_loss": 0.02792419120669365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3962095181341283e-05, + "grad_norm": 22.832544326782227, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8515169620513916, + "num_tokens": 139444040.0, + "step": 3654 + }, + { + "epoch": 0.46495356824831446, + "ewc_loss": 0.02797929383814335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.398964650434209e-05, + "grad_norm": 22.836828231811523, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8605002164840698, + "num_tokens": 139479675.0, + "step": 3655 + }, + { + "epoch": 0.465080778526905, + "ewc_loss": 0.02796284295618534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3981421034259256e-05, + "grad_norm": 22.849658966064453, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8613541722297668, + "num_tokens": 139521792.0, + "step": 3656 + }, + { + "epoch": 
0.46520798880549546, + "ewc_loss": 0.02792808786034584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3964044228487182e-05, + "grad_norm": 22.7060546875, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8612396121025085, + "num_tokens": 139563471.0, + "step": 3657 + }, + { + "epoch": 0.465335199084086, + "ewc_loss": 0.027975453063845634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3987726561026648e-05, + "grad_norm": 22.90203857421875, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8535144329071045, + "num_tokens": 139604702.0, + "step": 3658 + }, + { + "epoch": 0.4654624093626765, + "ewc_loss": 0.02790690027177334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3953450434200931e-05, + "grad_norm": 22.710575103759766, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8516756296157837, + "num_tokens": 139644573.0, + "step": 3659 + }, + { + "epoch": 0.465589619641267, + "ewc_loss": 0.027957234531641006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.397861706209369e-05, + "grad_norm": 22.88505744934082, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.839064359664917, + "num_tokens": 139680924.0, + "step": 3660 + }, + { + "epoch": 0.4657168299198575, + "ewc_loss": 0.028002146631479263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4001073395775165e-05, + "grad_norm": 22.838394165039062, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8460869789123535, + "num_tokens": 139724369.0, + "step": 3661 + }, + { + "epoch": 0.46584404019844805, + "ewc_loss": 0.02795802243053913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3979010873299558e-05, + "grad_norm": 22.935659408569336, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.854730486869812, + "num_tokens": 139769412.0, + "step": 3662 + }, + { + "epoch": 0.4659712504770385, + "ewc_loss": 0.02794855833053589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.3974278772366233e-05, + "grad_norm": 22.760753631591797, + "learning_rate": 1e-06, + "loss": 0.5384, + "mean_token_accuracy": 0.8294681310653687, + "num_tokens": 139808066.0, + "step": 3663 + }, + { + "epoch": 0.46609846075562905, + "ewc_loss": 0.027891647070646286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3945823411631864e-05, + "grad_norm": 22.799036026000977, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8597017526626587, + "num_tokens": 139851169.0, + "step": 3664 + }, + { + "epoch": 0.4662256710342196, + "ewc_loss": 0.02799541875720024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3997709174873307e-05, + "grad_norm": 22.85103416442871, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8632011413574219, + "num_tokens": 139890550.0, + "step": 3665 + }, + { + "epoch": 0.46635288131281005, + "ewc_loss": 0.027943236753344536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3971618500363547e-05, + "grad_norm": 22.825883865356445, + "learning_rate": 1e-06, + "loss": 0.5468, + "mean_token_accuracy": 0.8273721933364868, + "num_tokens": 139931496.0, + "step": 3666 + }, + { + "epoch": 0.4664800915914006, + "ewc_loss": 0.02795342355966568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3976711670693476e-05, + "grad_norm": 22.895559310913086, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8445269465446472, + "num_tokens": 139965872.0, + "step": 3667 + }, + { + "epoch": 0.4666073018699911, + "ewc_loss": 0.02796228602528572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3981142728880513e-05, + "grad_norm": 22.807769775390625, + "learning_rate": 1e-06, + "loss": 0.5377, + "mean_token_accuracy": 0.8382246494293213, + "num_tokens": 140000394.0, + "step": 3668 + }, + { + "epoch": 0.4667345121485816, + "ewc_loss": 0.027883430942893028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3941715224063955e-05, + "grad_norm": 22.838558197021484, + "learning_rate": 1e-06, + "loss": 0.5171, + 
"mean_token_accuracy": 0.8375044465065002, + "num_tokens": 140039967.0, + "step": 3669 + }, + { + "epoch": 0.4668617224271721, + "ewc_loss": 0.027998700737953186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3999349903315306e-05, + "grad_norm": 23.04736328125, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8544180989265442, + "num_tokens": 140075739.0, + "step": 3670 + }, + { + "epoch": 0.46698893270576264, + "ewc_loss": 0.027984747663140297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3992374078952707e-05, + "grad_norm": 22.865554809570312, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.858460545539856, + "num_tokens": 140110016.0, + "step": 3671 + }, + { + "epoch": 0.4671161429843531, + "ewc_loss": 0.027890587225556374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.394529317622073e-05, + "grad_norm": 22.868513107299805, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8435615301132202, + "num_tokens": 140147905.0, + "step": 3672 + }, + { + "epoch": 0.46724335326294364, + "ewc_loss": 0.027969487011432648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3984743418404832e-05, + "grad_norm": 22.86176109313965, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8472563624382019, + "num_tokens": 140182284.0, + "step": 3673 + }, + { + "epoch": 0.46737056354153417, + "ewc_loss": 0.027966631576418877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3983315511723049e-05, + "grad_norm": 22.959035873413086, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8458483815193176, + "num_tokens": 140220669.0, + "step": 3674 + }, + { + "epoch": 0.46749777382012464, + "ewc_loss": 0.028024820610880852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4012410247232765e-05, + "grad_norm": 22.79637908935547, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8398457169532776, + "num_tokens": 140258399.0, + "step": 3675 + }, + { + "epoch": 
0.46762498409871517, + "ewc_loss": 0.027985738590359688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3992868844070472e-05, + "grad_norm": 22.88481330871582, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8672360181808472, + "num_tokens": 140296015.0, + "step": 3676 + }, + { + "epoch": 0.4677521943773057, + "ewc_loss": 0.028013288974761963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4006644050823525e-05, + "grad_norm": 22.830976486206055, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.843352735042572, + "num_tokens": 140338618.0, + "step": 3677 + }, + { + "epoch": 0.46787940465589617, + "ewc_loss": 0.027988091111183167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3994045730214566e-05, + "grad_norm": 22.774702072143555, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.85783851146698, + "num_tokens": 140379151.0, + "step": 3678 + }, + { + "epoch": 0.4680066149344867, + "ewc_loss": 0.028031542897224426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4015771739650518e-05, + "grad_norm": 22.947187423706055, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8393571376800537, + "num_tokens": 140417740.0, + "step": 3679 + }, + { + "epoch": 0.4681338252130772, + "ewc_loss": 0.028038397431373596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4019198715686798e-05, + "grad_norm": 22.826269149780273, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8499726057052612, + "num_tokens": 140452454.0, + "step": 3680 + }, + { + "epoch": 0.46826103549166775, + "ewc_loss": 0.027976736426353455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3988367754791398e-05, + "grad_norm": 22.862394332885742, + "learning_rate": 1e-06, + "loss": 0.5482, + "mean_token_accuracy": 0.828954815864563, + "num_tokens": 140492981.0, + "step": 3681 + }, + { + "epoch": 0.4683882457702582, + "ewc_loss": 0.028063451871275902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4031726095709018e-05, + "grad_norm": 22.885425567626953, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8417260646820068, + "num_tokens": 140534131.0, + "step": 3682 + }, + { + "epoch": 0.46851545604884876, + "ewc_loss": 0.027994219213724136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3997109817864839e-05, + "grad_norm": 22.89762306213379, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8455275893211365, + "num_tokens": 140576761.0, + "step": 3683 + }, + { + "epoch": 0.4686426663274393, + "ewc_loss": 0.028056390583515167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4028195437276736e-05, + "grad_norm": 22.919288635253906, + "learning_rate": 1e-06, + "loss": 0.5549, + "mean_token_accuracy": 0.834425687789917, + "num_tokens": 140609538.0, + "step": 3684 + }, + { + "epoch": 0.46876987660602976, + "ewc_loss": 0.02802233397960663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4011166967975441e-05, + "grad_norm": 22.85552406311035, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8493924140930176, + "num_tokens": 140641802.0, + "step": 3685 + }, + { + "epoch": 0.4688970868846203, + "ewc_loss": 0.027985621243715286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.3992810636409558e-05, + "grad_norm": 22.826988220214844, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.8406500816345215, + "num_tokens": 140685832.0, + "step": 3686 + }, + { + "epoch": 0.4690242971632108, + "ewc_loss": 0.028063831850886345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4031916180101689e-05, + "grad_norm": 22.840579986572266, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8409228324890137, + "num_tokens": 140725333.0, + "step": 3687 + }, + { + "epoch": 0.4691515074418013, + "ewc_loss": 0.028063807636499405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4031903447175864e-05, + "grad_norm": 22.865575790405273, + "learning_rate": 1e-06, + "loss": 0.4669, + 
"mean_token_accuracy": 0.8516176342964172, + "num_tokens": 140760838.0, + "step": 3688 + }, + { + "epoch": 0.4692787177203918, + "ewc_loss": 0.028102099895477295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4051050129637588e-05, + "grad_norm": 22.79242515563965, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8425152897834778, + "num_tokens": 140797367.0, + "step": 3689 + }, + { + "epoch": 0.46940592799898234, + "ewc_loss": 0.028053872287273407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.402693578711478e-05, + "grad_norm": 22.854116439819336, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.851378858089447, + "num_tokens": 140839778.0, + "step": 3690 + }, + { + "epoch": 0.4695331382775728, + "ewc_loss": 0.0281422957777977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4071148143557366e-05, + "grad_norm": 22.969022750854492, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.847093939781189, + "num_tokens": 140880340.0, + "step": 3691 + }, + { + "epoch": 0.46966034855616334, + "ewc_loss": 0.02808467112481594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.40423353514052e-05, + "grad_norm": 22.890512466430664, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8458054065704346, + "num_tokens": 140915451.0, + "step": 3692 + }, + { + "epoch": 0.46978755883475387, + "ewc_loss": 0.02805941551923752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4029707926965784e-05, + "grad_norm": 22.90074920654297, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.84968101978302, + "num_tokens": 140956967.0, + "step": 3693 + }, + { + "epoch": 0.46991476911334434, + "ewc_loss": 0.0280937310308218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.404686554451473e-05, + "grad_norm": 22.910619735717773, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.867547869682312, + "num_tokens": 140996640.0, + "step": 3694 + }, + { + "epoch": 
0.47004197939193487, + "ewc_loss": 0.028105465695261955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4052732694835868e-05, + "grad_norm": 22.931840896606445, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8511274456977844, + "num_tokens": 141036094.0, + "step": 3695 + }, + { + "epoch": 0.4701691896705254, + "ewc_loss": 0.028094593435525894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4047296645003371e-05, + "grad_norm": 22.964691162109375, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8417158126831055, + "num_tokens": 141069033.0, + "step": 3696 + }, + { + "epoch": 0.4702963999491159, + "ewc_loss": 0.02809728868305683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4048644516151398e-05, + "grad_norm": 22.90277862548828, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8694897890090942, + "num_tokens": 141111075.0, + "step": 3697 + }, + { + "epoch": 0.4704236102277064, + "ewc_loss": 0.028027605265378952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.401380268362118e-05, + "grad_norm": 22.891265869140625, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8675459623336792, + "num_tokens": 141145270.0, + "step": 3698 + }, + { + "epoch": 0.47055082050629693, + "ewc_loss": 0.02809845842421055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4049229321244638e-05, + "grad_norm": 22.947021484375, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8660352826118469, + "num_tokens": 141182132.0, + "step": 3699 + }, + { + "epoch": 0.4706780307848874, + "ewc_loss": 0.028102928772568703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4051463949726894e-05, + "grad_norm": 22.900012969970703, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8531092405319214, + "num_tokens": 141217455.0, + "step": 3700 + }, + { + "epoch": 0.47080524106347793, + "ewc_loss": 0.02808344177901745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4041720532986801e-05, + "grad_norm": 23.016250610351562, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8517076373100281, + "num_tokens": 141249680.0, + "step": 3701 + }, + { + "epoch": 0.47093245134206846, + "ewc_loss": 0.028082268312573433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4041133908904158e-05, + "grad_norm": 22.83510971069336, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8530582189559937, + "num_tokens": 141292085.0, + "step": 3702 + }, + { + "epoch": 0.47105966162065893, + "ewc_loss": 0.028059063479304314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.402953148499364e-05, + "grad_norm": 22.923429489135742, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8607385158538818, + "num_tokens": 141331455.0, + "step": 3703 + }, + { + "epoch": 0.47118687189924946, + "ewc_loss": 0.028089741244912148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4044871022633743e-05, + "grad_norm": 22.831754684448242, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8530067205429077, + "num_tokens": 141365623.0, + "step": 3704 + }, + { + "epoch": 0.47131408217784, + "ewc_loss": 0.028097951784729958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4048975572222844e-05, + "grad_norm": 22.970325469970703, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8438553810119629, + "num_tokens": 141399033.0, + "step": 3705 + }, + { + "epoch": 0.47144129245643046, + "ewc_loss": 0.028144214302301407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4072106750973035e-05, + "grad_norm": 22.82295036315918, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.858599841594696, + "num_tokens": 141430436.0, + "step": 3706 + }, + { + "epoch": 0.471568502735021, + "ewc_loss": 0.028034741058945656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4017370631336235e-05, + "grad_norm": 22.925861358642578, + "learning_rate": 1e-06, + "loss": 0.4078, + 
"mean_token_accuracy": 0.8691504001617432, + "num_tokens": 141463169.0, + "step": 3707 + }, + { + "epoch": 0.4716957130136115, + "ewc_loss": 0.028199858963489532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.409992910339497e-05, + "grad_norm": 22.981889724731445, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8696602582931519, + "num_tokens": 141495524.0, + "step": 3708 + }, + { + "epoch": 0.471822923292202, + "ewc_loss": 0.028049061074852943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4024530173628591e-05, + "grad_norm": 22.772443771362305, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8640939593315125, + "num_tokens": 141533099.0, + "step": 3709 + }, + { + "epoch": 0.4719501335707925, + "ewc_loss": 0.028127597644925117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4063798516872339e-05, + "grad_norm": 22.944982528686523, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8580331802368164, + "num_tokens": 141576389.0, + "step": 3710 + }, + { + "epoch": 0.47207734384938305, + "ewc_loss": 0.028146013617515564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.407300715072779e-05, + "grad_norm": 22.99371337890625, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8545089960098267, + "num_tokens": 141611870.0, + "step": 3711 + }, + { + "epoch": 0.4722045541279735, + "ewc_loss": 0.028055688366293907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4027844372321852e-05, + "grad_norm": 22.90361785888672, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8642017841339111, + "num_tokens": 141648539.0, + "step": 3712 + }, + { + "epoch": 0.47233176440656405, + "ewc_loss": 0.028092630207538605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4046315300220158e-05, + "grad_norm": 22.935001373291016, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8570349216461182, + "num_tokens": 141688549.0, + "step": 3713 + }, + { + "epoch": 
0.4724589746851546, + "ewc_loss": 0.028114525601267815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4057262887945399e-05, + "grad_norm": 22.860748291015625, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8624799251556396, + "num_tokens": 141725294.0, + "step": 3714 + }, + { + "epoch": 0.47258618496374505, + "ewc_loss": 0.028120778501033783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4060388821235392e-05, + "grad_norm": 22.94438934326172, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8377814292907715, + "num_tokens": 141756761.0, + "step": 3715 + }, + { + "epoch": 0.4727133952423356, + "ewc_loss": 0.028150852769613266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4075426406634506e-05, + "grad_norm": 22.844255447387695, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8372519016265869, + "num_tokens": 141790417.0, + "step": 3716 + }, + { + "epoch": 0.4728406055209261, + "ewc_loss": 0.028111817315220833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4055908650334459e-05, + "grad_norm": 22.776906967163086, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8645511269569397, + "num_tokens": 141830749.0, + "step": 3717 + }, + { + "epoch": 0.4729678157995166, + "ewc_loss": 0.028130825608968735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4065412869967986e-05, + "grad_norm": 22.901681900024414, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8568255305290222, + "num_tokens": 141868433.0, + "step": 3718 + }, + { + "epoch": 0.4730950260781071, + "ewc_loss": 0.028153520077466965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4076759725867305e-05, + "grad_norm": 22.84447479248047, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8709696531295776, + "num_tokens": 141907140.0, + "step": 3719 + }, + { + "epoch": 0.47322223635669763, + "ewc_loss": 0.028076617047190666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4038308108865749e-05, + "grad_norm": 22.79981231689453, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8671042919158936, + "num_tokens": 141947436.0, + "step": 3720 + }, + { + "epoch": 0.4733494466352881, + "ewc_loss": 0.028214039281010628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4107019524089992e-05, + "grad_norm": 22.84772300720215, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8463617563247681, + "num_tokens": 141986876.0, + "step": 3721 + }, + { + "epoch": 0.47347665691387864, + "ewc_loss": 0.028110699728131294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.405535022058757e-05, + "grad_norm": 22.811132431030273, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8582203388214111, + "num_tokens": 142025806.0, + "step": 3722 + }, + { + "epoch": 0.47360386719246916, + "ewc_loss": 0.028198715299367905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4099357940722257e-05, + "grad_norm": 22.93280792236328, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8535192012786865, + "num_tokens": 142057932.0, + "step": 3723 + }, + { + "epoch": 0.47373107747105964, + "ewc_loss": 0.028203969821333885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4101984561420977e-05, + "grad_norm": 22.90252113342285, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8572840690612793, + "num_tokens": 142088880.0, + "step": 3724 + }, + { + "epoch": 0.47385828774965016, + "ewc_loss": 0.028179220855236053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4089610886003356e-05, + "grad_norm": 22.905832290649414, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8665246963500977, + "num_tokens": 142124959.0, + "step": 3725 + }, + { + "epoch": 0.4739854980282407, + "ewc_loss": 0.028147323057055473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4073661986913066e-05, + "grad_norm": 22.853437423706055, + "learning_rate": 1e-06, + "loss": 0.5321, + 
"mean_token_accuracy": 0.8398319482803345, + "num_tokens": 142160667.0, + "step": 3726 + }, + { + "epoch": 0.47411270830683117, + "ewc_loss": 0.02822314016520977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4111569726082962e-05, + "grad_norm": 22.883087158203125, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8531440496444702, + "num_tokens": 142205017.0, + "step": 3727 + }, + { + "epoch": 0.4742399185854217, + "ewc_loss": 0.028142200782895088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4071100849832874e-05, + "grad_norm": 22.83987808227539, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8605045080184937, + "num_tokens": 142241479.0, + "step": 3728 + }, + { + "epoch": 0.4743671288640122, + "ewc_loss": 0.028168782591819763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4084391295909882e-05, + "grad_norm": 22.835329055786133, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8561050295829773, + "num_tokens": 142280881.0, + "step": 3729 + }, + { + "epoch": 0.4744943391426027, + "ewc_loss": 0.028226366266608238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4113183169683907e-05, + "grad_norm": 22.901710510253906, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8621710538864136, + "num_tokens": 142328248.0, + "step": 3730 + }, + { + "epoch": 0.4746215494211932, + "ewc_loss": 0.028219908475875854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4109954463492613e-05, + "grad_norm": 22.87590789794922, + "learning_rate": 1e-06, + "loss": 0.5327, + "mean_token_accuracy": 0.8381624817848206, + "num_tokens": 142364872.0, + "step": 3731 + }, + { + "epoch": 0.47474875969978375, + "ewc_loss": 0.028208190575242043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4104095498623792e-05, + "grad_norm": 22.82853889465332, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8360315561294556, + "num_tokens": 142400494.0, + "step": 3732 + }, + { + "epoch": 
0.4748759699783743, + "ewc_loss": 0.028245702385902405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4122851098363753e-05, + "grad_norm": 22.831647872924805, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8507965803146362, + "num_tokens": 142433739.0, + "step": 3733 + }, + { + "epoch": 0.47500318025696475, + "ewc_loss": 0.028205769136548042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4102884961175732e-05, + "grad_norm": 22.840696334838867, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.862358808517456, + "num_tokens": 142469777.0, + "step": 3734 + }, + { + "epoch": 0.4751303905355553, + "ewc_loss": 0.02828926593065262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4144632586976513e-05, + "grad_norm": 22.87407112121582, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8451849818229675, + "num_tokens": 142510052.0, + "step": 3735 + }, + { + "epoch": 0.4752576008141458, + "ewc_loss": 0.028235383331775665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4117691534920596e-05, + "grad_norm": 22.89539337158203, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8420120477676392, + "num_tokens": 142545689.0, + "step": 3736 + }, + { + "epoch": 0.4753848110927363, + "ewc_loss": 0.028303885832428932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4151943105389364e-05, + "grad_norm": 22.882959365844727, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8534464836120605, + "num_tokens": 142585916.0, + "step": 3737 + }, + { + "epoch": 0.4755120213713268, + "ewc_loss": 0.0282233078032732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4111654309090227e-05, + "grad_norm": 22.869436264038086, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8609437942504883, + "num_tokens": 142623019.0, + "step": 3738 + }, + { + "epoch": 0.47563923164991734, + "ewc_loss": 0.028277631849050522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4138816368358675e-05, + "grad_norm": 22.893890380859375, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8511744141578674, + "num_tokens": 142660986.0, + "step": 3739 + }, + { + "epoch": 0.4757664419285078, + "ewc_loss": 0.028300898149609566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4150448805594351e-05, + "grad_norm": 23.067346572875977, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8575730323791504, + "num_tokens": 142702876.0, + "step": 3740 + }, + { + "epoch": 0.47589365220709834, + "ewc_loss": 0.028283627703785896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4141814062895719e-05, + "grad_norm": 22.95698356628418, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8428642749786377, + "num_tokens": 142744082.0, + "step": 3741 + }, + { + "epoch": 0.47602086248568887, + "ewc_loss": 0.028234634548425674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4117317732598167e-05, + "grad_norm": 23.207378387451172, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8347222805023193, + "num_tokens": 142777511.0, + "step": 3742 + }, + { + "epoch": 0.47614807276427934, + "ewc_loss": 0.02821001037955284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4105004993325565e-05, + "grad_norm": 22.869413375854492, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8498388528823853, + "num_tokens": 142818882.0, + "step": 3743 + }, + { + "epoch": 0.47627528304286987, + "ewc_loss": 0.028161747381091118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4080873370403424e-05, + "grad_norm": 23.127124786376953, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.856900691986084, + "num_tokens": 142852771.0, + "step": 3744 + }, + { + "epoch": 0.4764024933214604, + "ewc_loss": 0.02826237678527832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4131188436294906e-05, + "grad_norm": 22.957454681396484, + "learning_rate": 1e-06, + "loss": 0.464, + 
"mean_token_accuracy": 0.8547084331512451, + "num_tokens": 142888398.0, + "step": 3745 + }, + { + "epoch": 0.47652970360005087, + "ewc_loss": 0.028129028156399727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4064514289202634e-05, + "grad_norm": 22.925798416137695, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8698441982269287, + "num_tokens": 142924789.0, + "step": 3746 + }, + { + "epoch": 0.4766569138786414, + "ewc_loss": 0.028172273188829422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4086136616242584e-05, + "grad_norm": 22.891132354736328, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8618499040603638, + "num_tokens": 142958843.0, + "step": 3747 + }, + { + "epoch": 0.4767841241572319, + "ewc_loss": 0.028202272951602936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4101136912358925e-05, + "grad_norm": 22.965160369873047, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8649449944496155, + "num_tokens": 142996408.0, + "step": 3748 + }, + { + "epoch": 0.4769113344358224, + "ewc_loss": 0.028246372938156128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4123186701908708e-05, + "grad_norm": 22.937559127807617, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8465772867202759, + "num_tokens": 143040031.0, + "step": 3749 + }, + { + "epoch": 0.4770385447144129, + "ewc_loss": 0.028172513470053673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4086256669543218e-05, + "grad_norm": 22.817428588867188, + "learning_rate": 1e-06, + "loss": 0.5329, + "mean_token_accuracy": 0.8325484991073608, + "num_tokens": 143081648.0, + "step": 3750 + }, + { + "epoch": 0.47716575499300345, + "ewc_loss": 0.02827240340411663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4136201571091078e-05, + "grad_norm": 23.070545196533203, + "learning_rate": 1e-06, + "loss": 0.5279, + "mean_token_accuracy": 0.8331757187843323, + "num_tokens": 143118913.0, + "step": 3751 + }, + { + "epoch": 
0.4772929652715939, + "ewc_loss": 0.02817467227578163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4087336239754222e-05, + "grad_norm": 22.805246353149414, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.863304615020752, + "num_tokens": 143153858.0, + "step": 3752 + }, + { + "epoch": 0.47742017555018446, + "ewc_loss": 0.02822270616889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.411135326634394e-05, + "grad_norm": 23.017423629760742, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.864490270614624, + "num_tokens": 143190977.0, + "step": 3753 + }, + { + "epoch": 0.477547385828775, + "ewc_loss": 0.028221379965543747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4110690244706348e-05, + "grad_norm": 22.863388061523438, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8472124934196472, + "num_tokens": 143228009.0, + "step": 3754 + }, + { + "epoch": 0.47767459610736546, + "ewc_loss": 0.028236936777830124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.411846824339591e-05, + "grad_norm": 22.971221923828125, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8434299826622009, + "num_tokens": 143269820.0, + "step": 3755 + }, + { + "epoch": 0.477801806385956, + "ewc_loss": 0.028275402262806892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4137701327854302e-05, + "grad_norm": 22.96546745300293, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8585110902786255, + "num_tokens": 143303441.0, + "step": 3756 + }, + { + "epoch": 0.4779290166645465, + "ewc_loss": 0.02822534367442131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.411267203366151e-05, + "grad_norm": 22.920419692993164, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8456281423568726, + "num_tokens": 143338748.0, + "step": 3757 + }, + { + "epoch": 0.478056226943137, + "ewc_loss": 0.02825103886425495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4125519555818755e-05, + "grad_norm": 22.906513214111328, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8538745641708374, + "num_tokens": 143373900.0, + "step": 3758 + }, + { + "epoch": 0.4781834372217275, + "ewc_loss": 0.028209082782268524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.410454115102766e-05, + "grad_norm": 22.905254364013672, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8490132093429565, + "num_tokens": 143411190.0, + "step": 3759 + }, + { + "epoch": 0.47831064750031804, + "ewc_loss": 0.028290295973420143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4145148270472419e-05, + "grad_norm": 22.855905532836914, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.851093053817749, + "num_tokens": 143449470.0, + "step": 3760 + }, + { + "epoch": 0.4784378577789085, + "ewc_loss": 0.02823370136320591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4116850252321456e-05, + "grad_norm": 22.89597511291504, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8597191572189331, + "num_tokens": 143485868.0, + "step": 3761 + }, + { + "epoch": 0.47856506805749904, + "ewc_loss": 0.028271663933992386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4135832316242158e-05, + "grad_norm": 22.895761489868164, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8628512620925903, + "num_tokens": 143520334.0, + "step": 3762 + }, + { + "epoch": 0.47869227833608957, + "ewc_loss": 0.028257222846150398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4128611837804783e-05, + "grad_norm": 22.967100143432617, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8466746211051941, + "num_tokens": 143557752.0, + "step": 3763 + }, + { + "epoch": 0.47881948861468004, + "ewc_loss": 0.028322050347924232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4161025319481269e-05, + "grad_norm": 22.931657791137695, + "learning_rate": 1e-06, + "loss": 0.477, + 
"mean_token_accuracy": 0.8540141582489014, + "num_tokens": 143593385.0, + "step": 3764 + }, + { + "epoch": 0.4789466988932706, + "ewc_loss": 0.028249669820070267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.412483470630832e-05, + "grad_norm": 23.014062881469727, + "learning_rate": 1e-06, + "loss": 0.5333, + "mean_token_accuracy": 0.8331476449966431, + "num_tokens": 143631975.0, + "step": 3765 + }, + { + "epoch": 0.4790739091718611, + "ewc_loss": 0.028305552899837494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4152776202536188e-05, + "grad_norm": 22.949405670166016, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8361045718193054, + "num_tokens": 143668382.0, + "step": 3766 + }, + { + "epoch": 0.4792011194504516, + "ewc_loss": 0.028281979262828827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4140990060695913e-05, + "grad_norm": 22.988666534423828, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8392062783241272, + "num_tokens": 143706623.0, + "step": 3767 + }, + { + "epoch": 0.4793283297290421, + "ewc_loss": 0.028302108868956566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4151054529065732e-05, + "grad_norm": 22.87008285522461, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8564357161521912, + "num_tokens": 143745664.0, + "step": 3768 + }, + { + "epoch": 0.47945554000763263, + "ewc_loss": 0.028269590809941292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4134795492282137e-05, + "grad_norm": 23.02044677734375, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8449500799179077, + "num_tokens": 143780761.0, + "step": 3769 + }, + { + "epoch": 0.4795827502862231, + "ewc_loss": 0.02836081013083458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4180404832586646e-05, + "grad_norm": 22.94119644165039, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8575690388679504, + "num_tokens": 143825298.0, + "step": 3770 + }, + { + "epoch": 
0.47970996056481363, + "ewc_loss": 0.028280504047870636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4140252460492775e-05, + "grad_norm": 22.98095703125, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8726781010627747, + "num_tokens": 143859855.0, + "step": 3771 + }, + { + "epoch": 0.47983717084340416, + "ewc_loss": 0.028313657268881798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4156828910927288e-05, + "grad_norm": 22.91043472290039, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8674923181533813, + "num_tokens": 143898553.0, + "step": 3772 + }, + { + "epoch": 0.47996438112199463, + "ewc_loss": 0.0282752588391304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4137629477772862e-05, + "grad_norm": 22.99049186706543, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8514032363891602, + "num_tokens": 143935362.0, + "step": 3773 + }, + { + "epoch": 0.48009159140058516, + "ewc_loss": 0.02832581289112568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4162906154524535e-05, + "grad_norm": 22.93982696533203, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8545360565185547, + "num_tokens": 143978692.0, + "step": 3774 + }, + { + "epoch": 0.4802188016791757, + "ewc_loss": 0.0283003281801939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4150164133752696e-05, + "grad_norm": 22.960905075073242, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8618281483650208, + "num_tokens": 144020951.0, + "step": 3775 + }, + { + "epoch": 0.48034601195776616, + "ewc_loss": 0.028332917019724846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.416645864082966e-05, + "grad_norm": 22.970380783081055, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8629284501075745, + "num_tokens": 144058341.0, + "step": 3776 + }, + { + "epoch": 0.4804732222363567, + "ewc_loss": 0.028363680467009544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4181840015226044e-05, + "grad_norm": 22.955501556396484, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8464668989181519, + "num_tokens": 144098232.0, + "step": 3777 + }, + { + "epoch": 0.4806004325149472, + "ewc_loss": 0.028313472867012024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4156736142467707e-05, + "grad_norm": 23.034257888793945, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8499024510383606, + "num_tokens": 144137537.0, + "step": 3778 + }, + { + "epoch": 0.4807276427935377, + "ewc_loss": 0.028366923332214355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4183461644279305e-05, + "grad_norm": 22.950544357299805, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.859026312828064, + "num_tokens": 144178459.0, + "step": 3779 + }, + { + "epoch": 0.4808548530721282, + "ewc_loss": 0.028254186734557152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4127092981652822e-05, + "grad_norm": 23.024810791015625, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8490664958953857, + "num_tokens": 144219257.0, + "step": 3780 + }, + { + "epoch": 0.48098206335071875, + "ewc_loss": 0.028340069577097893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4170034773997031e-05, + "grad_norm": 22.902799606323242, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8590428829193115, + "num_tokens": 144262232.0, + "step": 3781 + }, + { + "epoch": 0.4811092736293093, + "ewc_loss": 0.02824562042951584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4122810171102174e-05, + "grad_norm": 22.966445922851562, + "learning_rate": 1e-06, + "loss": 0.5271, + "mean_token_accuracy": 0.8317854404449463, + "num_tokens": 144297144.0, + "step": 3782 + }, + { + "epoch": 0.48123648390789975, + "ewc_loss": 0.02833358384668827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4166791515890509e-05, + "grad_norm": 22.91571617126465, + "learning_rate": 1e-06, + "loss": 0.4728, + 
"mean_token_accuracy": 0.854660153388977, + "num_tokens": 144336078.0, + "step": 3783 + }, + { + "epoch": 0.4813636941864903, + "ewc_loss": 0.02831677533686161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4158387784846127e-05, + "grad_norm": 23.002573013305664, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8460599184036255, + "num_tokens": 144380162.0, + "step": 3784 + }, + { + "epoch": 0.4814909044650808, + "ewc_loss": 0.028269026428461075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4134513548924588e-05, + "grad_norm": 22.944366455078125, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8384663462638855, + "num_tokens": 144416735.0, + "step": 3785 + }, + { + "epoch": 0.4816181147436713, + "ewc_loss": 0.028306806460022926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.415340284438571e-05, + "grad_norm": 22.959352493286133, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8428383469581604, + "num_tokens": 144452780.0, + "step": 3786 + }, + { + "epoch": 0.4817453250222618, + "ewc_loss": 0.02829570323228836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4147851288726088e-05, + "grad_norm": 22.960657119750977, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.859412431716919, + "num_tokens": 144494155.0, + "step": 3787 + }, + { + "epoch": 0.48187253530085233, + "ewc_loss": 0.02827891707420349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4139458471618127e-05, + "grad_norm": 22.91996192932129, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8632696866989136, + "num_tokens": 144533989.0, + "step": 3788 + }, + { + "epoch": 0.4819997455794428, + "ewc_loss": 0.02833200804889202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4166003893478774e-05, + "grad_norm": 22.962507247924805, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8497573137283325, + "num_tokens": 144577409.0, + "step": 3789 + }, + { + "epoch": 
0.48212695585803333, + "ewc_loss": 0.028284866362810135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4142433428787626e-05, + "grad_norm": 22.94036865234375, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8457146286964417, + "num_tokens": 144618501.0, + "step": 3790 + }, + { + "epoch": 0.48225416613662386, + "ewc_loss": 0.02832278423011303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4161391845846083e-05, + "grad_norm": 23.029796600341797, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8615217208862305, + "num_tokens": 144659454.0, + "step": 3791 + }, + { + "epoch": 0.48238137641521434, + "ewc_loss": 0.028289368376135826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4144684428174514e-05, + "grad_norm": 22.918245315551758, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8590900301933289, + "num_tokens": 144695580.0, + "step": 3792 + }, + { + "epoch": 0.48250858669380486, + "ewc_loss": 0.028286578133702278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4143289263301995e-05, + "grad_norm": 22.917072296142578, + "learning_rate": 1e-06, + "loss": 0.5244, + "mean_token_accuracy": 0.8361858129501343, + "num_tokens": 144735586.0, + "step": 3793 + }, + { + "epoch": 0.4826357969723954, + "ewc_loss": 0.028305573388934135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.415278711647261e-05, + "grad_norm": 22.93744468688965, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8545622825622559, + "num_tokens": 144776371.0, + "step": 3794 + }, + { + "epoch": 0.48276300725098586, + "ewc_loss": 0.028364963829517365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4182482118485495e-05, + "grad_norm": 22.94120979309082, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8483489155769348, + "num_tokens": 144816724.0, + "step": 3795 + }, + { + "epoch": 0.4828902175295764, + "ewc_loss": 0.028304466977715492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.415223323419923e-05, + "grad_norm": 22.952190399169922, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8599458932876587, + "num_tokens": 144854591.0, + "step": 3796 + }, + { + "epoch": 0.4830174278081669, + "ewc_loss": 0.02832692861557007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4163464584271424e-05, + "grad_norm": 22.918460845947266, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8553390502929688, + "num_tokens": 144893694.0, + "step": 3797 + }, + { + "epoch": 0.4831446380867574, + "ewc_loss": 0.02830633707344532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4153168194752652e-05, + "grad_norm": 22.92161750793457, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8497897982597351, + "num_tokens": 144932985.0, + "step": 3798 + }, + { + "epoch": 0.4832718483653479, + "ewc_loss": 0.028298690915107727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.41493455885211e-05, + "grad_norm": 22.902141571044922, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8529845476150513, + "num_tokens": 144967823.0, + "step": 3799 + }, + { + "epoch": 0.48339905864393845, + "ewc_loss": 0.02833194099366665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4165970242174808e-05, + "grad_norm": 22.99020767211914, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8738690614700317, + "num_tokens": 145007750.0, + "step": 3800 + }, + { + "epoch": 0.4835262689225289, + "ewc_loss": 0.028367776423692703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4183888197294436e-05, + "grad_norm": 22.967348098754883, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8484742641448975, + "num_tokens": 145048596.0, + "step": 3801 + }, + { + "epoch": 0.48365347920111945, + "ewc_loss": 0.028311125934123993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.415556289430242e-05, + "grad_norm": 22.931123733520508, + "learning_rate": 1e-06, + "loss": 0.4936, + 
"mean_token_accuracy": 0.8429701328277588, + "num_tokens": 145080868.0, + "step": 3802 + }, + { + "epoch": 0.48378068947971, + "ewc_loss": 0.02829919010400772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4149594790069386e-05, + "grad_norm": 22.874269485473633, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8630387187004089, + "num_tokens": 145115865.0, + "step": 3803 + }, + { + "epoch": 0.48390789975830045, + "ewc_loss": 0.028372740373015404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4186370208335575e-05, + "grad_norm": 22.990955352783203, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8674957752227783, + "num_tokens": 145151748.0, + "step": 3804 + }, + { + "epoch": 0.484035110036891, + "ewc_loss": 0.02834153175354004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4170766007737257e-05, + "grad_norm": 22.80360221862793, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8543195128440857, + "num_tokens": 145187074.0, + "step": 3805 + }, + { + "epoch": 0.4841623203154815, + "ewc_loss": 0.028356218710541725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4178109267959371e-05, + "grad_norm": 23.030729293823242, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8712045550346375, + "num_tokens": 145224855.0, + "step": 3806 + }, + { + "epoch": 0.484289530594072, + "ewc_loss": 0.02839052863419056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4195264157024212e-05, + "grad_norm": 22.89766502380371, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8639311790466309, + "num_tokens": 145266538.0, + "step": 3807 + }, + { + "epoch": 0.4844167408726625, + "ewc_loss": 0.02829255908727646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4146279681881424e-05, + "grad_norm": 22.854814529418945, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8588129281997681, + "num_tokens": 145303560.0, + "step": 3808 + }, + { + "epoch": 
0.48454395115125304, + "ewc_loss": 0.028380950912833214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4190475667419378e-05, + "grad_norm": 22.811491012573242, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8545643091201782, + "num_tokens": 145345985.0, + "step": 3809 + }, + { + "epoch": 0.4846711614298435, + "ewc_loss": 0.028378695249557495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.418934789398918e-05, + "grad_norm": 22.95867919921875, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8446210622787476, + "num_tokens": 145382132.0, + "step": 3810 + }, + { + "epoch": 0.48479837170843404, + "ewc_loss": 0.028465313836932182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4232657122192904e-05, + "grad_norm": 22.989622116088867, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8442012667655945, + "num_tokens": 145418992.0, + "step": 3811 + }, + { + "epoch": 0.48492558198702457, + "ewc_loss": 0.028451647609472275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4225824088498484e-05, + "grad_norm": 22.95513343811035, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8531038761138916, + "num_tokens": 145455892.0, + "step": 3812 + }, + { + "epoch": 0.48505279226561504, + "ewc_loss": 0.02845389023423195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.422694549546577e-05, + "grad_norm": 22.986129760742188, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8517086505889893, + "num_tokens": 145497217.0, + "step": 3813 + }, + { + "epoch": 0.48518000254420557, + "ewc_loss": 0.028443772345781326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4221885976439808e-05, + "grad_norm": 23.139596939086914, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8602281808853149, + "num_tokens": 145533822.0, + "step": 3814 + }, + { + "epoch": 0.4853072128227961, + "ewc_loss": 0.02841228060424328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4206139894668013e-05, + "grad_norm": 22.977933883666992, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8732296824455261, + "num_tokens": 145572967.0, + "step": 3815 + }, + { + "epoch": 0.48543442310138657, + "ewc_loss": 0.028378544375300407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4189272405928932e-05, + "grad_norm": 23.107479095458984, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8678404092788696, + "num_tokens": 145609436.0, + "step": 3816 + }, + { + "epoch": 0.4855616333799771, + "ewc_loss": 0.028436781838536263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4218390788300894e-05, + "grad_norm": 23.089263916015625, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8661905527114868, + "num_tokens": 145645083.0, + "step": 3817 + }, + { + "epoch": 0.4856888436585676, + "ewc_loss": 0.02835589461028576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4177947377902456e-05, + "grad_norm": 23.094228744506836, + "learning_rate": 1e-06, + "loss": 0.5456, + "mean_token_accuracy": 0.8361146450042725, + "num_tokens": 145680249.0, + "step": 3818 + }, + { + "epoch": 0.4858160539371581, + "ewc_loss": 0.028401201590895653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4200601071934216e-05, + "grad_norm": 23.04507064819336, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8592655658721924, + "num_tokens": 145718569.0, + "step": 3819 + }, + { + "epoch": 0.4859432642157486, + "ewc_loss": 0.02835756540298462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4178782294038683e-05, + "grad_norm": 22.990638732910156, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8376806974411011, + "num_tokens": 145751868.0, + "step": 3820 + }, + { + "epoch": 0.48607047449433916, + "ewc_loss": 0.028401069343090057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4200534678820986e-05, + "grad_norm": 23.198617935180664, + "learning_rate": 1e-06, + "loss": 0.4464, + 
"mean_token_accuracy": 0.8587581515312195, + "num_tokens": 145787881.0, + "step": 3821 + }, + { + "epoch": 0.48619768477292963, + "ewc_loss": 0.028392791748046875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4196395568433218e-05, + "grad_norm": 22.99345588684082, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.861552357673645, + "num_tokens": 145824010.0, + "step": 3822 + }, + { + "epoch": 0.48632489505152016, + "ewc_loss": 0.02831178717315197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4155893950373866e-05, + "grad_norm": 22.975627899169922, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8614438772201538, + "num_tokens": 145860981.0, + "step": 3823 + }, + { + "epoch": 0.4864521053301107, + "ewc_loss": 0.02838744968175888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4193725291988812e-05, + "grad_norm": 23.074100494384766, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8581737279891968, + "num_tokens": 145900687.0, + "step": 3824 + }, + { + "epoch": 0.48657931560870116, + "ewc_loss": 0.02840903215110302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4204516446625348e-05, + "grad_norm": 22.96457290649414, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8448758125305176, + "num_tokens": 145938888.0, + "step": 3825 + }, + { + "epoch": 0.4867065258872917, + "ewc_loss": 0.02838488481938839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.419244199496461e-05, + "grad_norm": 22.96590805053711, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.849920392036438, + "num_tokens": 145987060.0, + "step": 3826 + }, + { + "epoch": 0.4868337361658822, + "ewc_loss": 0.0284479521214962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4223975995264482e-05, + "grad_norm": 23.003902435302734, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.850077211856842, + "num_tokens": 146026447.0, + "step": 3827 + }, + { + "epoch": 
0.4869609464444727, + "ewc_loss": 0.02838764898478985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4193824426911306e-05, + "grad_norm": 23.07149887084961, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8576287627220154, + "num_tokens": 146067099.0, + "step": 3828 + }, + { + "epoch": 0.4870881567230632, + "ewc_loss": 0.028416084125638008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.420804164808942e-05, + "grad_norm": 23.089527130126953, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8410413265228271, + "num_tokens": 146110767.0, + "step": 3829 + }, + { + "epoch": 0.48721536700165374, + "ewc_loss": 0.028405779972672462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4202890270098578e-05, + "grad_norm": 22.973913192749023, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8461191654205322, + "num_tokens": 146147603.0, + "step": 3830 + }, + { + "epoch": 0.48734257728024427, + "ewc_loss": 0.028374774381518364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4187387023412157e-05, + "grad_norm": 23.029996871948242, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8644862771034241, + "num_tokens": 146188821.0, + "step": 3831 + }, + { + "epoch": 0.48746978755883474, + "ewc_loss": 0.02839774824678898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4198873941495549e-05, + "grad_norm": 23.05176544189453, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8515968918800354, + "num_tokens": 146231886.0, + "step": 3832 + }, + { + "epoch": 0.48759699783742527, + "ewc_loss": 0.028397146612405777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4198572898749262e-05, + "grad_norm": 22.947866439819336, + "learning_rate": 1e-06, + "loss": 0.5294, + "mean_token_accuracy": 0.831920862197876, + "num_tokens": 146271965.0, + "step": 3833 + }, + { + "epoch": 0.4877242081160158, + "ewc_loss": 0.02839583158493042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4197916243574582e-05, + "grad_norm": 22.952247619628906, + "learning_rate": 1e-06, + "loss": 0.5127, + "mean_token_accuracy": 0.8417951464653015, + "num_tokens": 146316686.0, + "step": 3834 + }, + { + "epoch": 0.4878514183946063, + "ewc_loss": 0.02842448092997074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4212240785127506e-05, + "grad_norm": 23.020700454711914, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8447568416595459, + "num_tokens": 146355098.0, + "step": 3835 + }, + { + "epoch": 0.4879786286731968, + "ewc_loss": 0.02840944193303585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4204721082933247e-05, + "grad_norm": 22.975482940673828, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.859317421913147, + "num_tokens": 146390666.0, + "step": 3836 + }, + { + "epoch": 0.48810583895178733, + "ewc_loss": 0.028391148895025253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4195574294717517e-05, + "grad_norm": 23.04530143737793, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8618448972702026, + "num_tokens": 146426740.0, + "step": 3837 + }, + { + "epoch": 0.4882330492303778, + "ewc_loss": 0.028407644480466843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4203822502167895e-05, + "grad_norm": 22.957441329956055, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8453177213668823, + "num_tokens": 146462668.0, + "step": 3838 + }, + { + "epoch": 0.48836025950896833, + "ewc_loss": 0.02836306020617485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.418152987753274e-05, + "grad_norm": 23.059545516967773, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8472009897232056, + "num_tokens": 146500160.0, + "step": 3839 + }, + { + "epoch": 0.48848746978755886, + "ewc_loss": 0.0284311193972826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4215559531294275e-05, + "grad_norm": 22.938419342041016, + "learning_rate": 1e-06, + "loss": 0.4867, + 
"mean_token_accuracy": 0.8490315675735474, + "num_tokens": 146542963.0, + "step": 3840 + }, + { + "epoch": 0.48861468006614933, + "ewc_loss": 0.028332142159342766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4166071196086705e-05, + "grad_norm": 23.065807342529297, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8611270189285278, + "num_tokens": 146580649.0, + "step": 3841 + }, + { + "epoch": 0.48874189034473986, + "ewc_loss": 0.028446821495890617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.422341119905468e-05, + "grad_norm": 22.89236068725586, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8613881468772888, + "num_tokens": 146620823.0, + "step": 3842 + }, + { + "epoch": 0.4888691006233304, + "ewc_loss": 0.028386706486344337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4193353308655787e-05, + "grad_norm": 22.98952865600586, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8648264408111572, + "num_tokens": 146663700.0, + "step": 3843 + }, + { + "epoch": 0.48899631090192086, + "ewc_loss": 0.02847834676504135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4239173651731107e-05, + "grad_norm": 23.04865074157715, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8487862348556519, + "num_tokens": 146698211.0, + "step": 3844 + }, + { + "epoch": 0.4891235211805114, + "ewc_loss": 0.02843395806849003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4216979252523743e-05, + "grad_norm": 22.99120330810547, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8524457812309265, + "num_tokens": 146738380.0, + "step": 3845 + }, + { + "epoch": 0.4892507314591019, + "ewc_loss": 0.028393976390361786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4196988558978774e-05, + "grad_norm": 23.018184661865234, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.856682300567627, + "num_tokens": 146775413.0, + "step": 3846 + }, + { + "epoch": 
0.4893779417376924, + "ewc_loss": 0.028414089232683182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4207044841896277e-05, + "grad_norm": 22.942302703857422, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8563367128372192, + "num_tokens": 146814774.0, + "step": 3847 + }, + { + "epoch": 0.4895051520162829, + "ewc_loss": 0.028436770662665367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4218385331332684e-05, + "grad_norm": 23.129243850708008, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8636196255683899, + "num_tokens": 146853957.0, + "step": 3848 + }, + { + "epoch": 0.48963236229487345, + "ewc_loss": 0.028448443859815598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.422422155883396e-05, + "grad_norm": 22.98782730102539, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8463397026062012, + "num_tokens": 146889060.0, + "step": 3849 + }, + { + "epoch": 0.4897595725734639, + "ewc_loss": 0.028355129063129425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.417756448063301e-05, + "grad_norm": 23.01141929626465, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.864079475402832, + "num_tokens": 146929849.0, + "step": 3850 + }, + { + "epoch": 0.48988678285205445, + "ewc_loss": 0.02848728746175766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4243643818190321e-05, + "grad_norm": 22.99712562561035, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8561477661132812, + "num_tokens": 146968376.0, + "step": 3851 + }, + { + "epoch": 0.490013993130645, + "ewc_loss": 0.028419064357876778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4209532309905626e-05, + "grad_norm": 23.12102508544922, + "learning_rate": 1e-06, + "loss": 0.523, + "mean_token_accuracy": 0.839740514755249, + "num_tokens": 147002871.0, + "step": 3852 + }, + { + "epoch": 0.49014120340923545, + "ewc_loss": 0.028429526835680008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4214763723430224e-05, + "grad_norm": 23.04547882080078, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8507139682769775, + "num_tokens": 147039412.0, + "step": 3853 + }, + { + "epoch": 0.490268413687826, + "ewc_loss": 0.028455516323447227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4227758583729155e-05, + "grad_norm": 23.07770538330078, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8577795028686523, + "num_tokens": 147080307.0, + "step": 3854 + }, + { + "epoch": 0.4903956239664165, + "ewc_loss": 0.028359003365039825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4179501704347786e-05, + "grad_norm": 22.928443908691406, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8618369698524475, + "num_tokens": 147120131.0, + "step": 3855 + }, + { + "epoch": 0.490522834245007, + "ewc_loss": 0.028451234102249146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.422561672370648e-05, + "grad_norm": 23.020965576171875, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8471763134002686, + "num_tokens": 147155760.0, + "step": 3856 + }, + { + "epoch": 0.4906500445235975, + "ewc_loss": 0.028452090919017792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4226045095711015e-05, + "grad_norm": 23.05646514892578, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8578981161117554, + "num_tokens": 147195406.0, + "step": 3857 + }, + { + "epoch": 0.49077725480218803, + "ewc_loss": 0.028436653316020966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4218326214177068e-05, + "grad_norm": 23.021677017211914, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8672371506690979, + "num_tokens": 147235688.0, + "step": 3858 + }, + { + "epoch": 0.4909044650807785, + "ewc_loss": 0.02841096557676792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4205482329998631e-05, + "grad_norm": 22.952810287475586, + "learning_rate": 1e-06, + "loss": 0.4311, + 
"mean_token_accuracy": 0.8629851937294006, + "num_tokens": 147271575.0, + "step": 3859 + }, + { + "epoch": 0.49103167535936904, + "ewc_loss": 0.028418436646461487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4209218534233514e-05, + "grad_norm": 23.004566192626953, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8511777520179749, + "num_tokens": 147315473.0, + "step": 3860 + }, + { + "epoch": 0.49115888563795956, + "ewc_loss": 0.028446106240153313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4223052858142182e-05, + "grad_norm": 23.025949478149414, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8588014841079712, + "num_tokens": 147359058.0, + "step": 3861 + }, + { + "epoch": 0.49128609591655004, + "ewc_loss": 0.02843089960515499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.421544948243536e-05, + "grad_norm": 23.014909744262695, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8596053719520569, + "num_tokens": 147394443.0, + "step": 3862 + }, + { + "epoch": 0.49141330619514056, + "ewc_loss": 0.028464721515774727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4232360626920126e-05, + "grad_norm": 23.082386016845703, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8500299453735352, + "num_tokens": 147436225.0, + "step": 3863 + }, + { + "epoch": 0.4915405164737311, + "ewc_loss": 0.028428392484784126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4214196198736317e-05, + "grad_norm": 23.082815170288086, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8640820980072021, + "num_tokens": 147473449.0, + "step": 3864 + }, + { + "epoch": 0.49166772675232157, + "ewc_loss": 0.028402553871273994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4201276826497633e-05, + "grad_norm": 23.02264404296875, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8531010150909424, + "num_tokens": 147510330.0, + "step": 3865 + }, + { + "epoch": 
0.4917949370309121, + "ewc_loss": 0.0284073818475008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4203690625436138e-05, + "grad_norm": 23.00861167907715, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8659014105796814, + "num_tokens": 147544683.0, + "step": 3866 + }, + { + "epoch": 0.4919221473095026, + "ewc_loss": 0.028367485851049423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4183742678142153e-05, + "grad_norm": 23.089805603027344, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8728920221328735, + "num_tokens": 147589330.0, + "step": 3867 + }, + { + "epoch": 0.4920493575880931, + "ewc_loss": 0.02839161455631256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.419580712536117e-05, + "grad_norm": 23.05462074279785, + "learning_rate": 1e-06, + "loss": 0.5698, + "mean_token_accuracy": 0.8247346878051758, + "num_tokens": 147626864.0, + "step": 3868 + }, + { + "epoch": 0.4921765678666836, + "ewc_loss": 0.028358619660139084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4179309800965711e-05, + "grad_norm": 23.062458038330078, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8367195129394531, + "num_tokens": 147672006.0, + "step": 3869 + }, + { + "epoch": 0.49230377814527415, + "ewc_loss": 0.028397666290402412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4198833014233969e-05, + "grad_norm": 23.121288299560547, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8453552722930908, + "num_tokens": 147717388.0, + "step": 3870 + }, + { + "epoch": 0.4924309884238646, + "ewc_loss": 0.02835807390511036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.417903695255518e-05, + "grad_norm": 23.006805419921875, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.842098593711853, + "num_tokens": 147759827.0, + "step": 3871 + }, + { + "epoch": 0.49255819870245515, + "ewc_loss": 0.028349285945296288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4174643183650915e-05, + "grad_norm": 23.049636840820312, + "learning_rate": 1e-06, + "loss": 0.5148, + "mean_token_accuracy": 0.8352788090705872, + "num_tokens": 147793742.0, + "step": 3872 + }, + { + "epoch": 0.4926854089810457, + "ewc_loss": 0.0284393522888422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.42196759043145e-05, + "grad_norm": 23.03221321105957, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8647708892822266, + "num_tokens": 147829881.0, + "step": 3873 + }, + { + "epoch": 0.49281261925963615, + "ewc_loss": 0.02842264249920845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4211321285984013e-05, + "grad_norm": 23.096973419189453, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8404027223587036, + "num_tokens": 147875051.0, + "step": 3874 + }, + { + "epoch": 0.4929398295382267, + "ewc_loss": 0.02840220183134079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.420110129402019e-05, + "grad_norm": 23.057294845581055, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8470439910888672, + "num_tokens": 147912786.0, + "step": 3875 + }, + { + "epoch": 0.4930670398168172, + "ewc_loss": 0.02837788127362728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4188940440362785e-05, + "grad_norm": 23.037391662597656, + "learning_rate": 1e-06, + "loss": 0.5286, + "mean_token_accuracy": 0.8334298133850098, + "num_tokens": 147954981.0, + "step": 3876 + }, + { + "epoch": 0.4931942500954077, + "ewc_loss": 0.02840784192085266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4203920727595687e-05, + "grad_norm": 23.0749568939209, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8478119969367981, + "num_tokens": 147992179.0, + "step": 3877 + }, + { + "epoch": 0.4933214603739982, + "ewc_loss": 0.02843424677848816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4217122952686623e-05, + "grad_norm": 22.988731384277344, + "learning_rate": 1e-06, + "loss": 0.458, + 
"mean_token_accuracy": 0.8590104579925537, + "num_tokens": 148029866.0, + "step": 3878 + }, + { + "epoch": 0.49344867065258874, + "ewc_loss": 0.028438888490200043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4219443983165547e-05, + "grad_norm": 23.10761260986328, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8552790880203247, + "num_tokens": 148070057.0, + "step": 3879 + }, + { + "epoch": 0.4935758809311792, + "ewc_loss": 0.028518235310912132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4259117961046286e-05, + "grad_norm": 22.936138153076172, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8580762147903442, + "num_tokens": 148108461.0, + "step": 3880 + }, + { + "epoch": 0.49370309120976974, + "ewc_loss": 0.028409086167812347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.42045428219717e-05, + "grad_norm": 23.020538330078125, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8471930027008057, + "num_tokens": 148147609.0, + "step": 3881 + }, + { + "epoch": 0.49383030148836027, + "ewc_loss": 0.028555570170283318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4277785339800175e-05, + "grad_norm": 23.0150146484375, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8494096994400024, + "num_tokens": 148190475.0, + "step": 3882 + }, + { + "epoch": 0.4939575117669508, + "ewc_loss": 0.028538867831230164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4269433449953794e-05, + "grad_norm": 23.137271881103516, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8655074238777161, + "num_tokens": 148221187.0, + "step": 3883 + }, + { + "epoch": 0.49408472204554127, + "ewc_loss": 0.028551386669278145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4275693501986098e-05, + "grad_norm": 23.00728416442871, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8627898693084717, + "num_tokens": 148260916.0, + "step": 3884 + }, + { + "epoch": 
0.4942119323241318, + "ewc_loss": 0.028491759672760963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4245880265661981e-05, + "grad_norm": 23.05508804321289, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8602168560028076, + "num_tokens": 148295591.0, + "step": 3885 + }, + { + "epoch": 0.4943391426027223, + "ewc_loss": 0.028557034209370613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4278517483035102e-05, + "grad_norm": 23.05719566345215, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8477228879928589, + "num_tokens": 148333445.0, + "step": 3886 + }, + { + "epoch": 0.4944663528813128, + "ewc_loss": 0.028502708300948143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4251354514271952e-05, + "grad_norm": 23.063417434692383, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8633858561515808, + "num_tokens": 148367951.0, + "step": 3887 + }, + { + "epoch": 0.4945935631599033, + "ewc_loss": 0.028516564518213272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4258282135415357e-05, + "grad_norm": 22.987977981567383, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8458771109580994, + "num_tokens": 148408491.0, + "step": 3888 + }, + { + "epoch": 0.49472077343849385, + "ewc_loss": 0.028543274849653244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.427163715561619e-05, + "grad_norm": 23.1090030670166, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8546409606933594, + "num_tokens": 148443883.0, + "step": 3889 + }, + { + "epoch": 0.4948479837170843, + "ewc_loss": 0.028543947264552116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4271973668655846e-05, + "grad_norm": 23.090970993041992, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8555660247802734, + "num_tokens": 148480511.0, + "step": 3890 + }, + { + "epoch": 0.49497519399567486, + "ewc_loss": 0.028559023514389992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.427951156074414e-05, + "grad_norm": 23.05579948425293, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8478853702545166, + "num_tokens": 148514759.0, + "step": 3891 + }, + { + "epoch": 0.4951024042742654, + "ewc_loss": 0.028561048209667206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4280523828347214e-05, + "grad_norm": 23.14262580871582, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8562911748886108, + "num_tokens": 148553209.0, + "step": 3892 + }, + { + "epoch": 0.49522961455285586, + "ewc_loss": 0.028494948521256447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4247474609874189e-05, + "grad_norm": 23.05255699157715, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8450064063072205, + "num_tokens": 148590387.0, + "step": 3893 + }, + { + "epoch": 0.4953568248314464, + "ewc_loss": 0.028569137677550316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.42845692607807e-05, + "grad_norm": 23.16930389404297, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8520050048828125, + "num_tokens": 148625201.0, + "step": 3894 + }, + { + "epoch": 0.4954840351100369, + "ewc_loss": 0.028576232492923737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4288116290117614e-05, + "grad_norm": 23.199296951293945, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8582940101623535, + "num_tokens": 148665989.0, + "step": 3895 + }, + { + "epoch": 0.4956112453886274, + "ewc_loss": 0.0285485852509737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4274292880145367e-05, + "grad_norm": 23.045743942260742, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8575953841209412, + "num_tokens": 148704839.0, + "step": 3896 + }, + { + "epoch": 0.4957384556672179, + "ewc_loss": 0.02852347306907177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.426173639629269e-05, + "grad_norm": 23.20934295654297, + "learning_rate": 1e-06, + "loss": 0.4274, + 
"mean_token_accuracy": 0.8636289238929749, + "num_tokens": 148740523.0, + "step": 3897 + }, + { + "epoch": 0.49586566594580844, + "ewc_loss": 0.02853880450129509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4269402527133934e-05, + "grad_norm": 23.036598205566406, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8690759539604187, + "num_tokens": 148779103.0, + "step": 3898 + }, + { + "epoch": 0.4959928762243989, + "ewc_loss": 0.02848992869257927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4244964404497296e-05, + "grad_norm": 23.06325340270996, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8643137216567993, + "num_tokens": 148814659.0, + "step": 3899 + }, + { + "epoch": 0.49612008650298944, + "ewc_loss": 0.02858944609761238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4294722859631293e-05, + "grad_norm": 23.180734634399414, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8745154142379761, + "num_tokens": 148856864.0, + "step": 3900 + }, + { + "epoch": 0.49624729678157997, + "ewc_loss": 0.028528468683362007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.426423477823846e-05, + "grad_norm": 23.02252197265625, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.860499382019043, + "num_tokens": 148890309.0, + "step": 3901 + }, + { + "epoch": 0.49637450706017044, + "ewc_loss": 0.02856183983385563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4280920368037187e-05, + "grad_norm": 23.141679763793945, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8388059139251709, + "num_tokens": 148926662.0, + "step": 3902 + }, + { + "epoch": 0.496501717338761, + "ewc_loss": 0.028581589460372925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4290794752014335e-05, + "grad_norm": 23.05936050415039, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8421051502227783, + "num_tokens": 148963032.0, + "step": 3903 + }, + { + "epoch": 
0.4966289276173515, + "ewc_loss": 0.028569290414452553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4284645658335648e-05, + "grad_norm": 22.977893829345703, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8599377870559692, + "num_tokens": 148996684.0, + "step": 3904 + }, + { + "epoch": 0.496756137895942, + "ewc_loss": 0.028601573780179024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4300786460808013e-05, + "grad_norm": 23.01764488220215, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8542813062667847, + "num_tokens": 149032735.0, + "step": 3905 + }, + { + "epoch": 0.4968833481745325, + "ewc_loss": 0.02861577831208706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4307888704934157e-05, + "grad_norm": 23.13613510131836, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8555601835250854, + "num_tokens": 149069415.0, + "step": 3906 + }, + { + "epoch": 0.49701055845312303, + "ewc_loss": 0.02862394228577614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4311971426650416e-05, + "grad_norm": 22.98129653930664, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8404173851013184, + "num_tokens": 149115042.0, + "step": 3907 + }, + { + "epoch": 0.4971377687317135, + "ewc_loss": 0.028613878414034843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4306939192465506e-05, + "grad_norm": 23.092864990234375, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.862234354019165, + "num_tokens": 149149767.0, + "step": 3908 + }, + { + "epoch": 0.49726497901030403, + "ewc_loss": 0.028633860871195793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4316929991764482e-05, + "grad_norm": 23.124500274658203, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8475566506385803, + "num_tokens": 149193000.0, + "step": 3909 + }, + { + "epoch": 0.49739218928889456, + "ewc_loss": 0.028632404282689095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4316202396003064e-05, + "grad_norm": 23.047468185424805, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8521246314048767, + "num_tokens": 149226963.0, + "step": 3910 + }, + { + "epoch": 0.49751939956748503, + "ewc_loss": 0.028582114726305008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4291057595983148e-05, + "grad_norm": 23.03565216064453, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8493593335151672, + "num_tokens": 149268139.0, + "step": 3911 + }, + { + "epoch": 0.49764660984607556, + "ewc_loss": 0.028644010424613953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4322004972200375e-05, + "grad_norm": 23.10976791381836, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8619362115859985, + "num_tokens": 149305882.0, + "step": 3912 + }, + { + "epoch": 0.4977738201246661, + "ewc_loss": 0.028618590906262398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.43092956932378e-05, + "grad_norm": 23.090063095092773, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8512533903121948, + "num_tokens": 149344119.0, + "step": 3913 + }, + { + "epoch": 0.49790103040325656, + "ewc_loss": 0.02862987294793129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4314936379378196e-05, + "grad_norm": 23.10303497314453, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.845069944858551, + "num_tokens": 149384102.0, + "step": 3914 + }, + { + "epoch": 0.4980282406818471, + "ewc_loss": 0.028584768995642662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4292384548753034e-05, + "grad_norm": 23.123918533325195, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8517695665359497, + "num_tokens": 149425677.0, + "step": 3915 + }, + { + "epoch": 0.4981554509604376, + "ewc_loss": 0.02868250198662281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4341250789584592e-05, + "grad_norm": 23.1451473236084, + "learning_rate": 1e-06, + "loss": 0.4778, + 
"mean_token_accuracy": 0.8504685759544373, + "num_tokens": 149463275.0, + "step": 3916 + }, + { + "epoch": 0.4982826612390281, + "ewc_loss": 0.02858222834765911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4291113984654658e-05, + "grad_norm": 22.91939353942871, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8691996335983276, + "num_tokens": 149503950.0, + "step": 3917 + }, + { + "epoch": 0.4984098715176186, + "ewc_loss": 0.028649192303419113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4324596122605726e-05, + "grad_norm": 23.15996742248535, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8458884358406067, + "num_tokens": 149541192.0, + "step": 3918 + }, + { + "epoch": 0.49853708179620915, + "ewc_loss": 0.02870149165391922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4350745914271101e-05, + "grad_norm": 23.080333709716797, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8507453203201294, + "num_tokens": 149581784.0, + "step": 3919 + }, + { + "epoch": 0.4986642920747996, + "ewc_loss": 0.028651870787143707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4325935808301438e-05, + "grad_norm": 23.083450317382812, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8592255711555481, + "num_tokens": 149619312.0, + "step": 3920 + }, + { + "epoch": 0.49879150235339015, + "ewc_loss": 0.028707092627882957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.435354624845786e-05, + "grad_norm": 23.090158462524414, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8505575656890869, + "num_tokens": 149657443.0, + "step": 3921 + }, + { + "epoch": 0.4989187126319807, + "ewc_loss": 0.02861638553440571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.430819247616455e-05, + "grad_norm": 23.079608917236328, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.866314709186554, + "num_tokens": 149694992.0, + "step": 3922 + }, + { + "epoch": 
0.49904592291057115, + "ewc_loss": 0.02863818220794201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4319090951175895e-05, + "grad_norm": 22.977577209472656, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8503532409667969, + "num_tokens": 149729554.0, + "step": 3923 + }, + { + "epoch": 0.4991731331891617, + "ewc_loss": 0.02866205759346485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4331028978631366e-05, + "grad_norm": 23.203975677490234, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8599330186843872, + "num_tokens": 149770051.0, + "step": 3924 + }, + { + "epoch": 0.4993003434677522, + "ewc_loss": 0.028665291145443916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4332645150716417e-05, + "grad_norm": 22.978233337402344, + "learning_rate": 1e-06, + "loss": 0.5654, + "mean_token_accuracy": 0.8222090005874634, + "num_tokens": 149806724.0, + "step": 3925 + }, + { + "epoch": 0.4994275537463427, + "ewc_loss": 0.028607575222849846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4303787793323863e-05, + "grad_norm": 23.078832626342773, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8457409143447876, + "num_tokens": 149849874.0, + "step": 3926 + }, + { + "epoch": 0.4995547640249332, + "ewc_loss": 0.02864871919155121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4324359653983265e-05, + "grad_norm": 22.976226806640625, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8532934784889221, + "num_tokens": 149888076.0, + "step": 3927 + }, + { + "epoch": 0.49968197430352373, + "ewc_loss": 0.02864857390522957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4324286894407123e-05, + "grad_norm": 23.069087982177734, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8548604249954224, + "num_tokens": 149923750.0, + "step": 3928 + }, + { + "epoch": 0.4998091845821142, + "ewc_loss": 0.028708407655358315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4354203813127242e-05, + "grad_norm": 23.044191360473633, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8421393632888794, + "num_tokens": 149960657.0, + "step": 3929 + }, + { + "epoch": 0.49993639486070474, + "ewc_loss": 0.028682423755526543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4341211681312416e-05, + "grad_norm": 23.224958419799805, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8695860505104065, + "num_tokens": 149995983.0, + "step": 3930 + }, + { + "epoch": 0.5000636051392953, + "ewc_loss": 0.028677422553300858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4338711480377242e-05, + "grad_norm": 22.960224151611328, + "learning_rate": 1e-06, + "loss": 0.5297, + "mean_token_accuracy": 0.8322862386703491, + "num_tokens": 150037508.0, + "step": 3931 + }, + { + "epoch": 0.5001908154178858, + "ewc_loss": 0.02869056537747383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4345282579597551e-05, + "grad_norm": 23.248868942260742, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8495126962661743, + "num_tokens": 150077806.0, + "step": 3932 + }, + { + "epoch": 0.5003180256964763, + "ewc_loss": 0.028733929619193077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4366964933287818e-05, + "grad_norm": 23.059947967529297, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8526186943054199, + "num_tokens": 150117589.0, + "step": 3933 + }, + { + "epoch": 0.5004452359750667, + "ewc_loss": 0.028659900650382042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4329950317915063e-05, + "grad_norm": 23.191083908081055, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8481065034866333, + "num_tokens": 150161579.0, + "step": 3934 + }, + { + "epoch": 0.5005724462536573, + "ewc_loss": 0.028683770447969437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.434188561688643e-05, + "grad_norm": 23.01570701599121, + "learning_rate": 1e-06, + "loss": 0.5144, + 
"mean_token_accuracy": 0.8419519066810608, + "num_tokens": 150208018.0, + "step": 3935 + }, + { + "epoch": 0.5006996565322478, + "ewc_loss": 0.028608856722712517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4304428077593911e-05, + "grad_norm": 23.116533279418945, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8563477993011475, + "num_tokens": 150250164.0, + "step": 3936 + }, + { + "epoch": 0.5008268668108383, + "ewc_loss": 0.028679154813289642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.433957731933333e-05, + "grad_norm": 23.001222610473633, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8726310133934021, + "num_tokens": 150289972.0, + "step": 3937 + }, + { + "epoch": 0.5009540770894289, + "ewc_loss": 0.028589613735675812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4294806533143856e-05, + "grad_norm": 23.067386627197266, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8528598546981812, + "num_tokens": 150333656.0, + "step": 3938 + }, + { + "epoch": 0.5010812873680194, + "ewc_loss": 0.028694048523902893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4347024261951447e-05, + "grad_norm": 23.100889205932617, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8677646517753601, + "num_tokens": 150374086.0, + "step": 3939 + }, + { + "epoch": 0.5012084976466098, + "ewc_loss": 0.028628047555685043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4314024156192318e-05, + "grad_norm": 23.0802059173584, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8577168583869934, + "num_tokens": 150407472.0, + "step": 3940 + }, + { + "epoch": 0.5013357079252003, + "ewc_loss": 0.02865273505449295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.432636781828478e-05, + "grad_norm": 22.99839210510254, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8593661785125732, + "num_tokens": 150444560.0, + "step": 3941 + }, + { + "epoch": 
0.5014629182037909, + "ewc_loss": 0.028693009167909622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4346504940476734e-05, + "grad_norm": 23.11273765563965, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8586429357528687, + "num_tokens": 150483621.0, + "step": 3942 + }, + { + "epoch": 0.5015901284823814, + "ewc_loss": 0.0286273043602705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4313652172859292e-05, + "grad_norm": 23.018545150756836, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8542125821113586, + "num_tokens": 150520414.0, + "step": 3943 + }, + { + "epoch": 0.5017173387609719, + "ewc_loss": 0.028618546202778816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4309272955870256e-05, + "grad_norm": 23.084087371826172, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8461892008781433, + "num_tokens": 150556589.0, + "step": 3944 + }, + { + "epoch": 0.5018445490395624, + "ewc_loss": 0.02871013432741165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4355066923599225e-05, + "grad_norm": 23.03557777404785, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8524627685546875, + "num_tokens": 150592393.0, + "step": 3945 + }, + { + "epoch": 0.5019717593181529, + "ewc_loss": 0.028615979477763176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4307989658846054e-05, + "grad_norm": 23.072912216186523, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8405253887176514, + "num_tokens": 150630457.0, + "step": 3946 + }, + { + "epoch": 0.5020989695967434, + "ewc_loss": 0.028673991560935974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4336996173369698e-05, + "grad_norm": 23.059471130371094, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8617873191833496, + "num_tokens": 150659482.0, + "step": 3947 + }, + { + "epoch": 0.5022261798753339, + "ewc_loss": 0.028747325763106346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4373662452271674e-05, + "grad_norm": 23.125444412231445, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8539121150970459, + "num_tokens": 150700920.0, + "step": 3948 + }, + { + "epoch": 0.5023533901539244, + "ewc_loss": 0.0286468006670475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4323400137072895e-05, + "grad_norm": 23.06734275817871, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8646578788757324, + "num_tokens": 150736393.0, + "step": 3949 + }, + { + "epoch": 0.502480600432515, + "ewc_loss": 0.028736166656017303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4368083611770999e-05, + "grad_norm": 23.153362274169922, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8506450653076172, + "num_tokens": 150783848.0, + "step": 3950 + }, + { + "epoch": 0.5026078107111055, + "ewc_loss": 0.02868782728910446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4343913790071383e-05, + "grad_norm": 23.0456600189209, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.850814938545227, + "num_tokens": 150824179.0, + "step": 3951 + }, + { + "epoch": 0.5027350209896959, + "ewc_loss": 0.028706025332212448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.435301237506792e-05, + "grad_norm": 23.145654678344727, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8550542593002319, + "num_tokens": 150862775.0, + "step": 3952 + }, + { + "epoch": 0.5028622312682864, + "ewc_loss": 0.02871701493859291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4358507542056032e-05, + "grad_norm": 23.018970489501953, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8530922532081604, + "num_tokens": 150900005.0, + "step": 3953 + }, + { + "epoch": 0.502989441546877, + "ewc_loss": 0.028696797788143158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4348398508445825e-05, + "grad_norm": 23.23053550720215, + "learning_rate": 1e-06, + "loss": 0.4735, + 
"mean_token_accuracy": 0.8504472374916077, + "num_tokens": 150936148.0, + "step": 3954 + }, + { + "epoch": 0.5031166518254675, + "ewc_loss": 0.028735831379890442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.436791535525117e-05, + "grad_norm": 23.007335662841797, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8506380915641785, + "num_tokens": 150971661.0, + "step": 3955 + }, + { + "epoch": 0.503243862104058, + "ewc_loss": 0.02868194505572319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.434097248420585e-05, + "grad_norm": 23.160165786743164, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.865780234336853, + "num_tokens": 151009521.0, + "step": 3956 + }, + { + "epoch": 0.5033710723826486, + "ewc_loss": 0.028806723654270172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4403362001758069e-05, + "grad_norm": 23.105857849121094, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8503689765930176, + "num_tokens": 151053641.0, + "step": 3957 + }, + { + "epoch": 0.5034982826612391, + "ewc_loss": 0.02873370051383972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4366850336955395e-05, + "grad_norm": 23.063276290893555, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8579326868057251, + "num_tokens": 151094476.0, + "step": 3958 + }, + { + "epoch": 0.5036254929398295, + "ewc_loss": 0.028744082897901535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4372041732713114e-05, + "grad_norm": 23.124914169311523, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8476213812828064, + "num_tokens": 151126381.0, + "step": 3959 + }, + { + "epoch": 0.50375270321842, + "ewc_loss": 0.028736034408211708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.436801721865777e-05, + "grad_norm": 23.169607162475586, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8587164282798767, + "num_tokens": 151156875.0, + "step": 3960 + }, + { + "epoch": 
0.5038799134970106, + "ewc_loss": 0.028724219650030136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4362110050569754e-05, + "grad_norm": 23.21017837524414, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8501754999160767, + "num_tokens": 151194394.0, + "step": 3961 + }, + { + "epoch": 0.5040071237756011, + "ewc_loss": 0.02878572605550289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4392862794920802e-05, + "grad_norm": 23.314424514770508, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8404661417007446, + "num_tokens": 151234472.0, + "step": 3962 + }, + { + "epoch": 0.5041343340541916, + "ewc_loss": 0.028681084513664246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4340542293211911e-05, + "grad_norm": 23.108972549438477, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8505935668945312, + "num_tokens": 151270426.0, + "step": 3963 + }, + { + "epoch": 0.5042615443327821, + "ewc_loss": 0.028711998835206032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4355999155668542e-05, + "grad_norm": 23.16819190979004, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8517398238182068, + "num_tokens": 151305793.0, + "step": 3964 + }, + { + "epoch": 0.5043887546113726, + "ewc_loss": 0.02874213457107544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4371067663887516e-05, + "grad_norm": 23.224468231201172, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8417595028877258, + "num_tokens": 151345116.0, + "step": 3965 + }, + { + "epoch": 0.5045159648899631, + "ewc_loss": 0.028776075690984726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4388037925527897e-05, + "grad_norm": 23.127788543701172, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8580808639526367, + "num_tokens": 151381901.0, + "step": 3966 + }, + { + "epoch": 0.5046431751685536, + "ewc_loss": 0.02874273620545864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.43713677971391e-05, + "grad_norm": 23.2512264251709, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.856126606464386, + "num_tokens": 151426884.0, + "step": 3967 + }, + { + "epoch": 0.5047703854471441, + "ewc_loss": 0.028813226148486137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4406613445316907e-05, + "grad_norm": 23.183469772338867, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8469067811965942, + "num_tokens": 151467596.0, + "step": 3968 + }, + { + "epoch": 0.5048975957257347, + "ewc_loss": 0.028716271743178368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4358135558723006e-05, + "grad_norm": 23.18752098083496, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8734235763549805, + "num_tokens": 151502131.0, + "step": 3969 + }, + { + "epoch": 0.5050248060043252, + "ewc_loss": 0.028785211965441704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4392606317414902e-05, + "grad_norm": 23.119298934936523, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8512035608291626, + "num_tokens": 151538932.0, + "step": 3970 + }, + { + "epoch": 0.5051520162829156, + "ewc_loss": 0.028705917298793793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4352958714880515e-05, + "grad_norm": 23.07750129699707, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8531369566917419, + "num_tokens": 151577839.0, + "step": 3971 + }, + { + "epoch": 0.5052792265615061, + "ewc_loss": 0.02880626916885376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4403134628082626e-05, + "grad_norm": 23.223854064941406, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8537666201591492, + "num_tokens": 151618984.0, + "step": 3972 + }, + { + "epoch": 0.5054064368400967, + "ewc_loss": 0.02880484238266945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4402421584236436e-05, + "grad_norm": 23.063613891601562, + "learning_rate": 1e-06, + "loss": 0.4664, + 
"mean_token_accuracy": 0.8541167974472046, + "num_tokens": 151658392.0, + "step": 3973 + }, + { + "epoch": 0.5055336471186872, + "ewc_loss": 0.028796877712011337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4398438906937372e-05, + "grad_norm": 23.191343307495117, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.864255428314209, + "num_tokens": 151689781.0, + "step": 3974 + }, + { + "epoch": 0.5056608573972777, + "ewc_loss": 0.02881939336657524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4409696632355917e-05, + "grad_norm": 23.02174186706543, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.838459849357605, + "num_tokens": 151729612.0, + "step": 3975 + }, + { + "epoch": 0.5057880676758683, + "ewc_loss": 0.028788631781935692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4394316167454235e-05, + "grad_norm": 23.213783264160156, + "learning_rate": 1e-06, + "loss": 0.5024, + "mean_token_accuracy": 0.8412673473358154, + "num_tokens": 151769270.0, + "step": 3976 + }, + { + "epoch": 0.5059152779544587, + "ewc_loss": 0.028839027509093285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4419513718166854e-05, + "grad_norm": 23.069087982177734, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8345125913619995, + "num_tokens": 151803936.0, + "step": 3977 + }, + { + "epoch": 0.5060424882330492, + "ewc_loss": 0.028762411326169968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4381205801328178e-05, + "grad_norm": 23.093751907348633, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8425579071044922, + "num_tokens": 151840429.0, + "step": 3978 + }, + { + "epoch": 0.5061696985116397, + "ewc_loss": 0.028843402862548828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4421701052924618e-05, + "grad_norm": 23.16237449645996, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8553062677383423, + "num_tokens": 151883296.0, + "step": 3979 + }, + { + "epoch": 
0.5062969087902303, + "ewc_loss": 0.028849679976701736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4424839719140437e-05, + "grad_norm": 23.14993667602539, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8680745959281921, + "num_tokens": 151917658.0, + "step": 3980 + }, + { + "epoch": 0.5064241190688208, + "ewc_loss": 0.02879888191819191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4399441170098726e-05, + "grad_norm": 23.0223331451416, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8531096577644348, + "num_tokens": 151958822.0, + "step": 3981 + }, + { + "epoch": 0.5065513293474113, + "ewc_loss": 0.028888054192066193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.444402732886374e-05, + "grad_norm": 23.263904571533203, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8491568565368652, + "num_tokens": 151994964.0, + "step": 3982 + }, + { + "epoch": 0.5066785396260017, + "ewc_loss": 0.028939301148056984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4469650523096789e-05, + "grad_norm": 23.165807723999023, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8490238189697266, + "num_tokens": 152037013.0, + "step": 3983 + }, + { + "epoch": 0.5068057499045923, + "ewc_loss": 0.02881550043821335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4407750313694123e-05, + "grad_norm": 23.098241806030273, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8492845296859741, + "num_tokens": 152074821.0, + "step": 3984 + }, + { + "epoch": 0.5069329601831828, + "ewc_loss": 0.02888447418808937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.444223744329065e-05, + "grad_norm": 23.12795066833496, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8473194241523743, + "num_tokens": 152116667.0, + "step": 3985 + }, + { + "epoch": 0.5070601704617733, + "ewc_loss": 0.028786353766918182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4393176570592914e-05, + "grad_norm": 22.9578857421875, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8581067323684692, + "num_tokens": 152152714.0, + "step": 3986 + }, + { + "epoch": 0.5071873807403638, + "ewc_loss": 0.028873009607195854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4436504898185376e-05, + "grad_norm": 23.227434158325195, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8388463258743286, + "num_tokens": 152190230.0, + "step": 3987 + }, + { + "epoch": 0.5073145910189544, + "ewc_loss": 0.02890406921505928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4452034520218149e-05, + "grad_norm": 23.21999740600586, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8531447052955627, + "num_tokens": 152228887.0, + "step": 3988 + }, + { + "epoch": 0.5074418012975448, + "ewc_loss": 0.028866810724139214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4433405340241734e-05, + "grad_norm": 23.270051956176758, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8620164394378662, + "num_tokens": 152271863.0, + "step": 3989 + }, + { + "epoch": 0.5075690115761353, + "ewc_loss": 0.028814364224672318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4407181879505515e-05, + "grad_norm": 23.100446701049805, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8477269411087036, + "num_tokens": 152310765.0, + "step": 3990 + }, + { + "epoch": 0.5076962218547258, + "ewc_loss": 0.028844596818089485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4422298590943683e-05, + "grad_norm": 23.196977615356445, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.874997615814209, + "num_tokens": 152342545.0, + "step": 3991 + }, + { + "epoch": 0.5078234321333164, + "ewc_loss": 0.028881845995783806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4440923223446589e-05, + "grad_norm": 23.175567626953125, + "learning_rate": 1e-06, + "loss": 0.5122, + 
"mean_token_accuracy": 0.8379590511322021, + "num_tokens": 152386223.0, + "step": 3992 + }, + { + "epoch": 0.5079506424119069, + "ewc_loss": 0.02877483330667019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4387416740646586e-05, + "grad_norm": 23.075254440307617, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8601523637771606, + "num_tokens": 152421811.0, + "step": 3993 + }, + { + "epoch": 0.5080778526904974, + "ewc_loss": 0.028828341513872147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4414170436793938e-05, + "grad_norm": 23.121957778930664, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8407456874847412, + "num_tokens": 152454581.0, + "step": 3994 + }, + { + "epoch": 0.5082050629690879, + "ewc_loss": 0.02889414131641388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4447070498135872e-05, + "grad_norm": 23.186504364013672, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8440585136413574, + "num_tokens": 152490591.0, + "step": 3995 + }, + { + "epoch": 0.5083322732476784, + "ewc_loss": 0.028870759531855583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4435379853239283e-05, + "grad_norm": 23.083650588989258, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8758867383003235, + "num_tokens": 152532290.0, + "step": 3996 + }, + { + "epoch": 0.5084594835262689, + "ewc_loss": 0.028850862756371498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.442543180019129e-05, + "grad_norm": 23.115921020507812, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.858894944190979, + "num_tokens": 152571984.0, + "step": 3997 + }, + { + "epoch": 0.5085866938048594, + "ewc_loss": 0.0288632083684206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4431604540732224e-05, + "grad_norm": 23.139623641967773, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.854061484336853, + "num_tokens": 152611082.0, + "step": 3998 + }, + { + "epoch": 
0.50871390408345, + "ewc_loss": 0.028945336118340492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4472668226517271e-05, + "grad_norm": 23.15952491760254, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8614343404769897, + "num_tokens": 152655303.0, + "step": 3999 + }, + { + "epoch": 0.5088411143620405, + "ewc_loss": 0.028910303488373756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4455151358561125e-05, + "grad_norm": 23.263376235961914, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8531990647315979, + "num_tokens": 152688432.0, + "step": 4000 + }, + { + "epoch": 0.5089683246406309, + "ewc_loss": 0.028916191309690475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4458095392910764e-05, + "grad_norm": 23.23829460144043, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8483200669288635, + "num_tokens": 152725637.0, + "step": 4001 + }, + { + "epoch": 0.5090955349192214, + "ewc_loss": 0.028806068003177643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4403033674170729e-05, + "grad_norm": 23.236221313476562, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8521283864974976, + "num_tokens": 152763108.0, + "step": 4002 + }, + { + "epoch": 0.509222745197812, + "ewc_loss": 0.028841830790042877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4420915249502286e-05, + "grad_norm": 23.101341247558594, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8607483506202698, + "num_tokens": 152800659.0, + "step": 4003 + }, + { + "epoch": 0.5093499554764025, + "ewc_loss": 0.028828509151935577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4414254110306501e-05, + "grad_norm": 23.14240837097168, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8448877334594727, + "num_tokens": 152842797.0, + "step": 4004 + }, + { + "epoch": 0.509477165754993, + "ewc_loss": 0.028866533190011978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4433266187552363e-05, + "grad_norm": 23.03946876525879, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8439217209815979, + "num_tokens": 152888257.0, + "step": 4005 + }, + { + "epoch": 0.5096043760335836, + "ewc_loss": 0.02880697138607502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.440348569303751e-05, + "grad_norm": 23.155927658081055, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8530205488204956, + "num_tokens": 152929089.0, + "step": 4006 + }, + { + "epoch": 0.5097315863121741, + "ewc_loss": 0.02891797013580799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4458984878729098e-05, + "grad_norm": 23.117233276367188, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8480566740036011, + "num_tokens": 152972710.0, + "step": 4007 + }, + { + "epoch": 0.5098587965907645, + "ewc_loss": 0.028799915686249733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4399957763089333e-05, + "grad_norm": 23.20142364501953, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8536337018013, + "num_tokens": 153008867.0, + "step": 4008 + }, + { + "epoch": 0.509986006869355, + "ewc_loss": 0.028928136453032494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4464068044617306e-05, + "grad_norm": 23.09772491455078, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8483660221099854, + "num_tokens": 153051497.0, + "step": 4009 + }, + { + "epoch": 0.5101132171479456, + "ewc_loss": 0.02881801873445511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4409009054361377e-05, + "grad_norm": 23.13717269897461, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8701413869857788, + "num_tokens": 153089295.0, + "step": 4010 + }, + { + "epoch": 0.5102404274265361, + "ewc_loss": 0.028906039893627167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4453019502980169e-05, + "grad_norm": 23.216873168945312, + "learning_rate": 1e-06, + "loss": 0.4518, + 
"mean_token_accuracy": 0.8572550415992737, + "num_tokens": 153123957.0, + "step": 4011 + }, + { + "epoch": 0.5103676377051266, + "ewc_loss": 0.02890048734843731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4450243725150358e-05, + "grad_norm": 23.1384334564209, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8489497303962708, + "num_tokens": 153165248.0, + "step": 4012 + }, + { + "epoch": 0.5104948479837171, + "ewc_loss": 0.02886679582297802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.443339806428412e-05, + "grad_norm": 23.151084899902344, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8679015636444092, + "num_tokens": 153206163.0, + "step": 4013 + }, + { + "epoch": 0.5106220582623076, + "ewc_loss": 0.02890525944530964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4452629329753108e-05, + "grad_norm": 23.112668991088867, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8439397811889648, + "num_tokens": 153248666.0, + "step": 4014 + }, + { + "epoch": 0.5107492685408981, + "ewc_loss": 0.02891319990158081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4456600183621049e-05, + "grad_norm": 23.19932746887207, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8761483430862427, + "num_tokens": 153289009.0, + "step": 4015 + }, + { + "epoch": 0.5108764788194886, + "ewc_loss": 0.02889866754412651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4449334230448585e-05, + "grad_norm": 23.084823608398438, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8585494756698608, + "num_tokens": 153321354.0, + "step": 4016 + }, + { + "epoch": 0.5110036890980791, + "ewc_loss": 0.028860075399279594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4430037481361069e-05, + "grad_norm": 23.11465835571289, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8534653186798096, + "num_tokens": 153361061.0, + "step": 4017 + }, + { + "epoch": 
0.5111308993766697, + "ewc_loss": 0.028908338397741318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.445416910428321e-05, + "grad_norm": 23.11433982849121, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8476885557174683, + "num_tokens": 153397002.0, + "step": 4018 + }, + { + "epoch": 0.5112581096552602, + "ewc_loss": 0.028899909928441048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4449955415329896e-05, + "grad_norm": 23.116535186767578, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8394827842712402, + "num_tokens": 153435264.0, + "step": 4019 + }, + { + "epoch": 0.5113853199338506, + "ewc_loss": 0.02893856354057789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.446928217774257e-05, + "grad_norm": 23.172380447387695, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8524290323257446, + "num_tokens": 153474751.0, + "step": 4020 + }, + { + "epoch": 0.5115125302124411, + "ewc_loss": 0.028968418017029762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4484208804788068e-05, + "grad_norm": 23.122379302978516, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8625777363777161, + "num_tokens": 153510084.0, + "step": 4021 + }, + { + "epoch": 0.5116397404910317, + "ewc_loss": 0.028879648074507713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4439823644352145e-05, + "grad_norm": 23.15736198425293, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8394465446472168, + "num_tokens": 153550387.0, + "step": 4022 + }, + { + "epoch": 0.5117669507696222, + "ewc_loss": 0.028933485969901085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.446674286853522e-05, + "grad_norm": 23.08709716796875, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8748670816421509, + "num_tokens": 153592441.0, + "step": 4023 + }, + { + "epoch": 0.5118941610482127, + "ewc_loss": 0.02894626371562481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4473132068815175e-05, + "grad_norm": 23.303882598876953, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8606164455413818, + "num_tokens": 153629202.0, + "step": 4024 + }, + { + "epoch": 0.5120213713268033, + "ewc_loss": 0.028930557891726494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4465278582065366e-05, + "grad_norm": 23.147045135498047, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8484344482421875, + "num_tokens": 153669243.0, + "step": 4025 + }, + { + "epoch": 0.5121485816053937, + "ewc_loss": 0.028879504650831223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4439752703765407e-05, + "grad_norm": 23.175743103027344, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8491116166114807, + "num_tokens": 153709206.0, + "step": 4026 + }, + { + "epoch": 0.5122757918839842, + "ewc_loss": 0.028966162353754044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.448308103135787e-05, + "grad_norm": 23.238353729248047, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8602067232131958, + "num_tokens": 153741251.0, + "step": 4027 + }, + { + "epoch": 0.5124030021625747, + "ewc_loss": 0.02889534831047058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.444767440261785e-05, + "grad_norm": 23.216882705688477, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8396204113960266, + "num_tokens": 153775559.0, + "step": 4028 + }, + { + "epoch": 0.5125302124411653, + "ewc_loss": 0.028950871899724007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4475435818894766e-05, + "grad_norm": 23.16227149963379, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8582154512405396, + "num_tokens": 153819158.0, + "step": 4029 + }, + { + "epoch": 0.5126574227197558, + "ewc_loss": 0.02886536531150341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4432682291953824e-05, + "grad_norm": 23.064287185668945, + "learning_rate": 1e-06, + "loss": 0.4945, + 
"mean_token_accuracy": 0.8465266227722168, + "num_tokens": 153856231.0, + "step": 4030 + }, + { + "epoch": 0.5127846329983463, + "ewc_loss": 0.028987688943743706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4493844901153352e-05, + "grad_norm": 23.140037536621094, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8644366264343262, + "num_tokens": 153897125.0, + "step": 4031 + }, + { + "epoch": 0.5129118432769367, + "ewc_loss": 0.028991293162107468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4495646610157564e-05, + "grad_norm": 23.242210388183594, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.85992431640625, + "num_tokens": 153933552.0, + "step": 4032 + }, + { + "epoch": 0.5130390535555273, + "ewc_loss": 0.02898838371038437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4494191418634728e-05, + "grad_norm": 23.186466217041016, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8391333818435669, + "num_tokens": 153974337.0, + "step": 4033 + }, + { + "epoch": 0.5131662638341178, + "ewc_loss": 0.02895631454885006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4478157027042471e-05, + "grad_norm": 23.10810661315918, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.847974956035614, + "num_tokens": 154018911.0, + "step": 4034 + }, + { + "epoch": 0.5132934741127083, + "ewc_loss": 0.028979836031794548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4489917703031097e-05, + "grad_norm": 23.209285736083984, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8634945154190063, + "num_tokens": 154056441.0, + "step": 4035 + }, + { + "epoch": 0.5134206843912988, + "ewc_loss": 0.029041161760687828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4520581316901371e-05, + "grad_norm": 23.141277313232422, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8439369201660156, + "num_tokens": 154096326.0, + "step": 4036 + }, + { + "epoch": 
0.5135478946698894, + "ewc_loss": 0.028974255546927452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4487127373286057e-05, + "grad_norm": 23.203927993774414, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8575891256332397, + "num_tokens": 154137223.0, + "step": 4037 + }, + { + "epoch": 0.5136751049484798, + "ewc_loss": 0.029003608971834183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4501804798783269e-05, + "grad_norm": 23.08599281311035, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8528066873550415, + "num_tokens": 154168312.0, + "step": 4038 + }, + { + "epoch": 0.5138023152270703, + "ewc_loss": 0.029003093019127846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4501546502287965e-05, + "grad_norm": 23.351253509521484, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8679065108299255, + "num_tokens": 154203125.0, + "step": 4039 + }, + { + "epoch": 0.5139295255056608, + "ewc_loss": 0.028961004689335823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4480502613878343e-05, + "grad_norm": 22.997692108154297, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8488170504570007, + "num_tokens": 154248437.0, + "step": 4040 + }, + { + "epoch": 0.5140567357842514, + "ewc_loss": 0.028968188911676407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4484094208455645e-05, + "grad_norm": 23.203968048095703, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8573961853981018, + "num_tokens": 154286303.0, + "step": 4041 + }, + { + "epoch": 0.5141839460628419, + "ewc_loss": 0.029051601886749268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4525800906994846e-05, + "grad_norm": 23.185535430908203, + "learning_rate": 1e-06, + "loss": 0.5014, + "mean_token_accuracy": 0.8446838855743408, + "num_tokens": 154328638.0, + "step": 4042 + }, + { + "epoch": 0.5143111563414324, + "ewc_loss": 0.028941286727786064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4470643691311125e-05, + "grad_norm": 23.172605514526367, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8520261645317078, + "num_tokens": 154362953.0, + "step": 4043 + }, + { + "epoch": 0.5144383666200228, + "ewc_loss": 0.02899794653058052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.449897354177665e-05, + "grad_norm": 23.1131649017334, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8406823873519897, + "num_tokens": 154401289.0, + "step": 4044 + }, + { + "epoch": 0.5145655768986134, + "ewc_loss": 0.02896084263920784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4480421668849885e-05, + "grad_norm": 23.183115005493164, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8580597043037415, + "num_tokens": 154438779.0, + "step": 4045 + }, + { + "epoch": 0.5146927871772039, + "ewc_loss": 0.029060548171401024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4530273801938165e-05, + "grad_norm": 23.209686279296875, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8478744626045227, + "num_tokens": 154467032.0, + "step": 4046 + }, + { + "epoch": 0.5148199974557944, + "ewc_loss": 0.029004419222474098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4502209523925558e-05, + "grad_norm": 23.067882537841797, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8638331890106201, + "num_tokens": 154505222.0, + "step": 4047 + }, + { + "epoch": 0.514947207734385, + "ewc_loss": 0.029055606573820114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.452780361432815e-05, + "grad_norm": 23.216461181640625, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8400764465332031, + "num_tokens": 154544953.0, + "step": 4048 + }, + { + "epoch": 0.5150744180129755, + "ewc_loss": 0.02904529869556427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4522649507853203e-05, + "grad_norm": 23.145673751831055, + "learning_rate": 1e-06, + "loss": 0.4205, + 
"mean_token_accuracy": 0.8663926720619202, + "num_tokens": 154586193.0, + "step": 4049 + }, + { + "epoch": 0.5152016282915659, + "ewc_loss": 0.02906087040901184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4530434782500379e-05, + "grad_norm": 23.239057540893555, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8582369089126587, + "num_tokens": 154629531.0, + "step": 4050 + }, + { + "epoch": 0.5153288385701564, + "ewc_loss": 0.029010426253080368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4505212675430812e-05, + "grad_norm": 23.187725067138672, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8689147233963013, + "num_tokens": 154659482.0, + "step": 4051 + }, + { + "epoch": 0.515456048848747, + "ewc_loss": 0.029031479731202126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4515740076603834e-05, + "grad_norm": 23.079416275024414, + "learning_rate": 1e-06, + "loss": 0.5638, + "mean_token_accuracy": 0.8202945590019226, + "num_tokens": 154702645.0, + "step": 4052 + }, + { + "epoch": 0.5155832591273375, + "ewc_loss": 0.029060835018754005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4530417502101045e-05, + "grad_norm": 23.2033634185791, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8632156848907471, + "num_tokens": 154743329.0, + "step": 4053 + }, + { + "epoch": 0.515710469405928, + "ewc_loss": 0.0291306022554636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4565301171387546e-05, + "grad_norm": 23.129703521728516, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8496431112289429, + "num_tokens": 154783460.0, + "step": 4054 + }, + { + "epoch": 0.5158376796845185, + "ewc_loss": 0.029062937945127487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4531468877976295e-05, + "grad_norm": 23.185964584350586, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8481149077415466, + "num_tokens": 154827209.0, + "step": 4055 + }, + { + "epoch": 
0.5159648899631091, + "ewc_loss": 0.02908429317176342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4542146345775109e-05, + "grad_norm": 23.087696075439453, + "learning_rate": 1e-06, + "loss": 0.5457, + "mean_token_accuracy": 0.828445553779602, + "num_tokens": 154866577.0, + "step": 4056 + }, + { + "epoch": 0.5160921002416995, + "ewc_loss": 0.029072117060422897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.453605818824144e-05, + "grad_norm": 23.162702560424805, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8559722304344177, + "num_tokens": 154912191.0, + "step": 4057 + }, + { + "epoch": 0.51621931052029, + "ewc_loss": 0.029120782390236855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4560390809492674e-05, + "grad_norm": 23.25718879699707, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8404103517532349, + "num_tokens": 154953412.0, + "step": 4058 + }, + { + "epoch": 0.5163465207988805, + "ewc_loss": 0.02907686121761799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4538430150423665e-05, + "grad_norm": 23.134746551513672, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8648505806922913, + "num_tokens": 154989913.0, + "step": 4059 + }, + { + "epoch": 0.5164737310774711, + "ewc_loss": 0.02908777818083763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4543888937623706e-05, + "grad_norm": 23.24837303161621, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8417806029319763, + "num_tokens": 155038094.0, + "step": 4060 + }, + { + "epoch": 0.5166009413560616, + "ewc_loss": 0.029078016057610512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.453900767955929e-05, + "grad_norm": 23.175071716308594, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8431957960128784, + "num_tokens": 155075900.0, + "step": 4061 + }, + { + "epoch": 0.5167281516346521, + "ewc_loss": 0.02901385724544525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4506928891933057e-05, + "grad_norm": 23.107891082763672, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8525019884109497, + "num_tokens": 155117038.0, + "step": 4062 + }, + { + "epoch": 0.5168553619132426, + "ewc_loss": 0.029018383473157883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4509191714751069e-05, + "grad_norm": 23.233352661132812, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8659645915031433, + "num_tokens": 155151442.0, + "step": 4063 + }, + { + "epoch": 0.5169825721918331, + "ewc_loss": 0.029072869569063187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4536434719047975e-05, + "grad_norm": 23.213594436645508, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8623764514923096, + "num_tokens": 155192288.0, + "step": 4064 + }, + { + "epoch": 0.5171097824704236, + "ewc_loss": 0.029002778232097626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4501389159704559e-05, + "grad_norm": 23.207460403442383, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8567100763320923, + "num_tokens": 155238931.0, + "step": 4065 + }, + { + "epoch": 0.5172369927490141, + "ewc_loss": 0.029014036059379578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4507018022413831e-05, + "grad_norm": 23.125883102416992, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8667914867401123, + "num_tokens": 155276644.0, + "step": 4066 + }, + { + "epoch": 0.5173642030276047, + "ewc_loss": 0.029019588604569435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4509794709738344e-05, + "grad_norm": 23.263883590698242, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8657529354095459, + "num_tokens": 155307638.0, + "step": 4067 + }, + { + "epoch": 0.5174914133061952, + "ewc_loss": 0.029052609577775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4526304767059628e-05, + "grad_norm": 23.166271209716797, + "learning_rate": 1e-06, + "loss": 0.4903, + 
"mean_token_accuracy": 0.8468097448348999, + "num_tokens": 155343255.0, + "step": 4068 + }, + { + "epoch": 0.5176186235847856, + "ewc_loss": 0.029005007818341255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4502504200208932e-05, + "grad_norm": 23.26091957092285, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8558241128921509, + "num_tokens": 155382283.0, + "step": 4069 + }, + { + "epoch": 0.5177458338633761, + "ewc_loss": 0.02908480539917946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4542402823281009e-05, + "grad_norm": 23.214366912841797, + "learning_rate": 1e-06, + "loss": 0.5365, + "mean_token_accuracy": 0.831660807132721, + "num_tokens": 155427096.0, + "step": 4070 + }, + { + "epoch": 0.5178730441419667, + "ewc_loss": 0.029037468135356903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.451873413316207e-05, + "grad_norm": 23.218292236328125, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.841641366481781, + "num_tokens": 155467472.0, + "step": 4071 + }, + { + "epoch": 0.5180002544205572, + "ewc_loss": 0.028995715081691742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4497857591777574e-05, + "grad_norm": 23.161596298217773, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8482189178466797, + "num_tokens": 155508350.0, + "step": 4072 + }, + { + "epoch": 0.5181274646991477, + "ewc_loss": 0.029069332405924797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4534665751853026e-05, + "grad_norm": 23.190528869628906, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8762074708938599, + "num_tokens": 155547215.0, + "step": 4073 + }, + { + "epoch": 0.5182546749777382, + "ewc_loss": 0.029002953320741653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4501476471195929e-05, + "grad_norm": 23.151098251342773, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8580719232559204, + "num_tokens": 155586658.0, + "step": 4074 + }, + { + "epoch": 
0.5183818852563287, + "ewc_loss": 0.029070254415273666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4535126865666825e-05, + "grad_norm": 23.203481674194336, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8557513952255249, + "num_tokens": 155620944.0, + "step": 4075 + }, + { + "epoch": 0.5185090955349192, + "ewc_loss": 0.029103538021445274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4551768799719866e-05, + "grad_norm": 23.21113395690918, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8516782522201538, + "num_tokens": 155662699.0, + "step": 4076 + }, + { + "epoch": 0.5186363058135097, + "ewc_loss": 0.02909560315310955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4547801583830733e-05, + "grad_norm": 23.300443649291992, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8415944576263428, + "num_tokens": 155707783.0, + "step": 4077 + }, + { + "epoch": 0.5187635160921003, + "ewc_loss": 0.029078729450702667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4539365110977087e-05, + "grad_norm": 23.262176513671875, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8585885763168335, + "num_tokens": 155745227.0, + "step": 4078 + }, + { + "epoch": 0.5188907263706908, + "ewc_loss": 0.029021907597780228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4510953405988403e-05, + "grad_norm": 23.237321853637695, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8540965914726257, + "num_tokens": 155786466.0, + "step": 4079 + }, + { + "epoch": 0.5190179366492813, + "ewc_loss": 0.02902381494641304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4511907465930562e-05, + "grad_norm": 23.20955467224121, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8550223708152771, + "num_tokens": 155833145.0, + "step": 4080 + }, + { + "epoch": 0.5191451469278717, + "ewc_loss": 0.028982456773519516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4491228284896351e-05, + "grad_norm": 23.172225952148438, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.84478759765625, + "num_tokens": 155869533.0, + "step": 4081 + }, + { + "epoch": 0.5192723572064623, + "ewc_loss": 0.029060473665595055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4530236512655392e-05, + "grad_norm": 23.23777961730957, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8529357314109802, + "num_tokens": 155909329.0, + "step": 4082 + }, + { + "epoch": 0.5193995674850528, + "ewc_loss": 0.02901059202849865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4505296348943375e-05, + "grad_norm": 23.22890853881836, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8629388213157654, + "num_tokens": 155947848.0, + "step": 4083 + }, + { + "epoch": 0.5195267777636433, + "ewc_loss": 0.029075762256979942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4537880815623794e-05, + "grad_norm": 23.237606048583984, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8580324649810791, + "num_tokens": 155982693.0, + "step": 4084 + }, + { + "epoch": 0.5196539880422338, + "ewc_loss": 0.029043741524219513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4521870980388485e-05, + "grad_norm": 23.285293579101562, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8532122373580933, + "num_tokens": 156023414.0, + "step": 4085 + }, + { + "epoch": 0.5197811983208244, + "ewc_loss": 0.028977178037166595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4488588931271806e-05, + "grad_norm": 23.110376358032227, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8539537787437439, + "num_tokens": 156063488.0, + "step": 4086 + }, + { + "epoch": 0.5199084085994148, + "ewc_loss": 0.029052024707198143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4526011909765657e-05, + "grad_norm": 23.227005004882812, + "learning_rate": 1e-06, + "loss": 0.4633, + 
"mean_token_accuracy": 0.8539708852767944, + "num_tokens": 156103081.0, + "step": 4087 + }, + { + "epoch": 0.5200356188780053, + "ewc_loss": 0.029096024110913277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4548011677106842e-05, + "grad_norm": 23.19268798828125, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8449063301086426, + "num_tokens": 156139453.0, + "step": 4088 + }, + { + "epoch": 0.5201628291565958, + "ewc_loss": 0.029110006988048553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4555003872374073e-05, + "grad_norm": 23.227813720703125, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8485751152038574, + "num_tokens": 156176564.0, + "step": 4089 + }, + { + "epoch": 0.5202900394351864, + "ewc_loss": 0.029115833342075348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4557916983903851e-05, + "grad_norm": 23.214189529418945, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8570647835731506, + "num_tokens": 156221278.0, + "step": 4090 + }, + { + "epoch": 0.5204172497137769, + "ewc_loss": 0.029085487127304077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4542743883794174e-05, + "grad_norm": 23.339536666870117, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8596291542053223, + "num_tokens": 156259124.0, + "step": 4091 + }, + { + "epoch": 0.5205444599923674, + "ewc_loss": 0.02913258597254753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4566292520612478e-05, + "grad_norm": 23.232582092285156, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8441371917724609, + "num_tokens": 156297832.0, + "step": 4092 + }, + { + "epoch": 0.5206716702709578, + "ewc_loss": 0.029065296053886414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4532648492604494e-05, + "grad_norm": 23.30179214477539, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8610570430755615, + "num_tokens": 156329745.0, + "step": 4093 + }, + { + "epoch": 
0.5207988805495484, + "ewc_loss": 0.029063204303383827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4531602573697455e-05, + "grad_norm": 23.26943588256836, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8536636829376221, + "num_tokens": 156360154.0, + "step": 4094 + }, + { + "epoch": 0.5209260908281389, + "ewc_loss": 0.02904076874256134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4520383956551086e-05, + "grad_norm": 23.15605354309082, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8555464148521423, + "num_tokens": 156399360.0, + "step": 4095 + }, + { + "epoch": 0.5210533011067294, + "ewc_loss": 0.02915172465145588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4575862223864533e-05, + "grad_norm": 23.40826988220215, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8527670502662659, + "num_tokens": 156438782.0, + "step": 4096 + }, + { + "epoch": 0.52118051138532, + "ewc_loss": 0.029092658311128616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4546329111908562e-05, + "grad_norm": 23.205570220947266, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8595664501190186, + "num_tokens": 156470261.0, + "step": 4097 + }, + { + "epoch": 0.5213077216639105, + "ewc_loss": 0.029135318472981453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4567659491149243e-05, + "grad_norm": 23.2321834564209, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8369224071502686, + "num_tokens": 156510493.0, + "step": 4098 + }, + { + "epoch": 0.5214349319425009, + "ewc_loss": 0.02915298379957676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.457649159419816e-05, + "grad_norm": 23.291976928710938, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8562191128730774, + "num_tokens": 156545040.0, + "step": 4099 + }, + { + "epoch": 0.5215621422210914, + "ewc_loss": 0.029142169281840324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.457108464819612e-05, + "grad_norm": 23.097187042236328, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8526009917259216, + "num_tokens": 156583474.0, + "step": 4100 + }, + { + "epoch": 0.521689352499682, + "ewc_loss": 0.029128678143024445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.456433892599307e-05, + "grad_norm": 23.23877716064453, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8537459969520569, + "num_tokens": 156627137.0, + "step": 4101 + }, + { + "epoch": 0.5218165627782725, + "ewc_loss": 0.029269535094499588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.463476746721426e-05, + "grad_norm": 23.32879066467285, + "learning_rate": 1e-06, + "loss": 0.5049, + "mean_token_accuracy": 0.8434240221977234, + "num_tokens": 156664726.0, + "step": 4102 + }, + { + "epoch": 0.521943773056863, + "ewc_loss": 0.02914930135011673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4574650776921771e-05, + "grad_norm": 23.186471939086914, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8714779615402222, + "num_tokens": 156707014.0, + "step": 4103 + }, + { + "epoch": 0.5220709833354535, + "ewc_loss": 0.029202846810221672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4601423572457861e-05, + "grad_norm": 23.26133155822754, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8506039381027222, + "num_tokens": 156747784.0, + "step": 4104 + }, + { + "epoch": 0.522198193614044, + "ewc_loss": 0.029191767796874046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4595883840229362e-05, + "grad_norm": 23.26577377319336, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8521115183830261, + "num_tokens": 156784760.0, + "step": 4105 + }, + { + "epoch": 0.5223254038926345, + "ewc_loss": 0.029170652851462364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4585326425731182e-05, + "grad_norm": 23.258268356323242, + "learning_rate": 1e-06, + "loss": 0.4733, + 
"mean_token_accuracy": 0.8516842126846313, + "num_tokens": 156822899.0, + "step": 4106 + }, + { + "epoch": 0.522452614171225, + "ewc_loss": 0.02917100116610527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.458550013921922e-05, + "grad_norm": 23.251983642578125, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8598442077636719, + "num_tokens": 156861308.0, + "step": 4107 + }, + { + "epoch": 0.5225798244498155, + "ewc_loss": 0.029176492244005203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4588245903723873e-05, + "grad_norm": 23.274192810058594, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8607581853866577, + "num_tokens": 156901981.0, + "step": 4108 + }, + { + "epoch": 0.5227070347284061, + "ewc_loss": 0.029196586459875107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4598293091694359e-05, + "grad_norm": 23.17792510986328, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8594618439674377, + "num_tokens": 156942215.0, + "step": 4109 + }, + { + "epoch": 0.5228342450069966, + "ewc_loss": 0.02917328290641308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4586641555069946e-05, + "grad_norm": 23.270456314086914, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8601766228675842, + "num_tokens": 156983820.0, + "step": 4110 + }, + { + "epoch": 0.5229614552855871, + "ewc_loss": 0.029187900945544243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4593950254493393e-05, + "grad_norm": 23.138023376464844, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8665170073509216, + "num_tokens": 157016215.0, + "step": 4111 + }, + { + "epoch": 0.5230886655641775, + "ewc_loss": 0.029091624543070793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4545812518917955e-05, + "grad_norm": 23.214012145996094, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8535507917404175, + "num_tokens": 157052071.0, + "step": 4112 + }, + { + "epoch": 
0.5232158758427681, + "ewc_loss": 0.029237691313028336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4618845852965023e-05, + "grad_norm": 23.225454330444336, + "learning_rate": 1e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8321497440338135, + "num_tokens": 157097711.0, + "step": 4113 + }, + { + "epoch": 0.5233430861213586, + "ewc_loss": 0.02916480042040348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4582400581275579e-05, + "grad_norm": 23.229820251464844, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.85207200050354, + "num_tokens": 157134801.0, + "step": 4114 + }, + { + "epoch": 0.5234702963999491, + "ewc_loss": 0.02917100489139557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4585502867703326e-05, + "grad_norm": 23.243316650390625, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8550649881362915, + "num_tokens": 157171885.0, + "step": 4115 + }, + { + "epoch": 0.5235975066785397, + "ewc_loss": 0.02914394810795784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4571974134014454e-05, + "grad_norm": 23.170015335083008, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.852668046951294, + "num_tokens": 157211915.0, + "step": 4116 + }, + { + "epoch": 0.5237247169571302, + "ewc_loss": 0.02921026386320591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.460513158235699e-05, + "grad_norm": 23.29637908935547, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.848070502281189, + "num_tokens": 157260564.0, + "step": 4117 + }, + { + "epoch": 0.5238519272357206, + "ewc_loss": 0.029183706268668175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4591852959711105e-05, + "grad_norm": 23.184715270996094, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8546077013015747, + "num_tokens": 157294676.0, + "step": 4118 + }, + { + "epoch": 0.5239791375143111, + "ewc_loss": 0.029138486832380295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4569243830919731e-05, + "grad_norm": 23.191877365112305, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8651396632194519, + "num_tokens": 157334607.0, + "step": 4119 + }, + { + "epoch": 0.5241063477929017, + "ewc_loss": 0.0291788037866354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4589401871489827e-05, + "grad_norm": 23.213680267333984, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8553839921951294, + "num_tokens": 157372021.0, + "step": 4120 + }, + { + "epoch": 0.5242335580714922, + "ewc_loss": 0.029199793934822083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4599896530853584e-05, + "grad_norm": 23.302082061767578, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8588850498199463, + "num_tokens": 157410092.0, + "step": 4121 + }, + { + "epoch": 0.5243607683500827, + "ewc_loss": 0.02914772927761078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.457386497349944e-05, + "grad_norm": 23.21260643005371, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8506793975830078, + "num_tokens": 157447979.0, + "step": 4122 + }, + { + "epoch": 0.5244879786286732, + "ewc_loss": 0.0292112585157156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.460562907595886e-05, + "grad_norm": 23.37409210205078, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8489296436309814, + "num_tokens": 157481376.0, + "step": 4123 + }, + { + "epoch": 0.5246151889072637, + "ewc_loss": 0.029159773141145706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4579886737919878e-05, + "grad_norm": 23.23903465270996, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8596221804618835, + "num_tokens": 157519555.0, + "step": 4124 + }, + { + "epoch": 0.5247423991858542, + "ewc_loss": 0.029102768748998642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4551384083461016e-05, + "grad_norm": 23.14138412475586, + "learning_rate": 1e-06, + "loss": 0.4602, + 
"mean_token_accuracy": 0.8517716526985168, + "num_tokens": 157551849.0, + "step": 4125 + }, + { + "epoch": 0.5248696094644447, + "ewc_loss": 0.029226304963231087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4613152416131925e-05, + "grad_norm": 23.29489517211914, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.85378497838974, + "num_tokens": 157590823.0, + "step": 4126 + }, + { + "epoch": 0.5249968197430352, + "ewc_loss": 0.029210969805717468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4605484466301277e-05, + "grad_norm": 23.190454483032227, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8649464249610901, + "num_tokens": 157633771.0, + "step": 4127 + }, + { + "epoch": 0.5251240300216258, + "ewc_loss": 0.029187744483351707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.459387203794904e-05, + "grad_norm": 23.244953155517578, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8684402704238892, + "num_tokens": 157667796.0, + "step": 4128 + }, + { + "epoch": 0.5252512403002163, + "ewc_loss": 0.029208006337285042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.460400289943209e-05, + "grad_norm": 23.32805633544922, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8643175959587097, + "num_tokens": 157709452.0, + "step": 4129 + }, + { + "epoch": 0.5253784505788067, + "ewc_loss": 0.02922380343079567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4611901860916987e-05, + "grad_norm": 23.246227264404297, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8450566530227661, + "num_tokens": 157741407.0, + "step": 4130 + }, + { + "epoch": 0.5255056608573972, + "ewc_loss": 0.02921619452536106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4608097444579471e-05, + "grad_norm": 23.275882720947266, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8654298782348633, + "num_tokens": 157777693.0, + "step": 4131 + }, + { + "epoch": 
0.5256328711359878, + "ewc_loss": 0.029207132756710052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4603566341975238e-05, + "grad_norm": 23.327852249145508, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8527789115905762, + "num_tokens": 157814729.0, + "step": 4132 + }, + { + "epoch": 0.5257600814145783, + "ewc_loss": 0.029182931408286095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.459146551496815e-05, + "grad_norm": 23.175199508666992, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8640735745429993, + "num_tokens": 157853698.0, + "step": 4133 + }, + { + "epoch": 0.5258872916931688, + "ewc_loss": 0.029149064794182777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4574532542610541e-05, + "grad_norm": 23.249412536621094, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8590397834777832, + "num_tokens": 157888580.0, + "step": 4134 + }, + { + "epoch": 0.5260145019717594, + "ewc_loss": 0.029230408370494843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4615204236179125e-05, + "grad_norm": 23.31712532043457, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8524128198623657, + "num_tokens": 157928434.0, + "step": 4135 + }, + { + "epoch": 0.5261417122503498, + "ewc_loss": 0.029216913506388664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4608456694986671e-05, + "grad_norm": 23.215808868408203, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8497477173805237, + "num_tokens": 157968936.0, + "step": 4136 + }, + { + "epoch": 0.5262689225289403, + "ewc_loss": 0.029225360602140427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4612680388381705e-05, + "grad_norm": 23.270692825317383, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8439459800720215, + "num_tokens": 158005496.0, + "step": 4137 + }, + { + "epoch": 0.5263961328075308, + "ewc_loss": 0.02929440140724182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4647201169282198e-05, + "grad_norm": 23.31464195251465, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8620144128799438, + "num_tokens": 158047569.0, + "step": 4138 + }, + { + "epoch": 0.5265233430861214, + "ewc_loss": 0.0292314775288105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4615739019063767e-05, + "grad_norm": 23.34855079650879, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8539413213729858, + "num_tokens": 158085733.0, + "step": 4139 + }, + { + "epoch": 0.5266505533647119, + "ewc_loss": 0.029250051826238632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.462502586946357e-05, + "grad_norm": 23.226755142211914, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8444327712059021, + "num_tokens": 158123906.0, + "step": 4140 + }, + { + "epoch": 0.5267777636433024, + "ewc_loss": 0.029243476688861847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4621738046116661e-05, + "grad_norm": 23.3514404296875, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8607087135314941, + "num_tokens": 158163663.0, + "step": 4141 + }, + { + "epoch": 0.5269049739218928, + "ewc_loss": 0.02924455888569355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4622279195464216e-05, + "grad_norm": 23.274681091308594, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8428021669387817, + "num_tokens": 158205878.0, + "step": 4142 + }, + { + "epoch": 0.5270321842004834, + "ewc_loss": 0.02919750101864338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.459875056752935e-05, + "grad_norm": 23.214580535888672, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8579022288322449, + "num_tokens": 158241608.0, + "step": 4143 + }, + { + "epoch": 0.5271593944790739, + "ewc_loss": 0.02927209623157978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4636048035754357e-05, + "grad_norm": 23.370450973510742, + "learning_rate": 1e-06, + "loss": 0.4195, + 
"mean_token_accuracy": 0.8672446012496948, + "num_tokens": 158274259.0, + "step": 4144 + }, + { + "epoch": 0.5272866047576644, + "ewc_loss": 0.029238687828183174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4619344256061595e-05, + "grad_norm": 23.281902313232422, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8608536124229431, + "num_tokens": 158304793.0, + "step": 4145 + }, + { + "epoch": 0.527413815036255, + "ewc_loss": 0.0292633306235075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4631665180786513e-05, + "grad_norm": 23.263151168823242, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8686107397079468, + "num_tokens": 158349420.0, + "step": 4146 + }, + { + "epoch": 0.5275410253148455, + "ewc_loss": 0.029285328462719917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4642664609709755e-05, + "grad_norm": 23.287694931030273, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.853569507598877, + "num_tokens": 158395834.0, + "step": 4147 + }, + { + "epoch": 0.5276682355934359, + "ewc_loss": 0.029292389750480652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4646195268142037e-05, + "grad_norm": 23.205402374267578, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8565597534179688, + "num_tokens": 158435568.0, + "step": 4148 + }, + { + "epoch": 0.5277954458720264, + "ewc_loss": 0.02923300303518772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4616501175623853e-05, + "grad_norm": 23.247419357299805, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8645039200782776, + "num_tokens": 158473584.0, + "step": 4149 + }, + { + "epoch": 0.527922656150617, + "ewc_loss": 0.029296446591615677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.464822344132699e-05, + "grad_norm": 23.210664749145508, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8562332391738892, + "num_tokens": 158513435.0, + "step": 4150 + }, + { + "epoch": 
0.5280498664292075, + "ewc_loss": 0.02930430881679058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4652154277428053e-05, + "grad_norm": 23.318906784057617, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8552758693695068, + "num_tokens": 158549478.0, + "step": 4151 + }, + { + "epoch": 0.528177076707798, + "ewc_loss": 0.029283612966537476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4641806046711281e-05, + "grad_norm": 23.308992385864258, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8591783046722412, + "num_tokens": 158598742.0, + "step": 4152 + }, + { + "epoch": 0.5283042869863885, + "ewc_loss": 0.02920708991587162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4603544514102396e-05, + "grad_norm": 23.09565544128418, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.8432706594467163, + "num_tokens": 158636806.0, + "step": 4153 + }, + { + "epoch": 0.528431497264979, + "ewc_loss": 0.029257245361804962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4628622921009082e-05, + "grad_norm": 23.26137924194336, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8489090204238892, + "num_tokens": 158676043.0, + "step": 4154 + }, + { + "epoch": 0.5285587075435695, + "ewc_loss": 0.029346013441681862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4673006262455601e-05, + "grad_norm": 23.29248046875, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8389437794685364, + "num_tokens": 158711772.0, + "step": 4155 + }, + { + "epoch": 0.52868591782216, + "ewc_loss": 0.029230359941720963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4615179679822177e-05, + "grad_norm": 23.149810791015625, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8537814021110535, + "num_tokens": 158749608.0, + "step": 4156 + }, + { + "epoch": 0.5288131281007505, + "ewc_loss": 0.02927270159125328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4636350897490047e-05, 
+ "grad_norm": 23.2939453125, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8560731410980225, + "num_tokens": 158790502.0, + "step": 4157 + }, + { + "epoch": 0.5289403383793411, + "ewc_loss": 0.029255101457238197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4627550626755692e-05, + "grad_norm": 23.19169807434082, + "learning_rate": 1e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8308110237121582, + "num_tokens": 158828415.0, + "step": 4158 + }, + { + "epoch": 0.5290675486579316, + "ewc_loss": 0.029278971254825592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4639485925727058e-05, + "grad_norm": 23.267499923706055, + "learning_rate": 1e-06, + "loss": 0.494, + "mean_token_accuracy": 0.8441633582115173, + "num_tokens": 158866972.0, + "step": 4159 + }, + { + "epoch": 0.5291947589365221, + "ewc_loss": 0.02927488461136818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4637442291132174e-05, + "grad_norm": 23.2615909576416, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8516634702682495, + "num_tokens": 158902991.0, + "step": 4160 + }, + { + "epoch": 0.5293219692151125, + "ewc_loss": 0.029301922768354416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4650961020379327e-05, + "grad_norm": 23.26706886291504, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8622369766235352, + "num_tokens": 158945963.0, + "step": 4161 + }, + { + "epoch": 0.5294491794937031, + "ewc_loss": 0.02928899973630905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4644499970017932e-05, + "grad_norm": 23.28672981262207, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8540467619895935, + "num_tokens": 158985407.0, + "step": 4162 + }, + { + "epoch": 0.5295763897722936, + "ewc_loss": 0.02929597906768322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4647989701188635e-05, + "grad_norm": 23.27657699584961, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8480307459831238, 
+ "num_tokens": 159024387.0, + "step": 4163 + }, + { + "epoch": 0.5297036000508841, + "ewc_loss": 0.02929707057774067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4648535398009699e-05, + "grad_norm": 23.19816017150879, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8464101552963257, + "num_tokens": 159061430.0, + "step": 4164 + }, + { + "epoch": 0.5298308103294747, + "ewc_loss": 0.02936774492263794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4683872905152384e-05, + "grad_norm": 23.372638702392578, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8560341596603394, + "num_tokens": 159099350.0, + "step": 4165 + }, + { + "epoch": 0.5299580206080652, + "ewc_loss": 0.029370594769716263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.468529717385536e-05, + "grad_norm": 23.257654190063477, + "learning_rate": 1e-06, + "loss": 0.557, + "mean_token_accuracy": 0.8285911083221436, + "num_tokens": 159136276.0, + "step": 4166 + }, + { + "epoch": 0.5300852308866556, + "ewc_loss": 0.02932565100491047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4662825378763955e-05, + "grad_norm": 23.430927276611328, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8763223886489868, + "num_tokens": 159177102.0, + "step": 4167 + }, + { + "epoch": 0.5302124411652461, + "ewc_loss": 0.029367797076702118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4683898371004034e-05, + "grad_norm": 23.255170822143555, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8657922148704529, + "num_tokens": 159213582.0, + "step": 4168 + }, + { + "epoch": 0.5303396514438367, + "ewc_loss": 0.02927067130804062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4635335901402868e-05, + "grad_norm": 23.202896118164062, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8533821702003479, + "num_tokens": 159254283.0, + "step": 4169 + }, + { + "epoch": 0.5304668617224272, + "ewc_loss": 
0.02934367023408413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4671834833279718e-05, + "grad_norm": 23.406757354736328, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8570506572723389, + "num_tokens": 159288557.0, + "step": 4170 + }, + { + "epoch": 0.5305940720010177, + "ewc_loss": 0.02940303459763527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4701517102366779e-05, + "grad_norm": 23.437034606933594, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8641708493232727, + "num_tokens": 159325047.0, + "step": 4171 + }, + { + "epoch": 0.5307212822796082, + "ewc_loss": 0.029295191168785095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4647595889982767e-05, + "grad_norm": 23.2547664642334, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8589965105056763, + "num_tokens": 159366291.0, + "step": 4172 + }, + { + "epoch": 0.5308484925581987, + "ewc_loss": 0.029360273852944374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4680136700917501e-05, + "grad_norm": 23.415950775146484, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8483960628509521, + "num_tokens": 159401634.0, + "step": 4173 + }, + { + "epoch": 0.5309757028367892, + "ewc_loss": 0.02933305688202381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4666528841189574e-05, + "grad_norm": 23.1552734375, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8465411067008972, + "num_tokens": 159435316.0, + "step": 4174 + }, + { + "epoch": 0.5311029131153797, + "ewc_loss": 0.029343698173761368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4671849385194946e-05, + "grad_norm": 23.403194427490234, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8472964763641357, + "num_tokens": 159472192.0, + "step": 4175 + }, + { + "epoch": 0.5312301233939702, + "ewc_loss": 0.02942577563226223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4712887605128344e-05, + "grad_norm": 
23.30912208557129, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8617954254150391, + "num_tokens": 159515342.0, + "step": 4176 + }, + { + "epoch": 0.5313573336725608, + "ewc_loss": 0.029325267300009727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4662633475381881e-05, + "grad_norm": 23.271970748901367, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.858538806438446, + "num_tokens": 159553932.0, + "step": 4177 + }, + { + "epoch": 0.5314845439511513, + "ewc_loss": 0.029341204091906548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4670602467958815e-05, + "grad_norm": 23.415987014770508, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.855563759803772, + "num_tokens": 159584711.0, + "step": 4178 + }, + { + "epoch": 0.5316117542297417, + "ewc_loss": 0.029357459396123886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4678729712613858e-05, + "grad_norm": 23.275529861450195, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8421950936317444, + "num_tokens": 159618784.0, + "step": 4179 + }, + { + "epoch": 0.5317389645083322, + "ewc_loss": 0.029276588931679726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4638294487667736e-05, + "grad_norm": 23.198434829711914, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8532170057296753, + "num_tokens": 159661288.0, + "step": 4180 + }, + { + "epoch": 0.5318661747869228, + "ewc_loss": 0.029398981481790543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4699490748171229e-05, + "grad_norm": 23.332157135009766, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8652170896530151, + "num_tokens": 159701286.0, + "step": 4181 + }, + { + "epoch": 0.5319933850655133, + "ewc_loss": 0.029319429770112038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4659714906883892e-05, + "grad_norm": 23.23077392578125, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8415416479110718, + 
"num_tokens": 159746631.0, + "step": 4182 + }, + { + "epoch": 0.5321205953441038, + "ewc_loss": 0.029402105137705803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4701052350574173e-05, + "grad_norm": 23.3145809173584, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.857566237449646, + "num_tokens": 159787753.0, + "step": 4183 + }, + { + "epoch": 0.5322478056226944, + "ewc_loss": 0.029396189376711845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4698094673804007e-05, + "grad_norm": 23.269075393676758, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.8473073244094849, + "num_tokens": 159823092.0, + "step": 4184 + }, + { + "epoch": 0.5323750159012848, + "ewc_loss": 0.029377654194831848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4688826922792941e-05, + "grad_norm": 23.325761795043945, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8452411890029907, + "num_tokens": 159858480.0, + "step": 4185 + }, + { + "epoch": 0.5325022261798753, + "ewc_loss": 0.029448725283145905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4724362699780613e-05, + "grad_norm": 23.420135498046875, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8466408252716064, + "num_tokens": 159896220.0, + "step": 4186 + }, + { + "epoch": 0.5326294364584658, + "ewc_loss": 0.02934468723833561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4672343240818009e-05, + "grad_norm": 23.252643585205078, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8520892858505249, + "num_tokens": 159930995.0, + "step": 4187 + }, + { + "epoch": 0.5327566467370564, + "ewc_loss": 0.029368985444307327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4684492271044292e-05, + "grad_norm": 23.248937606811523, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8499817848205566, + "num_tokens": 159968013.0, + "step": 4188 + }, + { + "epoch": 0.5328838570156469, + "ewc_loss": 
0.029427826404571533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4713913515151944e-05, + "grad_norm": 23.303571701049805, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8479476571083069, + "num_tokens": 160004085.0, + "step": 4189 + }, + { + "epoch": 0.5330110672942374, + "ewc_loss": 0.029411466792225838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4705733519804198e-05, + "grad_norm": 23.30565071105957, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8642064332962036, + "num_tokens": 160040606.0, + "step": 4190 + }, + { + "epoch": 0.5331382775728278, + "ewc_loss": 0.029376648366451263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4688323972222861e-05, + "grad_norm": 23.231428146362305, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8666622638702393, + "num_tokens": 160080792.0, + "step": 4191 + }, + { + "epoch": 0.5332654878514184, + "ewc_loss": 0.0294377151876688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4718857528350782e-05, + "grad_norm": 23.34733772277832, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8438997268676758, + "num_tokens": 160117055.0, + "step": 4192 + }, + { + "epoch": 0.5333926981300089, + "ewc_loss": 0.029388489201664925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4694244782731403e-05, + "grad_norm": 23.2215576171875, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8600846529006958, + "num_tokens": 160153223.0, + "step": 4193 + }, + { + "epoch": 0.5335199084085994, + "ewc_loss": 0.029411302879452705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4705651665281039e-05, + "grad_norm": 23.33783531188965, + "learning_rate": 1e-06, + "loss": 0.5215, + "mean_token_accuracy": 0.8367167711257935, + "num_tokens": 160190920.0, + "step": 4194 + }, + { + "epoch": 0.53364711868719, + "ewc_loss": 0.029492149129509926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4746074157301337e-05, + "grad_norm": 
23.29844856262207, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8517630696296692, + "num_tokens": 160221898.0, + "step": 4195 + }, + { + "epoch": 0.5337743289657805, + "ewc_loss": 0.029433824121952057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.471691211918369e-05, + "grad_norm": 23.339513778686523, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8577488660812378, + "num_tokens": 160267102.0, + "step": 4196 + }, + { + "epoch": 0.5339015392443709, + "ewc_loss": 0.029471028596162796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4735514014319051e-05, + "grad_norm": 23.298900604248047, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8597598075866699, + "num_tokens": 160304422.0, + "step": 4197 + }, + { + "epoch": 0.5340287495229614, + "ewc_loss": 0.029469165951013565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4734582691744436e-05, + "grad_norm": 23.308576583862305, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8521482944488525, + "num_tokens": 160339006.0, + "step": 4198 + }, + { + "epoch": 0.534155959801552, + "ewc_loss": 0.029456019401550293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4728009773534723e-05, + "grad_norm": 23.398088455200195, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8746687769889832, + "num_tokens": 160377504.0, + "step": 4199 + }, + { + "epoch": 0.5342831700801425, + "ewc_loss": 0.029476439580321312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4738219761056826e-05, + "grad_norm": 23.27212142944336, + "learning_rate": 1e-06, + "loss": 0.5242, + "mean_token_accuracy": 0.8338078856468201, + "num_tokens": 160413315.0, + "step": 4200 + }, + { + "epoch": 0.534410380358733, + "ewc_loss": 0.029414834454655647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.470741699449718e-05, + "grad_norm": 23.33390235900879, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8679622411727905, + 
"num_tokens": 160455034.0, + "step": 4201 + }, + { + "epoch": 0.5345375906373235, + "ewc_loss": 0.029478304088115692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4739151993126143e-05, + "grad_norm": 23.357084274291992, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8493026494979858, + "num_tokens": 160490505.0, + "step": 4202 + }, + { + "epoch": 0.534664800915914, + "ewc_loss": 0.029474178329110146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4737089259142522e-05, + "grad_norm": 23.284536361694336, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8648179173469543, + "num_tokens": 160528712.0, + "step": 4203 + }, + { + "epoch": 0.5347920111945045, + "ewc_loss": 0.02946225181221962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4731125702382997e-05, + "grad_norm": 23.360193252563477, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8455121517181396, + "num_tokens": 160567945.0, + "step": 4204 + }, + { + "epoch": 0.534919221473095, + "ewc_loss": 0.029476938769221306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4738468962605111e-05, + "grad_norm": 23.288084030151367, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8616111874580383, + "num_tokens": 160596767.0, + "step": 4205 + }, + { + "epoch": 0.5350464317516855, + "ewc_loss": 0.029413672164082527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4706835827382747e-05, + "grad_norm": 23.380300521850586, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8510038256645203, + "num_tokens": 160636912.0, + "step": 4206 + }, + { + "epoch": 0.5351736420302761, + "ewc_loss": 0.02948334813117981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4741674021934159e-05, + "grad_norm": 23.341176986694336, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8477255702018738, + "num_tokens": 160674588.0, + "step": 4207 + }, + { + "epoch": 0.5353008523088666, + "ewc_loss": 
0.02944607101380825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4723035747010726e-05, + "grad_norm": 23.416641235351562, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8429137468338013, + "num_tokens": 160713441.0, + "step": 4208 + }, + { + "epoch": 0.5354280625874571, + "ewc_loss": 0.029476583003997803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4738291611138266e-05, + "grad_norm": 23.389596939086914, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8638232946395874, + "num_tokens": 160752716.0, + "step": 4209 + }, + { + "epoch": 0.5355552728660475, + "ewc_loss": 0.029409604147076607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4704802197229583e-05, + "grad_norm": 23.246747970581055, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8654888868331909, + "num_tokens": 160798771.0, + "step": 4210 + }, + { + "epoch": 0.5356824831446381, + "ewc_loss": 0.02938614785671234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4693074263050221e-05, + "grad_norm": 23.273086547851562, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8721837997436523, + "num_tokens": 160843085.0, + "step": 4211 + }, + { + "epoch": 0.5358096934232286, + "ewc_loss": 0.029451103881001472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4725552318850532e-05, + "grad_norm": 23.404489517211914, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8471928834915161, + "num_tokens": 160883073.0, + "step": 4212 + }, + { + "epoch": 0.5359369037018191, + "ewc_loss": 0.029422754421830177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.47113769344287e-05, + "grad_norm": 23.282108306884766, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8613698482513428, + "num_tokens": 160923734.0, + "step": 4213 + }, + { + "epoch": 0.5360641139804097, + "ewc_loss": 0.029465483501553535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4732741874468047e-05, + "grad_norm": 
23.371095657348633, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8514171242713928, + "num_tokens": 160963460.0, + "step": 4214 + }, + { + "epoch": 0.5361913242590002, + "ewc_loss": 0.029406502842903137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.470325150876306e-05, + "grad_norm": 23.43628692626953, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8520239591598511, + "num_tokens": 161003691.0, + "step": 4215 + }, + { + "epoch": 0.5363185345375906, + "ewc_loss": 0.029458746314048767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.472937310609268e-05, + "grad_norm": 23.213085174560547, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8551710247993469, + "num_tokens": 161042056.0, + "step": 4216 + }, + { + "epoch": 0.5364457448161811, + "ewc_loss": 0.029372824355959892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4686412214359734e-05, + "grad_norm": 23.352977752685547, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.878325343132019, + "num_tokens": 161078670.0, + "step": 4217 + }, + { + "epoch": 0.5365729550947717, + "ewc_loss": 0.029493434354662895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.474671717005549e-05, + "grad_norm": 23.25655746459961, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8675543665885925, + "num_tokens": 161113992.0, + "step": 4218 + }, + { + "epoch": 0.5367001653733622, + "ewc_loss": 0.02941841073334217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4709205061080866e-05, + "grad_norm": 23.343456268310547, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8598074913024902, + "num_tokens": 161153763.0, + "step": 4219 + }, + { + "epoch": 0.5368273756519527, + "ewc_loss": 0.029490530490875244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.474526561651146e-05, + "grad_norm": 23.35918617248535, + "learning_rate": 1e-06, + "loss": 0.5084, + "mean_token_accuracy": 0.8392388820648193, + 
"num_tokens": 161184186.0, + "step": 4220 + }, + { + "epoch": 0.5369545859305432, + "ewc_loss": 0.029416462406516075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4708230992255267e-05, + "grad_norm": 23.340526580810547, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8530522584915161, + "num_tokens": 161219298.0, + "step": 4221 + }, + { + "epoch": 0.5370817962091337, + "ewc_loss": 0.02950219251215458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4751096387044527e-05, + "grad_norm": 23.28859519958496, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8700613379478455, + "num_tokens": 161255252.0, + "step": 4222 + }, + { + "epoch": 0.5372090064877242, + "ewc_loss": 0.029414160177111626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4707080481457524e-05, + "grad_norm": 23.3654842376709, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8592759370803833, + "num_tokens": 161295890.0, + "step": 4223 + }, + { + "epoch": 0.5373362167663147, + "ewc_loss": 0.02944999746978283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4724998436577152e-05, + "grad_norm": 23.180917739868164, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8516806960105896, + "num_tokens": 161330808.0, + "step": 4224 + }, + { + "epoch": 0.5374634270449052, + "ewc_loss": 0.029431326314806938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4715663382958155e-05, + "grad_norm": 23.328794479370117, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.863250732421875, + "num_tokens": 161370515.0, + "step": 4225 + }, + { + "epoch": 0.5375906373234958, + "ewc_loss": 0.029493752866983414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.47468763316283e-05, + "grad_norm": 23.33755111694336, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.867640495300293, + "num_tokens": 161405257.0, + "step": 4226 + }, + { + "epoch": 0.5377178476020863, + "ewc_loss": 0.029504425823688507, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4752213246538304e-05, + "grad_norm": 23.307985305786133, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8560492992401123, + "num_tokens": 161443526.0, + "step": 4227 + }, + { + "epoch": 0.5378450578806767, + "ewc_loss": 0.029522109776735306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4761054444534238e-05, + "grad_norm": 23.40259552001953, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8487281203269958, + "num_tokens": 161481262.0, + "step": 4228 + }, + { + "epoch": 0.5379722681592672, + "ewc_loss": 0.02948981523513794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4744907275598962e-05, + "grad_norm": 23.232807159423828, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8404573202133179, + "num_tokens": 161517756.0, + "step": 4229 + }, + { + "epoch": 0.5380994784378578, + "ewc_loss": 0.029507337138056755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.475366843806114e-05, + "grad_norm": 23.435367584228516, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8632079362869263, + "num_tokens": 161552680.0, + "step": 4230 + }, + { + "epoch": 0.5382266887164483, + "ewc_loss": 0.02957051433622837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4785257008043118e-05, + "grad_norm": 23.341856002807617, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8515080213546753, + "num_tokens": 161588607.0, + "step": 4231 + }, + { + "epoch": 0.5383538989950388, + "ewc_loss": 0.02948792092502117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4743960491614416e-05, + "grad_norm": 23.257434844970703, + "learning_rate": 1e-06, + "loss": 0.5188, + "mean_token_accuracy": 0.8382751941680908, + "num_tokens": 161626427.0, + "step": 4232 + }, + { + "epoch": 0.5384811092736294, + "ewc_loss": 0.02956184186041355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4780920537305064e-05, + "grad_norm": 23.38751220703125, + 
"learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8609694242477417, + "num_tokens": 161665182.0, + "step": 4233 + }, + { + "epoch": 0.5386083195522198, + "ewc_loss": 0.029525842517614365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4762921637156978e-05, + "grad_norm": 23.286142349243164, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8559901118278503, + "num_tokens": 161704685.0, + "step": 4234 + }, + { + "epoch": 0.5387355298308103, + "ewc_loss": 0.029529137536883354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4764568732061889e-05, + "grad_norm": 23.337238311767578, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8608173131942749, + "num_tokens": 161742960.0, + "step": 4235 + }, + { + "epoch": 0.5388627401094008, + "ewc_loss": 0.02956053428351879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4780267520109192e-05, + "grad_norm": 23.249649047851562, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8550670742988586, + "num_tokens": 161778199.0, + "step": 4236 + }, + { + "epoch": 0.5389899503879914, + "ewc_loss": 0.029554443433880806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4777221622352954e-05, + "grad_norm": 23.386611938476562, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8512903451919556, + "num_tokens": 161811235.0, + "step": 4237 + }, + { + "epoch": 0.5391171606665819, + "ewc_loss": 0.029642969369888306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4821484910498839e-05, + "grad_norm": 23.312610626220703, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8547106981277466, + "num_tokens": 161847669.0, + "step": 4238 + }, + { + "epoch": 0.5392443709451724, + "ewc_loss": 0.02952766604721546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4763832950848155e-05, + "grad_norm": 23.325794219970703, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8415799736976624, + "num_tokens": 
161888470.0, + "step": 4239 + }, + { + "epoch": 0.5393715812237628, + "ewc_loss": 0.029634928330779076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.48174640344223e-05, + "grad_norm": 23.25775909423828, + "learning_rate": 1e-06, + "loss": 0.5234, + "mean_token_accuracy": 0.8362771272659302, + "num_tokens": 161924563.0, + "step": 4240 + }, + { + "epoch": 0.5394987915023534, + "ewc_loss": 0.029602939262986183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4801469660596922e-05, + "grad_norm": 23.350889205932617, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8411368131637573, + "num_tokens": 161959062.0, + "step": 4241 + }, + { + "epoch": 0.5396260017809439, + "ewc_loss": 0.029625514522194862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4812757399340626e-05, + "grad_norm": 23.269132614135742, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8589487075805664, + "num_tokens": 161994755.0, + "step": 4242 + }, + { + "epoch": 0.5397532120595344, + "ewc_loss": 0.029662733897566795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4831366570433602e-05, + "grad_norm": 23.324644088745117, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8636939525604248, + "num_tokens": 162032812.0, + "step": 4243 + }, + { + "epoch": 0.5398804223381249, + "ewc_loss": 0.02963012084364891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4815060239925515e-05, + "grad_norm": 23.300704956054688, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8590570688247681, + "num_tokens": 162067130.0, + "step": 4244 + }, + { + "epoch": 0.5400076326167155, + "ewc_loss": 0.029686661437153816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4843330973235425e-05, + "grad_norm": 23.322389602661133, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8568319082260132, + "num_tokens": 162107452.0, + "step": 4245 + }, + { + "epoch": 0.5401348428953059, + "ewc_loss": 0.02958471141755581, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4792355614190456e-05, + "grad_norm": 23.290613174438477, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8559140563011169, + "num_tokens": 162144220.0, + "step": 4246 + }, + { + "epoch": 0.5402620531738964, + "ewc_loss": 0.02965669333934784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4828347048023716e-05, + "grad_norm": 23.27806854248047, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8640997409820557, + "num_tokens": 162181798.0, + "step": 4247 + }, + { + "epoch": 0.540389263452487, + "ewc_loss": 0.029638618230819702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4819309399172198e-05, + "grad_norm": 23.265167236328125, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8474503755569458, + "num_tokens": 162222501.0, + "step": 4248 + }, + { + "epoch": 0.5405164737310775, + "ewc_loss": 0.029633615165948868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.481680737924762e-05, + "grad_norm": 23.23918914794922, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8515201807022095, + "num_tokens": 162265533.0, + "step": 4249 + }, + { + "epoch": 0.540643684009668, + "ewc_loss": 0.029686447232961655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4843223652860615e-05, + "grad_norm": 23.292829513549805, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8699200749397278, + "num_tokens": 162304984.0, + "step": 4250 + }, + { + "epoch": 0.5407708942882585, + "ewc_loss": 0.029681971296668053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4840985386399552e-05, + "grad_norm": 23.327686309814453, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8453354835510254, + "num_tokens": 162342358.0, + "step": 4251 + }, + { + "epoch": 0.540898104566849, + "ewc_loss": 0.029682425782084465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4841212760074995e-05, + "grad_norm": 23.286296844482422, + 
"learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8567564487457275, + "num_tokens": 162377343.0, + "step": 4252 + }, + { + "epoch": 0.5410253148454395, + "ewc_loss": 0.029705673456192017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4852837011858355e-05, + "grad_norm": 23.392541885375977, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.856201171875, + "num_tokens": 162416416.0, + "step": 4253 + }, + { + "epoch": 0.54115252512403, + "ewc_loss": 0.029683692380785942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.484184576838743e-05, + "grad_norm": 23.285337448120117, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.882652223110199, + "num_tokens": 162447108.0, + "step": 4254 + }, + { + "epoch": 0.5412797354026205, + "ewc_loss": 0.02967393584549427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4836968148301821e-05, + "grad_norm": 23.330768585205078, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8575361967086792, + "num_tokens": 162489659.0, + "step": 4255 + }, + { + "epoch": 0.5414069456812111, + "ewc_loss": 0.029661493375897408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4830746295046993e-05, + "grad_norm": 23.284948348999023, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8456842303276062, + "num_tokens": 162525816.0, + "step": 4256 + }, + { + "epoch": 0.5415341559598016, + "ewc_loss": 0.02964070625603199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4820353499089833e-05, + "grad_norm": 23.242055892944336, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.851587176322937, + "num_tokens": 162564154.0, + "step": 4257 + }, + { + "epoch": 0.5416613662383921, + "ewc_loss": 0.02974719926714897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.487359986640513e-05, + "grad_norm": 23.386812210083008, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8483383655548096, + "num_tokens": 162602728.0, + 
"step": 4258 + }, + { + "epoch": 0.5417885765169825, + "ewc_loss": 0.02964838221669197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4824190657236613e-05, + "grad_norm": 23.200084686279297, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.860753059387207, + "num_tokens": 162636659.0, + "step": 4259 + }, + { + "epoch": 0.5419157867955731, + "ewc_loss": 0.029750531539320946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4875266060698777e-05, + "grad_norm": 23.4222354888916, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8715760111808777, + "num_tokens": 162676639.0, + "step": 4260 + }, + { + "epoch": 0.5420429970741636, + "ewc_loss": 0.029688585549592972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4844292309135199e-05, + "grad_norm": 23.284767150878906, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8377259969711304, + "num_tokens": 162716763.0, + "step": 4261 + }, + { + "epoch": 0.5421702073527541, + "ewc_loss": 0.029683586210012436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4841793017694727e-05, + "grad_norm": 23.462705612182617, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8456822633743286, + "num_tokens": 162753943.0, + "step": 4262 + }, + { + "epoch": 0.5422974176313446, + "ewc_loss": 0.029677800834178925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4838900824543089e-05, + "grad_norm": 23.301908493041992, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.850558876991272, + "num_tokens": 162792938.0, + "step": 4263 + }, + { + "epoch": 0.5424246279099352, + "ewc_loss": 0.029659714549779892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4829856809228659e-05, + "grad_norm": 23.52376937866211, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8543934226036072, + "num_tokens": 162838938.0, + "step": 4264 + }, + { + "epoch": 0.5425518381885256, + "ewc_loss": 0.02969028614461422, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.4845142686681356e-05, + "grad_norm": 23.358095169067383, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8613326549530029, + "num_tokens": 162875026.0, + "step": 4265 + }, + { + "epoch": 0.5426790484671161, + "ewc_loss": 0.029590560123324394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4795279639656655e-05, + "grad_norm": 23.37749481201172, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8588513731956482, + "num_tokens": 162910724.0, + "step": 4266 + }, + { + "epoch": 0.5428062587457066, + "ewc_loss": 0.029632067307829857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4816033399256412e-05, + "grad_norm": 23.366546630859375, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8589812517166138, + "num_tokens": 162941602.0, + "step": 4267 + }, + { + "epoch": 0.5429334690242972, + "ewc_loss": 0.02965390868484974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4826954611635301e-05, + "grad_norm": 23.274433135986328, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.842153012752533, + "num_tokens": 162980760.0, + "step": 4268 + }, + { + "epoch": 0.5430606793028877, + "ewc_loss": 0.029680810868740082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4840405128779821e-05, + "grad_norm": 23.391639709472656, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8663451671600342, + "num_tokens": 163016155.0, + "step": 4269 + }, + { + "epoch": 0.5431878895814782, + "ewc_loss": 0.029752690345048904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.487634472141508e-05, + "grad_norm": 23.352598190307617, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.846571683883667, + "num_tokens": 163052519.0, + "step": 4270 + }, + { + "epoch": 0.5433150998600687, + "ewc_loss": 0.029658198356628418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4829099200142082e-05, + "grad_norm": 23.35001564025879, + "learning_rate": 1e-06, + "loss": 
0.4295, + "mean_token_accuracy": 0.8642600774765015, + "num_tokens": 163091806.0, + "step": 4271 + }, + { + "epoch": 0.5434423101386592, + "ewc_loss": 0.029716795310378075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4858397662464995e-05, + "grad_norm": 23.389385223388672, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8715277910232544, + "num_tokens": 163128830.0, + "step": 4272 + }, + { + "epoch": 0.5435695204172497, + "ewc_loss": 0.029655328020453453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4827664017502684e-05, + "grad_norm": 23.27428436279297, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8524880409240723, + "num_tokens": 163169567.0, + "step": 4273 + }, + { + "epoch": 0.5436967306958402, + "ewc_loss": 0.02973298169672489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4866491255816072e-05, + "grad_norm": 23.449480056762695, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8477747440338135, + "num_tokens": 163202913.0, + "step": 4274 + }, + { + "epoch": 0.5438239409744308, + "ewc_loss": 0.02971632406115532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4858162103337236e-05, + "grad_norm": 23.368648529052734, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8634805083274841, + "num_tokens": 163236741.0, + "step": 4275 + }, + { + "epoch": 0.5439511512530213, + "ewc_loss": 0.02970942109823227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4854710570944007e-05, + "grad_norm": 23.44053840637207, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8499422669410706, + "num_tokens": 163281402.0, + "step": 4276 + }, + { + "epoch": 0.5440783615316117, + "ewc_loss": 0.02969154715538025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4845773876004387e-05, + "grad_norm": 23.282175064086914, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8644139766693115, + "num_tokens": 163325288.0, + "step": 4277 + }, + { + "epoch": 
0.5442055718102022, + "ewc_loss": 0.02964773401618004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4823866877122782e-05, + "grad_norm": 23.374509811401367, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.845771312713623, + "num_tokens": 163367471.0, + "step": 4278 + }, + { + "epoch": 0.5443327820887928, + "ewc_loss": 0.02968878112733364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.484439053456299e-05, + "grad_norm": 23.419652938842773, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8561807870864868, + "num_tokens": 163400077.0, + "step": 4279 + }, + { + "epoch": 0.5444599923673833, + "ewc_loss": 0.02971784397959709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4858921531413216e-05, + "grad_norm": 23.419527053833008, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8592182993888855, + "num_tokens": 163433739.0, + "step": 4280 + }, + { + "epoch": 0.5445872026459738, + "ewc_loss": 0.029674693942070007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.483734740759246e-05, + "grad_norm": 23.440845489501953, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.856461763381958, + "num_tokens": 163467447.0, + "step": 4281 + }, + { + "epoch": 0.5447144129245644, + "ewc_loss": 0.029641231521964073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4820615433563944e-05, + "grad_norm": 23.2985782623291, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8521718978881836, + "num_tokens": 163506633.0, + "step": 4282 + }, + { + "epoch": 0.5448416232031548, + "ewc_loss": 0.02966185100376606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4830925465503242e-05, + "grad_norm": 23.472360610961914, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8397100567817688, + "num_tokens": 163542773.0, + "step": 4283 + }, + { + "epoch": 0.5449688334817453, + "ewc_loss": 0.02967560477554798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4837802154943347e-05, + "grad_norm": 23.251144409179688, + "learning_rate": 1e-06, + "loss": 0.5681, + "mean_token_accuracy": 0.8222194314002991, + "num_tokens": 163585247.0, + "step": 4284 + }, + { + "epoch": 0.5450960437603358, + "ewc_loss": 0.029629075899720192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4814538189966697e-05, + "grad_norm": 23.424358367919922, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8433663845062256, + "num_tokens": 163622802.0, + "step": 4285 + }, + { + "epoch": 0.5452232540389264, + "ewc_loss": 0.029736684635281563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.486834207753418e-05, + "grad_norm": 23.411170959472656, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8800023794174194, + "num_tokens": 163655980.0, + "step": 4286 + }, + { + "epoch": 0.5453504643175169, + "ewc_loss": 0.029660847038030624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4830423424427863e-05, + "grad_norm": 23.32372283935547, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8618066310882568, + "num_tokens": 163691494.0, + "step": 4287 + }, + { + "epoch": 0.5454776745961074, + "ewc_loss": 0.02971823140978813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4859115253784694e-05, + "grad_norm": 23.523807525634766, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.866295337677002, + "num_tokens": 163725169.0, + "step": 4288 + }, + { + "epoch": 0.5456048848746978, + "ewc_loss": 0.029703259468078613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4851630112389103e-05, + "grad_norm": 23.396957397460938, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8593536615371704, + "num_tokens": 163766050.0, + "step": 4289 + }, + { + "epoch": 0.5457320951532884, + "ewc_loss": 0.029590053483843803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4795026800129563e-05, + "grad_norm": 23.313833236694336, + "learning_rate": 1e-06, + "loss": 0.4896, + 
"mean_token_accuracy": 0.8494021892547607, + "num_tokens": 163799559.0, + "step": 4290 + }, + { + "epoch": 0.5458593054318789, + "ewc_loss": 0.02974771335721016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.487385634391103e-05, + "grad_norm": 23.5172061920166, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.850125789642334, + "num_tokens": 163837052.0, + "step": 4291 + }, + { + "epoch": 0.5459865157104694, + "ewc_loss": 0.02973332442343235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4866662240820006e-05, + "grad_norm": 23.472049713134766, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8523222804069519, + "num_tokens": 163872494.0, + "step": 4292 + }, + { + "epoch": 0.5461137259890599, + "ewc_loss": 0.02964787743985653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4823938727204222e-05, + "grad_norm": 23.403648376464844, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8479975461959839, + "num_tokens": 163913499.0, + "step": 4293 + }, + { + "epoch": 0.5462409362676505, + "ewc_loss": 0.029741669073700905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4870834093017038e-05, + "grad_norm": 23.599306106567383, + "learning_rate": 1e-06, + "loss": 0.5228, + "mean_token_accuracy": 0.8358594179153442, + "num_tokens": 163957965.0, + "step": 4294 + }, + { + "epoch": 0.5463681465462409, + "ewc_loss": 0.029683100059628487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4841550182609353e-05, + "grad_norm": 23.51706314086914, + "learning_rate": 1e-06, + "loss": 0.5948, + "mean_token_accuracy": 0.8142235279083252, + "num_tokens": 164005661.0, + "step": 4295 + }, + { + "epoch": 0.5464953568248314, + "ewc_loss": 0.029586590826511383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4793295122217387e-05, + "grad_norm": 23.29555320739746, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8719431757926941, + "num_tokens": 164044717.0, + "step": 4296 + }, + { + "epoch": 
0.5466225671034219, + "ewc_loss": 0.029639577493071556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4819788702880032e-05, + "grad_norm": 23.43846321105957, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.854968249797821, + "num_tokens": 164080022.0, + "step": 4297 + }, + { + "epoch": 0.5467497773820125, + "ewc_loss": 0.02967255376279354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4836276932328474e-05, + "grad_norm": 23.38007354736328, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8629361391067505, + "num_tokens": 164112749.0, + "step": 4298 + }, + { + "epoch": 0.546876987660603, + "ewc_loss": 0.02971554361283779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4857771930110175e-05, + "grad_norm": 23.46310043334961, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8500834703445435, + "num_tokens": 164152517.0, + "step": 4299 + }, + { + "epoch": 0.5470041979391935, + "ewc_loss": 0.02968774177134037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4843871213088278e-05, + "grad_norm": 23.435741424560547, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8618781566619873, + "num_tokens": 164195245.0, + "step": 4300 + }, + { + "epoch": 0.5471314082177839, + "ewc_loss": 0.02967500314116478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.483750111219706e-05, + "grad_norm": 23.3829345703125, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8496230840682983, + "num_tokens": 164230643.0, + "step": 4301 + }, + { + "epoch": 0.5472586184963745, + "ewc_loss": 0.029675228521227837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.483761388954008e-05, + "grad_norm": 23.408674240112305, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.852224588394165, + "num_tokens": 164271089.0, + "step": 4302 + }, + { + "epoch": 0.547385828774965, + "ewc_loss": 0.02972038835287094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4860193914500996e-05, + "grad_norm": 23.404977798461914, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8530969619750977, + "num_tokens": 164310618.0, + "step": 4303 + }, + { + "epoch": 0.5475130390535555, + "ewc_loss": 0.02967435121536255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4837175513093825e-05, + "grad_norm": 23.308216094970703, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8416178226470947, + "num_tokens": 164348630.0, + "step": 4304 + }, + { + "epoch": 0.5476402493321461, + "ewc_loss": 0.029695438221096992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4847719285171479e-05, + "grad_norm": 23.374595642089844, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8617812395095825, + "num_tokens": 164383735.0, + "step": 4305 + }, + { + "epoch": 0.5477674596107366, + "ewc_loss": 0.029768163338303566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4884081792843062e-05, + "grad_norm": 23.4388370513916, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.856575071811676, + "num_tokens": 164426115.0, + "step": 4306 + }, + { + "epoch": 0.5478946698893271, + "ewc_loss": 0.029729511588811874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.486475593992509e-05, + "grad_norm": 23.438926696777344, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8476135730743408, + "num_tokens": 164462119.0, + "step": 4307 + }, + { + "epoch": 0.5480218801679175, + "ewc_loss": 0.029744843021035194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4872421161271632e-05, + "grad_norm": 23.397462844848633, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8531115651130676, + "num_tokens": 164503030.0, + "step": 4308 + }, + { + "epoch": 0.5481490904465081, + "ewc_loss": 0.0297459214925766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4872960491629783e-05, + "grad_norm": 23.535600662231445, + "learning_rate": 1e-06, + "loss": 0.4455, + 
"mean_token_accuracy": 0.8574389219284058, + "num_tokens": 164541091.0, + "step": 4309 + }, + { + "epoch": 0.5482763007250986, + "ewc_loss": 0.029704416170716286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4852207641524728e-05, + "grad_norm": 23.379425048828125, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.850100040435791, + "num_tokens": 164577678.0, + "step": 4310 + }, + { + "epoch": 0.5484035110036891, + "ewc_loss": 0.029746146872639656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4873073268972803e-05, + "grad_norm": 23.53071403503418, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8521965742111206, + "num_tokens": 164612779.0, + "step": 4311 + }, + { + "epoch": 0.5485307212822796, + "ewc_loss": 0.02971024252474308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4855121662549209e-05, + "grad_norm": 23.328277587890625, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.860299289226532, + "num_tokens": 164649849.0, + "step": 4312 + }, + { + "epoch": 0.5486579315608702, + "ewc_loss": 0.029672596603631973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4836298760201316e-05, + "grad_norm": 23.611907958984375, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8530874252319336, + "num_tokens": 164690538.0, + "step": 4313 + }, + { + "epoch": 0.5487851418394606, + "ewc_loss": 0.029758097603917122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4879048649163451e-05, + "grad_norm": 23.376123428344727, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8564386963844299, + "num_tokens": 164729338.0, + "step": 4314 + }, + { + "epoch": 0.5489123521180511, + "ewc_loss": 0.0296426173299551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4821308468526695e-05, + "grad_norm": 23.431869506835938, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8666139245033264, + "num_tokens": 164766648.0, + "step": 4315 + }, + { + "epoch": 
0.5490395623966416, + "ewc_loss": 0.029744887724518776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4872443898639176e-05, + "grad_norm": 23.45928192138672, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8544173240661621, + "num_tokens": 164804254.0, + "step": 4316 + }, + { + "epoch": 0.5491667726752322, + "ewc_loss": 0.029673632234334946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4836816262686625e-05, + "grad_norm": 23.515338897705078, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8686026930809021, + "num_tokens": 164839943.0, + "step": 4317 + }, + { + "epoch": 0.5492939829538227, + "ewc_loss": 0.029696717858314514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4848358659946825e-05, + "grad_norm": 23.33530616760254, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.846706748008728, + "num_tokens": 164887057.0, + "step": 4318 + }, + { + "epoch": 0.5494211932324132, + "ewc_loss": 0.029619380831718445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4809690583206248e-05, + "grad_norm": 23.434511184692383, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8548929691314697, + "num_tokens": 164928515.0, + "step": 4319 + }, + { + "epoch": 0.5495484035110036, + "ewc_loss": 0.029719799757003784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4859900147712324e-05, + "grad_norm": 23.424715042114258, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8550020456314087, + "num_tokens": 164963818.0, + "step": 4320 + }, + { + "epoch": 0.5496756137895942, + "ewc_loss": 0.02968623675405979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.484311815147521e-05, + "grad_norm": 23.376039505004883, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8694897890090942, + "num_tokens": 165001091.0, + "step": 4321 + }, + { + "epoch": 0.5498028240681847, + "ewc_loss": 0.02973601035773754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4868005564494524e-05, + "grad_norm": 23.497800827026367, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8374759554862976, + "num_tokens": 165046009.0, + "step": 4322 + }, + { + "epoch": 0.5499300343467752, + "ewc_loss": 0.029677271842956543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4838636161584873e-05, + "grad_norm": 23.3455867767334, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.851664662361145, + "num_tokens": 165090729.0, + "step": 4323 + }, + { + "epoch": 0.5500572446253658, + "ewc_loss": 0.0296749509871006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.483747564634541e-05, + "grad_norm": 23.448713302612305, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8711851835250854, + "num_tokens": 165129771.0, + "step": 4324 + }, + { + "epoch": 0.5501844549039563, + "ewc_loss": 0.029744507744908333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4872253814246505e-05, + "grad_norm": 23.452495574951172, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8713945746421814, + "num_tokens": 165164310.0, + "step": 4325 + }, + { + "epoch": 0.5503116651825467, + "ewc_loss": 0.029658924788236618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.482946208852809e-05, + "grad_norm": 23.324716567993164, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8526830077171326, + "num_tokens": 165202842.0, + "step": 4326 + }, + { + "epoch": 0.5504388754611372, + "ewc_loss": 0.029700886458158493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4850443221803289e-05, + "grad_norm": 23.49844741821289, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8588396906852722, + "num_tokens": 165235998.0, + "step": 4327 + }, + { + "epoch": 0.5505660857397278, + "ewc_loss": 0.0297356266528368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.486781366111245e-05, + "grad_norm": 23.37760353088379, + "learning_rate": 1e-06, + "loss": 0.4476, + 
"mean_token_accuracy": 0.856176495552063, + "num_tokens": 165271779.0, + "step": 4328 + }, + { + "epoch": 0.5506932960183183, + "ewc_loss": 0.029676154255867004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4838076822343282e-05, + "grad_norm": 23.43691062927246, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8500498533248901, + "num_tokens": 165306136.0, + "step": 4329 + }, + { + "epoch": 0.5508205062969088, + "ewc_loss": 0.029751816764473915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4875908163958229e-05, + "grad_norm": 23.374332427978516, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.869659423828125, + "num_tokens": 165342401.0, + "step": 4330 + }, + { + "epoch": 0.5509477165754993, + "ewc_loss": 0.029682720080018044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4841360098216683e-05, + "grad_norm": 23.381752014160156, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8594980835914612, + "num_tokens": 165380120.0, + "step": 4331 + }, + { + "epoch": 0.5510749268540898, + "ewc_loss": 0.02974909543991089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4874547559884377e-05, + "grad_norm": 23.39139175415039, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8438788652420044, + "num_tokens": 165418210.0, + "step": 4332 + }, + { + "epoch": 0.5512021371326803, + "ewc_loss": 0.02970053069293499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4850264960841741e-05, + "grad_norm": 23.419721603393555, + "learning_rate": 1e-06, + "loss": 0.5346, + "mean_token_accuracy": 0.8352620601654053, + "num_tokens": 165460952.0, + "step": 4333 + }, + { + "epoch": 0.5513293474112708, + "ewc_loss": 0.029765255749225616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4882627510814928e-05, + "grad_norm": 23.467491149902344, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8682330250740051, + "num_tokens": 165500370.0, + "step": 4334 + }, + { + "epoch": 
0.5514565576898613, + "ewc_loss": 0.02973095513880253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4865477169223595e-05, + "grad_norm": 23.42803382873535, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8558979630470276, + "num_tokens": 165534089.0, + "step": 4335 + }, + { + "epoch": 0.5515837679684519, + "ewc_loss": 0.029733804985880852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4866902347421274e-05, + "grad_norm": 23.48720932006836, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8474869728088379, + "num_tokens": 165573111.0, + "step": 4336 + }, + { + "epoch": 0.5517109782470424, + "ewc_loss": 0.029748769477009773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.487438476033276e-05, + "grad_norm": 23.444780349731445, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8456388711929321, + "num_tokens": 165611942.0, + "step": 4337 + }, + { + "epoch": 0.5518381885256328, + "ewc_loss": 0.02977145090699196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4885725249769166e-05, + "grad_norm": 23.426206588745117, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8629735708236694, + "num_tokens": 165651703.0, + "step": 4338 + }, + { + "epoch": 0.5519653988042234, + "ewc_loss": 0.029770679771900177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4885339624015614e-05, + "grad_norm": 23.42037010192871, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8528375625610352, + "num_tokens": 165691257.0, + "step": 4339 + }, + { + "epoch": 0.5520926090828139, + "ewc_loss": 0.029737116768956184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4868558537273202e-05, + "grad_norm": 23.396146774291992, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8713735938072205, + "num_tokens": 165727947.0, + "step": 4340 + }, + { + "epoch": 0.5522198193614044, + "ewc_loss": 0.029759975150227547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.487998724769568e-05, + "grad_norm": 23.43971824645996, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.863926887512207, + "num_tokens": 165763400.0, + "step": 4341 + }, + { + "epoch": 0.5523470296399949, + "ewc_loss": 0.029836149886250496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4918075066816527e-05, + "grad_norm": 23.45014190673828, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8551199436187744, + "num_tokens": 165809586.0, + "step": 4342 + }, + { + "epoch": 0.5524742399185855, + "ewc_loss": 0.02974993921816349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4874969565426e-05, + "grad_norm": 23.50383186340332, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8570392727851868, + "num_tokens": 165839565.0, + "step": 4343 + }, + { + "epoch": 0.5526014501971759, + "ewc_loss": 0.029797084629535675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.489854184910655e-05, + "grad_norm": 23.45969009399414, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8587360382080078, + "num_tokens": 165873539.0, + "step": 4344 + }, + { + "epoch": 0.5527286604757664, + "ewc_loss": 0.029742732644081116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4871366147417575e-05, + "grad_norm": 23.383464813232422, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8640625476837158, + "num_tokens": 165920987.0, + "step": 4345 + }, + { + "epoch": 0.5528558707543569, + "ewc_loss": 0.029842732474207878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4921366528142244e-05, + "grad_norm": 23.565898895263672, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.845238208770752, + "num_tokens": 165965661.0, + "step": 4346 + }, + { + "epoch": 0.5529830810329475, + "ewc_loss": 0.029768405482172966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4884202755638398e-05, + "grad_norm": 23.42851448059082, + "learning_rate": 1e-06, + "loss": 0.4737, + 
"mean_token_accuracy": 0.8494981527328491, + "num_tokens": 165998239.0, + "step": 4347 + }, + { + "epoch": 0.553110291311538, + "ewc_loss": 0.02977421134710312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4887105862726457e-05, + "grad_norm": 23.52557373046875, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8540133237838745, + "num_tokens": 166036489.0, + "step": 4348 + }, + { + "epoch": 0.5532375015901285, + "ewc_loss": 0.029749559238553047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4874779481033329e-05, + "grad_norm": 23.4383544921875, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8523143529891968, + "num_tokens": 166078774.0, + "step": 4349 + }, + { + "epoch": 0.5533647118687189, + "ewc_loss": 0.029735906049609184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4867952813801821e-05, + "grad_norm": 23.482532501220703, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8632570505142212, + "num_tokens": 166119555.0, + "step": 4350 + }, + { + "epoch": 0.5534919221473095, + "ewc_loss": 0.029777823016047478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4888911209709477e-05, + "grad_norm": 23.4251651763916, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8551570177078247, + "num_tokens": 166159788.0, + "step": 4351 + }, + { + "epoch": 0.5536191324259, + "ewc_loss": 0.029741816222667694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4870907762087882e-05, + "grad_norm": 23.371885299682617, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8504242897033691, + "num_tokens": 166200230.0, + "step": 4352 + }, + { + "epoch": 0.5537463427044905, + "ewc_loss": 0.029776129871606827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4888065379636828e-05, + "grad_norm": 23.408283233642578, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8586571216583252, + "num_tokens": 166237864.0, + "step": 4353 + }, + { + "epoch": 
0.553873552983081, + "ewc_loss": 0.029803870245814323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4901935173838865e-05, + "grad_norm": 23.53792953491211, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8584339618682861, + "num_tokens": 166278347.0, + "step": 4354 + }, + { + "epoch": 0.5540007632616716, + "ewc_loss": 0.02980865351855755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4904327144904528e-05, + "grad_norm": 23.417518615722656, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8479009866714478, + "num_tokens": 166315744.0, + "step": 4355 + }, + { + "epoch": 0.554127973540262, + "ewc_loss": 0.029763394966721535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4881697097735014e-05, + "grad_norm": 23.514205932617188, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8561387658119202, + "num_tokens": 166352705.0, + "step": 4356 + }, + { + "epoch": 0.5542551838188525, + "ewc_loss": 0.029835177585482597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4917588487151079e-05, + "grad_norm": 23.38006019592285, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8592808246612549, + "num_tokens": 166393956.0, + "step": 4357 + }, + { + "epoch": 0.554382394097443, + "ewc_loss": 0.029767638072371483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.488381894887425e-05, + "grad_norm": 23.469642639160156, + "learning_rate": 1e-06, + "loss": 0.5441, + "mean_token_accuracy": 0.8334044814109802, + "num_tokens": 166430172.0, + "step": 4358 + }, + { + "epoch": 0.5545096043760336, + "ewc_loss": 0.029876049607992172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4938024833099917e-05, + "grad_norm": 23.445634841918945, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8550222516059875, + "num_tokens": 166468732.0, + "step": 4359 + }, + { + "epoch": 0.5546368146546241, + "ewc_loss": 0.029728706926107407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4864353033772204e-05, + "grad_norm": 23.39897918701172, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8519657254219055, + "num_tokens": 166511383.0, + "step": 4360 + }, + { + "epoch": 0.5547640249332146, + "ewc_loss": 0.029809245839715004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4904622730682604e-05, + "grad_norm": 23.409997940063477, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8597314953804016, + "num_tokens": 166547674.0, + "step": 4361 + }, + { + "epoch": 0.5548912352118052, + "ewc_loss": 0.0297794621437788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4889730664435774e-05, + "grad_norm": 23.47693634033203, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8492520451545715, + "num_tokens": 166581763.0, + "step": 4362 + }, + { + "epoch": 0.5550184454903956, + "ewc_loss": 0.02981671877205372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4908359844412189e-05, + "grad_norm": 23.442626953125, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.852003812789917, + "num_tokens": 166618604.0, + "step": 4363 + }, + { + "epoch": 0.5551456557689861, + "ewc_loss": 0.02982606180012226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4913031009200495e-05, + "grad_norm": 23.497766494750977, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.864403247833252, + "num_tokens": 166654812.0, + "step": 4364 + }, + { + "epoch": 0.5552728660475766, + "ewc_loss": 0.029822617769241333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4911309335730039e-05, + "grad_norm": 23.448318481445312, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8433532118797302, + "num_tokens": 166693206.0, + "step": 4365 + }, + { + "epoch": 0.5554000763261672, + "ewc_loss": 0.029797272756695747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4898636436555535e-05, + "grad_norm": 23.464839935302734, + "learning_rate": 1e-06, + "loss": 0.4749, + 
"mean_token_accuracy": 0.854543149471283, + "num_tokens": 166734531.0, + "step": 4366 + }, + { + "epoch": 0.5555272866047577, + "ewc_loss": 0.029767123982310295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4883561561873648e-05, + "grad_norm": 23.37779998779297, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8543146848678589, + "num_tokens": 166771362.0, + "step": 4367 + }, + { + "epoch": 0.5556544968833482, + "ewc_loss": 0.029822446405887604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.491122293373337e-05, + "grad_norm": 23.402048110961914, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8466265797615051, + "num_tokens": 166802784.0, + "step": 4368 + }, + { + "epoch": 0.5557817071619386, + "ewc_loss": 0.02984100580215454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.492050250817556e-05, + "grad_norm": 23.491085052490234, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8513380885124207, + "num_tokens": 166835590.0, + "step": 4369 + }, + { + "epoch": 0.5559089174405292, + "ewc_loss": 0.02989373542368412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4946867850085255e-05, + "grad_norm": 23.486047744750977, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8513497710227966, + "num_tokens": 166872145.0, + "step": 4370 + }, + { + "epoch": 0.5560361277191197, + "ewc_loss": 0.02986428327858448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4932141311874148e-05, + "grad_norm": 23.38674545288086, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8523668646812439, + "num_tokens": 166912855.0, + "step": 4371 + }, + { + "epoch": 0.5561633379977102, + "ewc_loss": 0.029860224574804306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4930112229194492e-05, + "grad_norm": 23.390764236450195, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8403427600860596, + "num_tokens": 166949482.0, + "step": 4372 + }, + { + "epoch": 
0.5562905482763008, + "ewc_loss": 0.029849067330360413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4924533388693817e-05, + "grad_norm": 23.536376953125, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8550137281417847, + "num_tokens": 166986727.0, + "step": 4373 + }, + { + "epoch": 0.5564177585548913, + "ewc_loss": 0.029921434819698334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4960717635403853e-05, + "grad_norm": 23.3762264251709, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8481056094169617, + "num_tokens": 167025869.0, + "step": 4374 + }, + { + "epoch": 0.5565449688334817, + "ewc_loss": 0.029845498502254486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4922748960088938e-05, + "grad_norm": 23.506818771362305, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8660004138946533, + "num_tokens": 167059825.0, + "step": 4375 + }, + { + "epoch": 0.5566721791120722, + "ewc_loss": 0.02994215488433838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4971077689551748e-05, + "grad_norm": 23.403396606445312, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8597372174263, + "num_tokens": 167097375.0, + "step": 4376 + }, + { + "epoch": 0.5567993893906628, + "ewc_loss": 0.02987498603761196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.493749277869938e-05, + "grad_norm": 23.510435104370117, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8458719253540039, + "num_tokens": 167136555.0, + "step": 4377 + }, + { + "epoch": 0.5569265996692533, + "ewc_loss": 0.029974287375807762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4987143913458567e-05, + "grad_norm": 23.448665618896484, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8528178930282593, + "num_tokens": 167179870.0, + "step": 4378 + }, + { + "epoch": 0.5570538099478438, + "ewc_loss": 0.029892228543758392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4946113878977485e-05, + "grad_norm": 23.41758155822754, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8531302809715271, + "num_tokens": 167215242.0, + "step": 4379 + }, + { + "epoch": 0.5571810202264343, + "ewc_loss": 0.029926951974630356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.496347613283433e-05, + "grad_norm": 23.503313064575195, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8633553981781006, + "num_tokens": 167250690.0, + "step": 4380 + }, + { + "epoch": 0.5573082305050248, + "ewc_loss": 0.02990919165313244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.495459582656622e-05, + "grad_norm": 23.401966094970703, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8414745330810547, + "num_tokens": 167283677.0, + "step": 4381 + }, + { + "epoch": 0.5574354407836153, + "ewc_loss": 0.029903320595622063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4951659977668896e-05, + "grad_norm": 23.4803466796875, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8556762933731079, + "num_tokens": 167321355.0, + "step": 4382 + }, + { + "epoch": 0.5575626510622058, + "ewc_loss": 0.02998446114361286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4992230717325583e-05, + "grad_norm": 23.48420524597168, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8473207354545593, + "num_tokens": 167363103.0, + "step": 4383 + }, + { + "epoch": 0.5576898613407963, + "ewc_loss": 0.029917649924755096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4958824976929463e-05, + "grad_norm": 23.40582275390625, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8693350553512573, + "num_tokens": 167399732.0, + "step": 4384 + }, + { + "epoch": 0.5578170716193869, + "ewc_loss": 0.029951991513371468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4975995327404235e-05, + "grad_norm": 23.52600860595703, + "learning_rate": 1e-06, + "loss": 0.4661, + 
"mean_token_accuracy": 0.853434681892395, + "num_tokens": 167434938.0, + "step": 4385 + }, + { + "epoch": 0.5579442818979774, + "ewc_loss": 0.029947688803076744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4973844372434542e-05, + "grad_norm": 23.41936492919922, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8587418794631958, + "num_tokens": 167472293.0, + "step": 4386 + }, + { + "epoch": 0.5580714921765678, + "ewc_loss": 0.029985526576638222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4992763681220822e-05, + "grad_norm": 23.489356994628906, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8598807454109192, + "num_tokens": 167511015.0, + "step": 4387 + }, + { + "epoch": 0.5581987024551583, + "ewc_loss": 0.030002087354660034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5001043720985763e-05, + "grad_norm": 23.455751419067383, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8552414178848267, + "num_tokens": 167549777.0, + "step": 4388 + }, + { + "epoch": 0.5583259127337489, + "ewc_loss": 0.029954854398965836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4977426872064825e-05, + "grad_norm": 23.522340774536133, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8605302572250366, + "num_tokens": 167592342.0, + "step": 4389 + }, + { + "epoch": 0.5584531230123394, + "ewc_loss": 0.029967011883854866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4983505934651475e-05, + "grad_norm": 23.519960403442383, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8591042160987854, + "num_tokens": 167630829.0, + "step": 4390 + }, + { + "epoch": 0.5585803332909299, + "ewc_loss": 0.029863320291042328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.493166018917691e-05, + "grad_norm": 23.507225036621094, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8431063294410706, + "num_tokens": 167675938.0, + "step": 4391 + }, + { + "epoch": 
0.5587075435695205, + "ewc_loss": 0.029905645176768303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4952822311897762e-05, + "grad_norm": 23.41192054748535, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8586559295654297, + "num_tokens": 167716550.0, + "step": 4392 + }, + { + "epoch": 0.5588347538481109, + "ewc_loss": 0.029933346435427666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4966673006711062e-05, + "grad_norm": 23.483612060546875, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8419221639633179, + "num_tokens": 167754401.0, + "step": 4393 + }, + { + "epoch": 0.5589619641267014, + "ewc_loss": 0.029917558655142784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4958779502194375e-05, + "grad_norm": 23.49745750427246, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8590617179870605, + "num_tokens": 167792124.0, + "step": 4394 + }, + { + "epoch": 0.5590891744052919, + "ewc_loss": 0.029907191172242165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4953595382394269e-05, + "grad_norm": 23.51540756225586, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.873604953289032, + "num_tokens": 167829226.0, + "step": 4395 + }, + { + "epoch": 0.5592163846838825, + "ewc_loss": 0.029912181198596954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4956091035855934e-05, + "grad_norm": 23.465726852416992, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8651270866394043, + "num_tokens": 167863439.0, + "step": 4396 + }, + { + "epoch": 0.559343594962473, + "ewc_loss": 0.029850155115127563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4925077266525477e-05, + "grad_norm": 23.49323272705078, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.85593181848526, + "num_tokens": 167907019.0, + "step": 4397 + }, + { + "epoch": 0.5594708052410635, + "ewc_loss": 0.029918424785137177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4959212421672419e-05, + "grad_norm": 23.51238250732422, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8475115299224854, + "num_tokens": 167942539.0, + "step": 4398 + }, + { + "epoch": 0.5595980155196539, + "ewc_loss": 0.029827123507857323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.491356215410633e-05, + "grad_norm": 23.440223693847656, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8545679450035095, + "num_tokens": 167982848.0, + "step": 4399 + }, + { + "epoch": 0.5597252257982445, + "ewc_loss": 0.0298622976988554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4931149053154513e-05, + "grad_norm": 23.428260803222656, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8635751008987427, + "num_tokens": 168017500.0, + "step": 4400 + }, + { + "epoch": 0.559852436076835, + "ewc_loss": 0.029871048405766487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4935524632164743e-05, + "grad_norm": 23.47095489501953, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8399004340171814, + "num_tokens": 168052790.0, + "step": 4401 + }, + { + "epoch": 0.5599796463554255, + "ewc_loss": 0.029940439388155937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4970220036047976e-05, + "grad_norm": 23.61223602294922, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8628268241882324, + "num_tokens": 168082541.0, + "step": 4402 + }, + { + "epoch": 0.560106856634016, + "ewc_loss": 0.029899872839450836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4949936485209037e-05, + "grad_norm": 23.577320098876953, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8618165850639343, + "num_tokens": 168128218.0, + "step": 4403 + }, + { + "epoch": 0.5602340669126066, + "ewc_loss": 0.029806913807988167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4903456758474931e-05, + "grad_norm": 23.39366912841797, + "learning_rate": 1e-06, + "loss": 0.45, + 
"mean_token_accuracy": 0.8575838804244995, + "num_tokens": 168165496.0, + "step": 4404 + }, + { + "epoch": 0.560361277191197, + "ewc_loss": 0.029842931777238846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4921465663064737e-05, + "grad_norm": 23.452362060546875, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8437032103538513, + "num_tokens": 168204675.0, + "step": 4405 + }, + { + "epoch": 0.5604884874697875, + "ewc_loss": 0.02991069108247757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.495534525020048e-05, + "grad_norm": 23.46145248413086, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8514940738677979, + "num_tokens": 168244832.0, + "step": 4406 + }, + { + "epoch": 0.560615697748378, + "ewc_loss": 0.029956258833408356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4978129001974594e-05, + "grad_norm": 23.696805953979492, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8507306575775146, + "num_tokens": 168282440.0, + "step": 4407 + }, + { + "epoch": 0.5607429080269686, + "ewc_loss": 0.029923973605036736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4961986380512826e-05, + "grad_norm": 23.659603118896484, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8611198663711548, + "num_tokens": 168325303.0, + "step": 4408 + }, + { + "epoch": 0.5608701183055591, + "ewc_loss": 0.02990739420056343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4953697245800868e-05, + "grad_norm": 23.686283111572266, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8605512380599976, + "num_tokens": 168360628.0, + "step": 4409 + }, + { + "epoch": 0.5609973285841496, + "ewc_loss": 0.02980949357151985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4904746421962045e-05, + "grad_norm": 23.4995174407959, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8432489633560181, + "num_tokens": 168398601.0, + "step": 4410 + }, + { + "epoch": 
0.5611245388627402, + "ewc_loss": 0.02993832528591156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4969162293709815e-05, + "grad_norm": 23.8693904876709, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8416422605514526, + "num_tokens": 168436893.0, + "step": 4411 + }, + { + "epoch": 0.5612517491413306, + "ewc_loss": 0.029899951070547104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4949975593481213e-05, + "grad_norm": 23.482650756835938, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8515053987503052, + "num_tokens": 168478794.0, + "step": 4412 + }, + { + "epoch": 0.5613789594199211, + "ewc_loss": 0.029748111963272095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.487405643274542e-05, + "grad_norm": 23.437528610229492, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8530421853065491, + "num_tokens": 168520184.0, + "step": 4413 + }, + { + "epoch": 0.5615061696985116, + "ewc_loss": 0.029904969036579132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4952484889363404e-05, + "grad_norm": 23.64269256591797, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8526849150657654, + "num_tokens": 168554305.0, + "step": 4414 + }, + { + "epoch": 0.5616333799771022, + "ewc_loss": 0.029867440462112427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4933720194676425e-05, + "grad_norm": 23.410722732543945, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8389760255813599, + "num_tokens": 168597752.0, + "step": 4415 + }, + { + "epoch": 0.5617605902556927, + "ewc_loss": 0.02987261489033699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.493630770710297e-05, + "grad_norm": 23.59686851501465, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8616431951522827, + "num_tokens": 168637764.0, + "step": 4416 + }, + { + "epoch": 0.5618878005342832, + "ewc_loss": 0.029903173446655273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.4951586308598053e-05, + "grad_norm": 23.43622589111328, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8352213501930237, + "num_tokens": 168672260.0, + "step": 4417 + }, + { + "epoch": 0.5620150108128736, + "ewc_loss": 0.029895184561610222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4947592717362568e-05, + "grad_norm": 23.63922691345215, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8642110228538513, + "num_tokens": 168711478.0, + "step": 4418 + }, + { + "epoch": 0.5621422210914642, + "ewc_loss": 0.029927141964435577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4963570720283315e-05, + "grad_norm": 23.369529724121094, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8576284646987915, + "num_tokens": 168751773.0, + "step": 4419 + }, + { + "epoch": 0.5622694313700547, + "ewc_loss": 0.029877053573727608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4938526874175295e-05, + "grad_norm": 23.51167106628418, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8666521310806274, + "num_tokens": 168786364.0, + "step": 4420 + }, + { + "epoch": 0.5623966416486452, + "ewc_loss": 0.029994245618581772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.499712288932642e-05, + "grad_norm": 23.510072708129883, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8534871339797974, + "num_tokens": 168826819.0, + "step": 4421 + }, + { + "epoch": 0.5625238519272358, + "ewc_loss": 0.029911886900663376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4955943697714247e-05, + "grad_norm": 23.551557540893555, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8535735607147217, + "num_tokens": 168869405.0, + "step": 4422 + }, + { + "epoch": 0.5626510622058263, + "ewc_loss": 0.029976584017276764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4988291695772205e-05, + "grad_norm": 23.595733642578125, + "learning_rate": 1e-06, + "loss": 0.4533, + 
"mean_token_accuracy": 0.8613752126693726, + "num_tokens": 168904323.0, + "step": 4423 + }, + { + "epoch": 0.5627782724844167, + "ewc_loss": 0.029936175793409348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.496808818046702e-05, + "grad_norm": 23.652347564697266, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.846196174621582, + "num_tokens": 168948654.0, + "step": 4424 + }, + { + "epoch": 0.5629054827630072, + "ewc_loss": 0.029887057840824127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4943529095035046e-05, + "grad_norm": 23.484642028808594, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8622636795043945, + "num_tokens": 168984333.0, + "step": 4425 + }, + { + "epoch": 0.5630326930415978, + "ewc_loss": 0.02984655648469925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.492327828600537e-05, + "grad_norm": 23.597274780273438, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8551467061042786, + "num_tokens": 169019843.0, + "step": 4426 + }, + { + "epoch": 0.5631599033201883, + "ewc_loss": 0.0299367792904377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4968389223213308e-05, + "grad_norm": 23.568004608154297, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8658586144447327, + "num_tokens": 169058177.0, + "step": 4427 + }, + { + "epoch": 0.5632871135987788, + "ewc_loss": 0.029861977323889732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4930988982087001e-05, + "grad_norm": 23.521377563476562, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8615003228187561, + "num_tokens": 169102268.0, + "step": 4428 + }, + { + "epoch": 0.5634143238773693, + "ewc_loss": 0.029928140342235565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4964070032874588e-05, + "grad_norm": 23.543333053588867, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8499417901039124, + "num_tokens": 169141709.0, + "step": 4429 + }, + { + "epoch": 
0.5635415341559598, + "ewc_loss": 0.02988406829535961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4942033885745332e-05, + "grad_norm": 23.58364486694336, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.851344883441925, + "num_tokens": 169177107.0, + "step": 4430 + }, + { + "epoch": 0.5636687444345503, + "ewc_loss": 0.029904820024967194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4952410310797859e-05, + "grad_norm": 23.532764434814453, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8519167900085449, + "num_tokens": 169211531.0, + "step": 4431 + }, + { + "epoch": 0.5637959547131408, + "ewc_loss": 0.029896751046180725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4948375792300794e-05, + "grad_norm": 23.49867057800293, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8477778434753418, + "num_tokens": 169247507.0, + "step": 4432 + }, + { + "epoch": 0.5639231649917313, + "ewc_loss": 0.02992088347673416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4960442058509216e-05, + "grad_norm": 23.50749397277832, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8699729442596436, + "num_tokens": 169281007.0, + "step": 4433 + }, + { + "epoch": 0.5640503752703219, + "ewc_loss": 0.02990417554974556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.495208744017873e-05, + "grad_norm": 23.466604232788086, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8563215732574463, + "num_tokens": 169322295.0, + "step": 4434 + }, + { + "epoch": 0.5641775855489124, + "ewc_loss": 0.0299522802233696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4976139937061816e-05, + "grad_norm": 23.6182804107666, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8510790467262268, + "num_tokens": 169358184.0, + "step": 4435 + }, + { + "epoch": 0.5643047958275028, + "ewc_loss": 0.02998683974146843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.49934194269008e-05, + "grad_norm": 23.569549560546875, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8604626655578613, + "num_tokens": 169391518.0, + "step": 4436 + }, + { + "epoch": 0.5644320061060933, + "ewc_loss": 0.029908331111073494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.495416563557228e-05, + "grad_norm": 23.518409729003906, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8554569482803345, + "num_tokens": 169426743.0, + "step": 4437 + }, + { + "epoch": 0.5645592163846839, + "ewc_loss": 0.02996388077735901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4981940694269724e-05, + "grad_norm": 23.55461883544922, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8741406202316284, + "num_tokens": 169466452.0, + "step": 4438 + }, + { + "epoch": 0.5646864266632744, + "ewc_loss": 0.02995695173740387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.497847551945597e-05, + "grad_norm": 23.575456619262695, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8610806465148926, + "num_tokens": 169500295.0, + "step": 4439 + }, + { + "epoch": 0.5648136369418649, + "ewc_loss": 0.030000025406479836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5000012353993952e-05, + "grad_norm": 23.50783920288086, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8548070192337036, + "num_tokens": 169541829.0, + "step": 4440 + }, + { + "epoch": 0.5649408472204555, + "ewc_loss": 0.02992578037083149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4962890418246388e-05, + "grad_norm": 23.528615951538086, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8399573564529419, + "num_tokens": 169581547.0, + "step": 4441 + }, + { + "epoch": 0.5650680574990459, + "ewc_loss": 0.02998742274940014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.499371137470007e-05, + "grad_norm": 23.47555923461914, + "learning_rate": 1e-06, + "loss": 0.443, + 
"mean_token_accuracy": 0.8604626655578613, + "num_tokens": 169622107.0, + "step": 4442 + }, + { + "epoch": 0.5651952677776364, + "ewc_loss": 0.02998041920363903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4990209820098244e-05, + "grad_norm": 23.649703979492188, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8523640036582947, + "num_tokens": 169657258.0, + "step": 4443 + }, + { + "epoch": 0.5653224780562269, + "ewc_loss": 0.03004234842956066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5021174476714805e-05, + "grad_norm": 23.561912536621094, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.844555139541626, + "num_tokens": 169691568.0, + "step": 4444 + }, + { + "epoch": 0.5654496883348175, + "ewc_loss": 0.029994037002325058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4997018297435716e-05, + "grad_norm": 23.584089279174805, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8629038333892822, + "num_tokens": 169732429.0, + "step": 4445 + }, + { + "epoch": 0.565576898613408, + "ewc_loss": 0.030028369277715683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.501418500993168e-05, + "grad_norm": 23.536008834838867, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8486206531524658, + "num_tokens": 169769224.0, + "step": 4446 + }, + { + "epoch": 0.5657041088919985, + "ewc_loss": 0.02998083271086216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4990416275395546e-05, + "grad_norm": 23.624160766601562, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8559633493423462, + "num_tokens": 169812830.0, + "step": 4447 + }, + { + "epoch": 0.5658313191705889, + "ewc_loss": 0.030044980347156525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.502249051554827e-05, + "grad_norm": 23.54183006286621, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8519636988639832, + "num_tokens": 169843823.0, + "step": 4448 + }, + { + "epoch": 
0.5659585294491795, + "ewc_loss": 0.029966924339532852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.498346227890579e-05, + "grad_norm": 23.52646255493164, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8575537204742432, + "num_tokens": 169882847.0, + "step": 4449 + }, + { + "epoch": 0.56608573972777, + "ewc_loss": 0.030045196413993835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.502259783592308e-05, + "grad_norm": 23.516902923583984, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8371915817260742, + "num_tokens": 169921264.0, + "step": 4450 + }, + { + "epoch": 0.5662129500063605, + "ewc_loss": 0.029936203733086586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.4968101822887547e-05, + "grad_norm": 23.367568969726562, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8609589338302612, + "num_tokens": 169949681.0, + "step": 4451 + }, + { + "epoch": 0.566340160284951, + "ewc_loss": 0.03008180484175682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5040902326290961e-05, + "grad_norm": 23.728281021118164, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8704301118850708, + "num_tokens": 169986254.0, + "step": 4452 + }, + { + "epoch": 0.5664673705635416, + "ewc_loss": 0.03010465018451214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5052324670250528e-05, + "grad_norm": 23.460121154785156, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8512232303619385, + "num_tokens": 170022745.0, + "step": 4453 + }, + { + "epoch": 0.566594580842132, + "ewc_loss": 0.030006468296051025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5003233784227632e-05, + "grad_norm": 23.59661102294922, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8660074472427368, + "num_tokens": 170057038.0, + "step": 4454 + }, + { + "epoch": 0.5667217911207225, + "ewc_loss": 0.030134478583931923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5067239473864902e-05, + "grad_norm": 23.552377700805664, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8743350505828857, + "num_tokens": 170098460.0, + "step": 4455 + }, + { + "epoch": 0.566849001399313, + "ewc_loss": 0.030028752982616425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5014376913313754e-05, + "grad_norm": 23.475879669189453, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8432973623275757, + "num_tokens": 170137689.0, + "step": 4456 + }, + { + "epoch": 0.5669762116779036, + "ewc_loss": 0.030062252655625343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5031126167741604e-05, + "grad_norm": 23.514892578125, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8567219376564026, + "num_tokens": 170176716.0, + "step": 4457 + }, + { + "epoch": 0.5671034219564941, + "ewc_loss": 0.03008013404905796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5040067410154734e-05, + "grad_norm": 23.455432891845703, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.851860523223877, + "num_tokens": 170212621.0, + "step": 4458 + }, + { + "epoch": 0.5672306322350846, + "ewc_loss": 0.03008355386555195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5041777260194067e-05, + "grad_norm": 23.513717651367188, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.845893383026123, + "num_tokens": 170254493.0, + "step": 4459 + }, + { + "epoch": 0.5673578425136752, + "ewc_loss": 0.030102796852588654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5051398804644123e-05, + "grad_norm": 23.466285705566406, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8761143684387207, + "num_tokens": 170290559.0, + "step": 4460 + }, + { + "epoch": 0.5674850527922656, + "ewc_loss": 0.030095385387539864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5047692613734398e-05, + "grad_norm": 23.521486282348633, + "learning_rate": 1e-06, + "loss": 0.4711, + 
"mean_token_accuracy": 0.8498680591583252, + "num_tokens": 170335079.0, + "step": 4461 + }, + { + "epoch": 0.5676122630708561, + "ewc_loss": 0.030192943289875984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5096471543074585e-05, + "grad_norm": 23.606101989746094, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.858747124671936, + "num_tokens": 170370665.0, + "step": 4462 + }, + { + "epoch": 0.5677394733494466, + "ewc_loss": 0.030076298862695694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5038149285828695e-05, + "grad_norm": 23.532794952392578, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8544539213180542, + "num_tokens": 170412585.0, + "step": 4463 + }, + { + "epoch": 0.5678666836280372, + "ewc_loss": 0.03007468394935131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.503734165453352e-05, + "grad_norm": 23.516159057617188, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8612789511680603, + "num_tokens": 170450806.0, + "step": 4464 + }, + { + "epoch": 0.5679938939066277, + "ewc_loss": 0.030050771310925484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5025385437184013e-05, + "grad_norm": 23.485095977783203, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8554393649101257, + "num_tokens": 170483265.0, + "step": 4465 + }, + { + "epoch": 0.5681211041852182, + "ewc_loss": 0.030131284147500992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5065642401168589e-05, + "grad_norm": 23.55502700805664, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8719877004623413, + "num_tokens": 170520910.0, + "step": 4466 + }, + { + "epoch": 0.5682483144638086, + "ewc_loss": 0.030116144567728043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5058072676765732e-05, + "grad_norm": 23.558364868164062, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8379812240600586, + "num_tokens": 170556516.0, + "step": 4467 + }, + { + "epoch": 
0.5683755247423992, + "ewc_loss": 0.030070818960666656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5035409887786955e-05, + "grad_norm": 23.496824264526367, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8542171716690063, + "num_tokens": 170595571.0, + "step": 4468 + }, + { + "epoch": 0.5685027350209897, + "ewc_loss": 0.030068479478359222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5034239368105773e-05, + "grad_norm": 23.56744956970215, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8750503063201904, + "num_tokens": 170634117.0, + "step": 4469 + }, + { + "epoch": 0.5686299452995802, + "ewc_loss": 0.03009386546909809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5046932276163716e-05, + "grad_norm": 23.575714111328125, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8514573574066162, + "num_tokens": 170677624.0, + "step": 4470 + }, + { + "epoch": 0.5687571555781707, + "ewc_loss": 0.030107485130429268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5053742572490592e-05, + "grad_norm": 23.49921417236328, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.8414292931556702, + "num_tokens": 170716732.0, + "step": 4471 + }, + { + "epoch": 0.5688843658567613, + "ewc_loss": 0.03010307438671589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5051537047838792e-05, + "grad_norm": 23.62031364440918, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8488079309463501, + "num_tokens": 170758521.0, + "step": 4472 + }, + { + "epoch": 0.5690115761353517, + "ewc_loss": 0.03010161779820919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5050808542582672e-05, + "grad_norm": 23.45302391052246, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8550848960876465, + "num_tokens": 170796799.0, + "step": 4473 + }, + { + "epoch": 0.5691387864139422, + "ewc_loss": 0.030158961191773415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5079480363056064e-05, + "grad_norm": 23.77410316467285, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8419642448425293, + "num_tokens": 170838569.0, + "step": 4474 + }, + { + "epoch": 0.5692659966925327, + "ewc_loss": 0.03010900318622589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5054502000566572e-05, + "grad_norm": 23.45027732849121, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8578023314476013, + "num_tokens": 170878206.0, + "step": 4475 + }, + { + "epoch": 0.5693932069711233, + "ewc_loss": 0.03000812418758869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5004062333900947e-05, + "grad_norm": 23.68444061279297, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8414539098739624, + "num_tokens": 170915506.0, + "step": 4476 + }, + { + "epoch": 0.5695204172497138, + "ewc_loss": 0.030125970020890236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5062984857650008e-05, + "grad_norm": 23.617910385131836, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8514323234558105, + "num_tokens": 170952303.0, + "step": 4477 + }, + { + "epoch": 0.5696476275283043, + "ewc_loss": 0.030023571103811264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5011785762908403e-05, + "grad_norm": 23.442886352539062, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8486578464508057, + "num_tokens": 170993757.0, + "step": 4478 + }, + { + "epoch": 0.5697748378068948, + "ewc_loss": 0.030085697770118713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5042848644952755e-05, + "grad_norm": 23.692115783691406, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8528615236282349, + "num_tokens": 171030240.0, + "step": 4479 + }, + { + "epoch": 0.5699020480854853, + "ewc_loss": 0.030127739533782005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5063869795994833e-05, + "grad_norm": 23.655059814453125, + "learning_rate": 1e-06, + "loss": 0.529, + 
"mean_token_accuracy": 0.8313668966293335, + "num_tokens": 171069654.0, + "step": 4480 + }, + { + "epoch": 0.5700292583640758, + "ewc_loss": 0.030039386823773384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5019693819340318e-05, + "grad_norm": 23.670101165771484, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.839584231376648, + "num_tokens": 171107341.0, + "step": 4481 + }, + { + "epoch": 0.5701564686426663, + "ewc_loss": 0.030095187947154045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5047594388306607e-05, + "grad_norm": 23.70325469970703, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8454264998435974, + "num_tokens": 171144063.0, + "step": 4482 + }, + { + "epoch": 0.5702836789212569, + "ewc_loss": 0.030051400884985924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5025700122350827e-05, + "grad_norm": 23.650644302368164, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8479322195053101, + "num_tokens": 171180448.0, + "step": 4483 + }, + { + "epoch": 0.5704108891998474, + "ewc_loss": 0.030047360807657242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.502368013461819e-05, + "grad_norm": 23.565534591674805, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8602502346038818, + "num_tokens": 171214220.0, + "step": 4484 + }, + { + "epoch": 0.5705380994784378, + "ewc_loss": 0.03003838285803795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.501919177826494e-05, + "grad_norm": 23.659427642822266, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8618247509002686, + "num_tokens": 171252798.0, + "step": 4485 + }, + { + "epoch": 0.5706653097570283, + "ewc_loss": 0.030090859159827232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5045429790916387e-05, + "grad_norm": 23.5651798248291, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8600696325302124, + "num_tokens": 171295434.0, + "step": 4486 + }, + { + "epoch": 
0.5707925200356189, + "ewc_loss": 0.030022023245692253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5011011782917194e-05, + "grad_norm": 23.46181869506836, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8582271933555603, + "num_tokens": 171334129.0, + "step": 4487 + }, + { + "epoch": 0.5709197303142094, + "ewc_loss": 0.030098222196102142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5049111425469164e-05, + "grad_norm": 23.61076545715332, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8513102531433105, + "num_tokens": 171370239.0, + "step": 4488 + }, + { + "epoch": 0.5710469405927999, + "ewc_loss": 0.030121879652142525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.506093940406572e-05, + "grad_norm": 23.568538665771484, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8686729073524475, + "num_tokens": 171402411.0, + "step": 4489 + }, + { + "epoch": 0.5711741508713905, + "ewc_loss": 0.030103225260972977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.505161253589904e-05, + "grad_norm": 23.494489669799805, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8654129505157471, + "num_tokens": 171438736.0, + "step": 4490 + }, + { + "epoch": 0.5713013611499809, + "ewc_loss": 0.030167270451784134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5083634934853762e-05, + "grad_norm": 23.601577758789062, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8619741797447205, + "num_tokens": 171477949.0, + "step": 4491 + }, + { + "epoch": 0.5714285714285714, + "ewc_loss": 0.030094563961029053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5047282431623898e-05, + "grad_norm": 23.52309226989746, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8500216007232666, + "num_tokens": 171520953.0, + "step": 4492 + }, + { + "epoch": 0.5715557817071619, + "ewc_loss": 0.03010253980755806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5051269656396471e-05, + "grad_norm": 23.573780059814453, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8609545826911926, + "num_tokens": 171559415.0, + "step": 4493 + }, + { + "epoch": 0.5716829919857525, + "ewc_loss": 0.030138622969388962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.506931130279554e-05, + "grad_norm": 23.563772201538086, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8637592792510986, + "num_tokens": 171593118.0, + "step": 4494 + }, + { + "epoch": 0.571810202264343, + "ewc_loss": 0.03011307492852211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.505653744970914e-05, + "grad_norm": 23.537761688232422, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8412806987762451, + "num_tokens": 171634514.0, + "step": 4495 + }, + { + "epoch": 0.5719374125429335, + "ewc_loss": 0.030104730278253555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5052364688017406e-05, + "grad_norm": 23.56675910949707, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8593107461929321, + "num_tokens": 171675239.0, + "step": 4496 + }, + { + "epoch": 0.5720646228215239, + "ewc_loss": 0.03013649769127369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.506824901298387e-05, + "grad_norm": 23.517301559448242, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8646959066390991, + "num_tokens": 171713154.0, + "step": 4497 + }, + { + "epoch": 0.5721918331001145, + "ewc_loss": 0.03011023998260498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5055119547469076e-05, + "grad_norm": 23.713294982910156, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.848132848739624, + "num_tokens": 171750046.0, + "step": 4498 + }, + { + "epoch": 0.572319043378705, + "ewc_loss": 0.030106477439403534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.505323871242581e-05, + "grad_norm": 23.536319732666016, + "learning_rate": 1e-06, + "loss": 0.4884, + 
"mean_token_accuracy": 0.8455281257629395, + "num_tokens": 171782708.0, + "step": 4499 + }, + { + "epoch": 0.5724462536572955, + "ewc_loss": 0.03012329712510109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5061648809933104e-05, + "grad_norm": 23.663427352905273, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8502722978591919, + "num_tokens": 171819421.0, + "step": 4500 + }, + { + "epoch": 0.572573463935886, + "ewc_loss": 0.03010711818933487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5053558854560833e-05, + "grad_norm": 23.491945266723633, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8540363907814026, + "num_tokens": 171860993.0, + "step": 4501 + }, + { + "epoch": 0.5727006742144766, + "ewc_loss": 0.030101675540208817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5050837646413129e-05, + "grad_norm": 23.665645599365234, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8533161878585815, + "num_tokens": 171896894.0, + "step": 4502 + }, + { + "epoch": 0.572827884493067, + "ewc_loss": 0.03018871136009693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5094356058398262e-05, + "grad_norm": 23.390920639038086, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.852415919303894, + "num_tokens": 171938694.0, + "step": 4503 + }, + { + "epoch": 0.5729550947716575, + "ewc_loss": 0.030123505741357803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5061752492329106e-05, + "grad_norm": 23.721784591674805, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.880073070526123, + "num_tokens": 171972606.0, + "step": 4504 + }, + { + "epoch": 0.573082305050248, + "ewc_loss": 0.030224686488509178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5112343135115225e-05, + "grad_norm": 23.541614532470703, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8581315279006958, + "num_tokens": 172010861.0, + "step": 4505 + }, + { + "epoch": 
0.5732095153288386, + "ewc_loss": 0.030122453346848488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5061226804391481e-05, + "grad_norm": 23.66242790222168, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.8399635553359985, + "num_tokens": 172049867.0, + "step": 4506 + }, + { + "epoch": 0.5733367256074291, + "ewc_loss": 0.030207594856619835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5103797522897366e-05, + "grad_norm": 23.568437576293945, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8546297550201416, + "num_tokens": 172092176.0, + "step": 4507 + }, + { + "epoch": 0.5734639358860196, + "ewc_loss": 0.030142581090331078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5071290363266598e-05, + "grad_norm": 23.554363250732422, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8549612760543823, + "num_tokens": 172132789.0, + "step": 4508 + }, + { + "epoch": 0.5735911461646102, + "ewc_loss": 0.030229289084672928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.511464415671071e-05, + "grad_norm": 23.571500778198242, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8700671195983887, + "num_tokens": 172170536.0, + "step": 4509 + }, + { + "epoch": 0.5737183564432006, + "ewc_loss": 0.030225923284888268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.511296159151243e-05, + "grad_norm": 23.517499923706055, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8543187379837036, + "num_tokens": 172206828.0, + "step": 4510 + }, + { + "epoch": 0.5738455667217911, + "ewc_loss": 0.03028869442641735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5144347344175912e-05, + "grad_norm": 23.667987823486328, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8462724089622498, + "num_tokens": 172242311.0, + "step": 4511 + }, + { + "epoch": 0.5739727770003816, + "ewc_loss": 0.030271464958786964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.513573261036072e-05, + "grad_norm": 23.704797744750977, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8495350480079651, + "num_tokens": 172275135.0, + "step": 4512 + }, + { + "epoch": 0.5740999872789722, + "ewc_loss": 0.030216889455914497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5108445040823426e-05, + "grad_norm": 23.58857536315918, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8631555438041687, + "num_tokens": 172308682.0, + "step": 4513 + }, + { + "epoch": 0.5742271975575627, + "ewc_loss": 0.03018643893301487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5093219190021046e-05, + "grad_norm": 23.58625030517578, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8505887389183044, + "num_tokens": 172345834.0, + "step": 4514 + }, + { + "epoch": 0.5743544078361532, + "ewc_loss": 0.030218755826354027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5109378182387445e-05, + "grad_norm": 23.601316452026367, + "learning_rate": 1e-06, + "loss": 0.545, + "mean_token_accuracy": 0.8333254456520081, + "num_tokens": 172387976.0, + "step": 4515 + }, + { + "epoch": 0.5744816181147436, + "ewc_loss": 0.030207550153136253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5103774785529822e-05, + "grad_norm": 23.580623626708984, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8540494441986084, + "num_tokens": 172427447.0, + "step": 4516 + }, + { + "epoch": 0.5746088283933342, + "ewc_loss": 0.030273152515292168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5136576621443965e-05, + "grad_norm": 23.63229751586914, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8456540107727051, + "num_tokens": 172468686.0, + "step": 4517 + }, + { + "epoch": 0.5747360386719247, + "ewc_loss": 0.030201908200979233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5100954442459624e-05, + "grad_norm": 23.662853240966797, + "learning_rate": 1e-06, + "loss": 0.4837, + 
"mean_token_accuracy": 0.851952075958252, + "num_tokens": 172508190.0, + "step": 4518 + }, + { + "epoch": 0.5748632489505152, + "ewc_loss": 0.030278611928224564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5139306015043985e-05, + "grad_norm": 23.628694534301758, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8706645965576172, + "num_tokens": 172552295.0, + "step": 4519 + }, + { + "epoch": 0.5749904592291057, + "ewc_loss": 0.03016609512269497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5083047401276417e-05, + "grad_norm": 23.609188079833984, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8603871464729309, + "num_tokens": 172591065.0, + "step": 4520 + }, + { + "epoch": 0.5751176695076963, + "ewc_loss": 0.030225912109017372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.511295613454422e-05, + "grad_norm": 23.51779556274414, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8575694561004639, + "num_tokens": 172633741.0, + "step": 4521 + }, + { + "epoch": 0.5752448797862867, + "ewc_loss": 0.030209679156541824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5104839803825598e-05, + "grad_norm": 23.758182525634766, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8643481135368347, + "num_tokens": 172669081.0, + "step": 4522 + }, + { + "epoch": 0.5753720900648772, + "ewc_loss": 0.03024017997086048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5120090210984927e-05, + "grad_norm": 23.700855255126953, + "learning_rate": 1e-06, + "loss": 0.5413, + "mean_token_accuracy": 0.833320140838623, + "num_tokens": 172704956.0, + "step": 4523 + }, + { + "epoch": 0.5754993003434677, + "ewc_loss": 0.03012342005968094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5061709746078122e-05, + "grad_norm": 23.560781478881836, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8590484261512756, + "num_tokens": 172742270.0, + "step": 4524 + }, + { + "epoch": 
0.5756265106220583, + "ewc_loss": 0.030114885419607162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5057442396937404e-05, + "grad_norm": 23.630483627319336, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8433179259300232, + "num_tokens": 172778065.0, + "step": 4525 + }, + { + "epoch": 0.5757537209006488, + "ewc_loss": 0.030234981328248978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.511749087512726e-05, + "grad_norm": 23.79015350341797, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8629598021507263, + "num_tokens": 172815825.0, + "step": 4526 + }, + { + "epoch": 0.5758809311792393, + "ewc_loss": 0.030120275914669037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5060138139233459e-05, + "grad_norm": 23.540233612060547, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8534774780273438, + "num_tokens": 172849560.0, + "step": 4527 + }, + { + "epoch": 0.5760081414578297, + "ewc_loss": 0.030138865113258362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5069432265590876e-05, + "grad_norm": 23.64018440246582, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8559854030609131, + "num_tokens": 172885203.0, + "step": 4528 + }, + { + "epoch": 0.5761353517364203, + "ewc_loss": 0.030179260298609734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.508963032392785e-05, + "grad_norm": 23.613298416137695, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.853773832321167, + "num_tokens": 172922341.0, + "step": 4529 + }, + { + "epoch": 0.5762625620150108, + "ewc_loss": 0.030112018808722496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.505600903328741e-05, + "grad_norm": 23.661455154418945, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.85981684923172, + "num_tokens": 172953363.0, + "step": 4530 + }, + { + "epoch": 0.5763897722936013, + "ewc_loss": 0.030265357345342636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5132678527152166e-05, + "grad_norm": 23.666866302490234, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8607597351074219, + "num_tokens": 172986253.0, + "step": 4531 + }, + { + "epoch": 0.5765169825721919, + "ewc_loss": 0.030222536996006966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.511126811237773e-05, + "grad_norm": 23.716773986816406, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8658654689788818, + "num_tokens": 173023466.0, + "step": 4532 + }, + { + "epoch": 0.5766441928507824, + "ewc_loss": 0.03025589883327484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5127949154702947e-05, + "grad_norm": 23.720563888549805, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8501557111740112, + "num_tokens": 173063329.0, + "step": 4533 + }, + { + "epoch": 0.5767714031293728, + "ewc_loss": 0.03017081506550312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5085407540027518e-05, + "grad_norm": 23.70534896850586, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8615303039550781, + "num_tokens": 173100676.0, + "step": 4534 + }, + { + "epoch": 0.5768986134079633, + "ewc_loss": 0.03019840642809868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.509920275566401e-05, + "grad_norm": 23.699512481689453, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.865859866142273, + "num_tokens": 173135088.0, + "step": 4535 + }, + { + "epoch": 0.5770258236865539, + "ewc_loss": 0.030214592814445496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5107296349015087e-05, + "grad_norm": 23.703859329223633, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8474174737930298, + "num_tokens": 173175810.0, + "step": 4536 + }, + { + "epoch": 0.5771530339651444, + "ewc_loss": 0.030135752633213997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5067876120156143e-05, + "grad_norm": 23.522546768188477, + "learning_rate": 1e-06, + "loss": 0.4624, + 
"mean_token_accuracy": 0.8562174439430237, + "num_tokens": 173220826.0, + "step": 4537 + }, + { + "epoch": 0.5772802442437349, + "ewc_loss": 0.03016597218811512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5082986465131398e-05, + "grad_norm": 23.901657104492188, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8665773868560791, + "num_tokens": 173258411.0, + "step": 4538 + }, + { + "epoch": 0.5774074545223254, + "ewc_loss": 0.030225500464439392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5112750588741619e-05, + "grad_norm": 23.534425735473633, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8393838405609131, + "num_tokens": 173297298.0, + "step": 4539 + }, + { + "epoch": 0.5775346648009159, + "ewc_loss": 0.030163463205099106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5081731362442952e-05, + "grad_norm": 23.777690887451172, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8377367854118347, + "num_tokens": 173330839.0, + "step": 4540 + }, + { + "epoch": 0.5776618750795064, + "ewc_loss": 0.03027823381125927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5139116840146016e-05, + "grad_norm": 23.77906036376953, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8480397462844849, + "num_tokens": 173370219.0, + "step": 4541 + }, + { + "epoch": 0.5777890853580969, + "ewc_loss": 0.030196376144886017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5098187759576831e-05, + "grad_norm": 23.69964599609375, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8570944666862488, + "num_tokens": 173410425.0, + "step": 4542 + }, + { + "epoch": 0.5779162956366874, + "ewc_loss": 0.03017612174153328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.508806053607259e-05, + "grad_norm": 23.607275009155273, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8545461893081665, + "num_tokens": 173447109.0, + "step": 4543 + }, + { + "epoch": 
0.578043505915278, + "ewc_loss": 0.030230406671762466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.51152034959523e-05, + "grad_norm": 23.987104415893555, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8603214621543884, + "num_tokens": 173490064.0, + "step": 4544 + }, + { + "epoch": 0.5781707161938685, + "ewc_loss": 0.030225537717342377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5112768778635655e-05, + "grad_norm": 23.495466232299805, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8575757741928101, + "num_tokens": 173523791.0, + "step": 4545 + }, + { + "epoch": 0.5782979264724589, + "ewc_loss": 0.030184784904122353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5092392459337134e-05, + "grad_norm": 23.956905364990234, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8637168407440186, + "num_tokens": 173564069.0, + "step": 4546 + }, + { + "epoch": 0.5784251367510495, + "ewc_loss": 0.030314678326249123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5157339475990739e-05, + "grad_norm": 24.003049850463867, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8548635244369507, + "num_tokens": 173601893.0, + "step": 4547 + }, + { + "epoch": 0.57855234702964, + "ewc_loss": 0.030108803883194923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5054401956149377e-05, + "grad_norm": 23.613277435302734, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8649178743362427, + "num_tokens": 173642299.0, + "step": 4548 + }, + { + "epoch": 0.5786795573082305, + "ewc_loss": 0.030144881457090378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5072440874064341e-05, + "grad_norm": 24.22249412536621, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8565976619720459, + "num_tokens": 173675824.0, + "step": 4549 + }, + { + "epoch": 0.578806767586821, + "ewc_loss": 0.030187975615262985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5093987713044044e-05, + "grad_norm": 23.96257209777832, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8571434617042542, + "num_tokens": 173712192.0, + "step": 4550 + }, + { + "epoch": 0.5789339778654116, + "ewc_loss": 0.029947564005851746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.497378161730012e-05, + "grad_norm": 23.53470230102539, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8621424436569214, + "num_tokens": 173746949.0, + "step": 4551 + }, + { + "epoch": 0.579061188144002, + "ewc_loss": 0.030045678839087486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5022839761513751e-05, + "grad_norm": 24.219457626342773, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8578138947486877, + "num_tokens": 173784527.0, + "step": 4552 + }, + { + "epoch": 0.5791883984225925, + "ewc_loss": 0.030216487124562263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5108243132999633e-05, + "grad_norm": 23.871463775634766, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8617908954620361, + "num_tokens": 173820648.0, + "step": 4553 + }, + { + "epoch": 0.579315608701183, + "ewc_loss": 0.02991129644215107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.495564811193617e-05, + "grad_norm": 23.636455535888672, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8486366271972656, + "num_tokens": 173860427.0, + "step": 4554 + }, + { + "epoch": 0.5794428189797736, + "ewc_loss": 0.03003842942416668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5019214515632484e-05, + "grad_norm": 23.8765869140625, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8578616380691528, + "num_tokens": 173899876.0, + "step": 4555 + }, + { + "epoch": 0.5795700292583641, + "ewc_loss": 0.030067449435591698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.503372459410457e-05, + "grad_norm": 23.44692039489746, + "learning_rate": 1e-06, + "loss": 0.4337, + 
"mean_token_accuracy": 0.8640236258506775, + "num_tokens": 173937354.0, + "step": 4556 + }, + { + "epoch": 0.5796972395369546, + "ewc_loss": 0.03002556972205639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5012785297585651e-05, + "grad_norm": 23.810325622558594, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8458728194236755, + "num_tokens": 173976039.0, + "step": 4557 + }, + { + "epoch": 0.5798244498155452, + "ewc_loss": 0.030214648693799973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5107324543350842e-05, + "grad_norm": 23.659196853637695, + "learning_rate": 1e-06, + "loss": 0.5331, + "mean_token_accuracy": 0.8365393877029419, + "num_tokens": 174006359.0, + "step": 4558 + }, + { + "epoch": 0.5799516600941356, + "ewc_loss": 0.03008679300546646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5043396160763223e-05, + "grad_norm": 23.57677459716797, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8471455574035645, + "num_tokens": 174044662.0, + "step": 4559 + }, + { + "epoch": 0.5800788703727261, + "ewc_loss": 0.030194930732250214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5097465620783623e-05, + "grad_norm": 23.722929000854492, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8516278266906738, + "num_tokens": 174077133.0, + "step": 4560 + }, + { + "epoch": 0.5802060806513166, + "ewc_loss": 0.030205881223082542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5102940778888296e-05, + "grad_norm": 23.584983825683594, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8504371643066406, + "num_tokens": 174121873.0, + "step": 4561 + }, + { + "epoch": 0.5803332909299072, + "ewc_loss": 0.030185727402567863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5092863577592652e-05, + "grad_norm": 23.817623138427734, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8579776287078857, + "num_tokens": 174158873.0, + "step": 4562 + }, + { + "epoch": 
0.5804605012084977, + "ewc_loss": 0.03026626631617546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5133133274503052e-05, + "grad_norm": 23.767192840576172, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8622848987579346, + "num_tokens": 174189222.0, + "step": 4563 + }, + { + "epoch": 0.5805877114870882, + "ewc_loss": 0.030134430155158043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5067214917507954e-05, + "grad_norm": 23.59189796447754, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.847908616065979, + "num_tokens": 174229018.0, + "step": 4564 + }, + { + "epoch": 0.5807149217656786, + "ewc_loss": 0.03023535944521427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5117680050025228e-05, + "grad_norm": 23.82216453552246, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8553407788276672, + "num_tokens": 174269215.0, + "step": 4565 + }, + { + "epoch": 0.5808421320442692, + "ewc_loss": 0.030259711667895317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5129855455597863e-05, + "grad_norm": 23.753578186035156, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.863389790058136, + "num_tokens": 174303926.0, + "step": 4566 + }, + { + "epoch": 0.5809693423228597, + "ewc_loss": 0.030218251049518585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5109125342860352e-05, + "grad_norm": 23.661779403686523, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.85774827003479, + "num_tokens": 174347491.0, + "step": 4567 + }, + { + "epoch": 0.5810965526014502, + "ewc_loss": 0.030241522938013077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5120761418074835e-05, + "grad_norm": 23.86116600036621, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8567557334899902, + "num_tokens": 174384246.0, + "step": 4568 + }, + { + "epoch": 0.5812237628800407, + "ewc_loss": 0.030216647312045097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.510832407802809e-05, + "grad_norm": 23.502399444580078, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8537086844444275, + "num_tokens": 174417081.0, + "step": 4569 + }, + { + "epoch": 0.5813509731586313, + "ewc_loss": 0.030215395614504814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5107697436178569e-05, + "grad_norm": 23.844432830810547, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8457106947898865, + "num_tokens": 174457364.0, + "step": 4570 + }, + { + "epoch": 0.5814781834372217, + "ewc_loss": 0.030298728495836258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5149364116950892e-05, + "grad_norm": 23.52969741821289, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8572033643722534, + "num_tokens": 174494304.0, + "step": 4571 + }, + { + "epoch": 0.5816053937158122, + "ewc_loss": 0.03018072433769703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5090362467162777e-05, + "grad_norm": 23.6926212310791, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.858057975769043, + "num_tokens": 174528047.0, + "step": 4572 + }, + { + "epoch": 0.5817326039944027, + "ewc_loss": 0.030356545001268387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5178272406046744e-05, + "grad_norm": 23.777742385864258, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.860734224319458, + "num_tokens": 174565990.0, + "step": 4573 + }, + { + "epoch": 0.5818598142729933, + "ewc_loss": 0.030250903218984604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5125451682251878e-05, + "grad_norm": 23.503204345703125, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8409971594810486, + "num_tokens": 174599039.0, + "step": 4574 + }, + { + "epoch": 0.5819870245515838, + "ewc_loss": 0.03030896931886673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5154484572121873e-05, + "grad_norm": 23.631179809570312, + "learning_rate": 1e-06, + "loss": 0.4702, + 
"mean_token_accuracy": 0.849457859992981, + "num_tokens": 174634786.0, + "step": 4575 + }, + { + "epoch": 0.5821142348301743, + "ewc_loss": 0.03039042092859745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5195210835372563e-05, + "grad_norm": 23.55708122253418, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8444774150848389, + "num_tokens": 174677875.0, + "step": 4576 + }, + { + "epoch": 0.5822414451087647, + "ewc_loss": 0.030380716547369957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5190358681138605e-05, + "grad_norm": 23.761709213256836, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.849196195602417, + "num_tokens": 174715421.0, + "step": 4577 + }, + { + "epoch": 0.5823686553873553, + "ewc_loss": 0.030361376702785492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5180688023974653e-05, + "grad_norm": 23.619104385375977, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8521600365638733, + "num_tokens": 174746555.0, + "step": 4578 + }, + { + "epoch": 0.5824958656659458, + "ewc_loss": 0.030325276777148247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5162638192123268e-05, + "grad_norm": 23.785470962524414, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8440520763397217, + "num_tokens": 174786470.0, + "step": 4579 + }, + { + "epoch": 0.5826230759445363, + "ewc_loss": 0.030386323109269142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5193161743809469e-05, + "grad_norm": 23.625879287719727, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8527740240097046, + "num_tokens": 174827131.0, + "step": 4580 + }, + { + "epoch": 0.5827502862231269, + "ewc_loss": 0.030335774645209312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.51678868860472e-05, + "grad_norm": 23.635740280151367, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8487654328346252, + "num_tokens": 174861391.0, + "step": 4581 + }, + { + "epoch": 
0.5828774965017174, + "ewc_loss": 0.030413564294576645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.520678233646322e-05, + "grad_norm": 23.75233268737793, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8592039346694946, + "num_tokens": 174897437.0, + "step": 4582 + }, + { + "epoch": 0.5830047067803078, + "ewc_loss": 0.03034350275993347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5171751329035033e-05, + "grad_norm": 23.658432006835938, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8466712236404419, + "num_tokens": 174932504.0, + "step": 4583 + }, + { + "epoch": 0.5831319170588983, + "ewc_loss": 0.030358288437128067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5179144611465745e-05, + "grad_norm": 23.70783233642578, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.849837064743042, + "num_tokens": 174974746.0, + "step": 4584 + }, + { + "epoch": 0.5832591273374889, + "ewc_loss": 0.030435333028435707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5217666259559337e-05, + "grad_norm": 23.759685516357422, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8498549461364746, + "num_tokens": 175016155.0, + "step": 4585 + }, + { + "epoch": 0.5833863376160794, + "ewc_loss": 0.030355948954820633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5177974091784563e-05, + "grad_norm": 23.69019889831543, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8405899405479431, + "num_tokens": 175060595.0, + "step": 4586 + }, + { + "epoch": 0.5835135478946699, + "ewc_loss": 0.03035053424537182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5175267435552087e-05, + "grad_norm": 23.779348373413086, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8545366525650024, + "num_tokens": 175101278.0, + "step": 4587 + }, + { + "epoch": 0.5836407581732604, + "ewc_loss": 0.03035876899957657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5179384718067013e-05, + "grad_norm": 23.73383331298828, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8480695486068726, + "num_tokens": 175139479.0, + "step": 4588 + }, + { + "epoch": 0.5837679684518509, + "ewc_loss": 0.030329901725053787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5164951037149876e-05, + "grad_norm": 23.721712112426758, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8382328748703003, + "num_tokens": 175182376.0, + "step": 4589 + }, + { + "epoch": 0.5838951787304414, + "ewc_loss": 0.030369458720088005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.518472890893463e-05, + "grad_norm": 23.746376037597656, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8476938605308533, + "num_tokens": 175225449.0, + "step": 4590 + }, + { + "epoch": 0.5840223890090319, + "ewc_loss": 0.03032499924302101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5162499948928598e-05, + "grad_norm": 23.673158645629883, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8524972796440125, + "num_tokens": 175255959.0, + "step": 4591 + }, + { + "epoch": 0.5841495992876224, + "ewc_loss": 0.030357081443071365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5178540706983767e-05, + "grad_norm": 23.766742706298828, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.841721773147583, + "num_tokens": 175292142.0, + "step": 4592 + }, + { + "epoch": 0.584276809566213, + "ewc_loss": 0.030385959893465042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5192979844869114e-05, + "grad_norm": 23.756999969482422, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8482997417449951, + "num_tokens": 175336630.0, + "step": 4593 + }, + { + "epoch": 0.5844040198448035, + "ewc_loss": 0.030346930027008057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5173464817053173e-05, + "grad_norm": 23.58735466003418, + "learning_rate": 1e-06, + "loss": 0.4324, + 
"mean_token_accuracy": 0.8658528327941895, + "num_tokens": 175372697.0, + "step": 4594 + }, + { + "epoch": 0.5845312301233939, + "ewc_loss": 0.030358972027897835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.517948567197891e-05, + "grad_norm": 23.82392692565918, + "learning_rate": 1e-06, + "loss": 0.515, + "mean_token_accuracy": 0.8423689603805542, + "num_tokens": 175412395.0, + "step": 4595 + }, + { + "epoch": 0.5846584404019844, + "ewc_loss": 0.030371686443686485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5185843039944302e-05, + "grad_norm": 23.62210464477539, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8539086580276489, + "num_tokens": 175447295.0, + "step": 4596 + }, + { + "epoch": 0.584785650680575, + "ewc_loss": 0.030313313007354736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5156656445469707e-05, + "grad_norm": 23.67997932434082, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8641701936721802, + "num_tokens": 175480229.0, + "step": 4597 + }, + { + "epoch": 0.5849128609591655, + "ewc_loss": 0.03031732514500618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5158662790781818e-05, + "grad_norm": 23.65102767944336, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8539865016937256, + "num_tokens": 175523081.0, + "step": 4598 + }, + { + "epoch": 0.585040071237756, + "ewc_loss": 0.030369186773896217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5184593394224066e-05, + "grad_norm": 23.62060546875, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8585526943206787, + "num_tokens": 175565036.0, + "step": 4599 + }, + { + "epoch": 0.5851672815163466, + "ewc_loss": 0.030410710722208023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5205355339276139e-05, + "grad_norm": 23.723831176757812, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8542995452880859, + "num_tokens": 175606733.0, + "step": 4600 + }, + { + "epoch": 
0.585294491794937, + "ewc_loss": 0.030442645773291588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5221322428260464e-05, + "grad_norm": 23.767778396606445, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8699780702590942, + "num_tokens": 175637663.0, + "step": 4601 + }, + { + "epoch": 0.5854217020735275, + "ewc_loss": 0.03038857690989971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5194288607744966e-05, + "grad_norm": 23.729806900024414, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8571149706840515, + "num_tokens": 175677381.0, + "step": 4602 + }, + { + "epoch": 0.585548912352118, + "ewc_loss": 0.030370555818080902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5185278243734501e-05, + "grad_norm": 23.69849395751953, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8654706478118896, + "num_tokens": 175713049.0, + "step": 4603 + }, + { + "epoch": 0.5856761226307086, + "ewc_loss": 0.03040209412574768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.520104706287384e-05, + "grad_norm": 23.792156219482422, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8448909521102905, + "num_tokens": 175753681.0, + "step": 4604 + }, + { + "epoch": 0.5858033329092991, + "ewc_loss": 0.030367815867066383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.518390763521893e-05, + "grad_norm": 23.68433380126953, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8421084880828857, + "num_tokens": 175789725.0, + "step": 4605 + }, + { + "epoch": 0.5859305431878896, + "ewc_loss": 0.030374610796570778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5187305507424753e-05, + "grad_norm": 23.744421005249023, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.85466068983078, + "num_tokens": 175823203.0, + "step": 4606 + }, + { + "epoch": 0.5860577534664801, + "ewc_loss": 0.030400387942790985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5200193956843577e-05, + "grad_norm": 23.666717529296875, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8486545085906982, + "num_tokens": 175860506.0, + "step": 4607 + }, + { + "epoch": 0.5861849637450706, + "ewc_loss": 0.03040476143360138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.520238038210664e-05, + "grad_norm": 23.714475631713867, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8706722259521484, + "num_tokens": 175895080.0, + "step": 4608 + }, + { + "epoch": 0.5863121740236611, + "ewc_loss": 0.03043944388628006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5219721717585344e-05, + "grad_norm": 23.67267608642578, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8481725454330444, + "num_tokens": 175937113.0, + "step": 4609 + }, + { + "epoch": 0.5864393843022516, + "ewc_loss": 0.030387740582227707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.519387024018215e-05, + "grad_norm": 23.658803939819336, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8674110174179077, + "num_tokens": 175966193.0, + "step": 4610 + }, + { + "epoch": 0.5865665945808421, + "ewc_loss": 0.03042055480182171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5210277524602134e-05, + "grad_norm": 23.617950439453125, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8481696844100952, + "num_tokens": 176003771.0, + "step": 4611 + }, + { + "epoch": 0.5866938048594327, + "ewc_loss": 0.03044344298541546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5221721696434543e-05, + "grad_norm": 23.676145553588867, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8597025871276855, + "num_tokens": 176050069.0, + "step": 4612 + }, + { + "epoch": 0.5868210151380232, + "ewc_loss": 0.030440624803304672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5220311979646794e-05, + "grad_norm": 23.612751007080078, + "learning_rate": 1e-06, + "loss": 0.4161, + 
"mean_token_accuracy": 0.8692438006401062, + "num_tokens": 176088763.0, + "step": 4613 + }, + { + "epoch": 0.5869482254166136, + "ewc_loss": 0.030434293672442436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5217146938084625e-05, + "grad_norm": 23.722179412841797, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8567402362823486, + "num_tokens": 176127853.0, + "step": 4614 + }, + { + "epoch": 0.5870754356952042, + "ewc_loss": 0.030484799295663834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5242399967974052e-05, + "grad_norm": 23.621952056884766, + "learning_rate": 1e-06, + "loss": 0.5124, + "mean_token_accuracy": 0.8414700031280518, + "num_tokens": 176173775.0, + "step": 4615 + }, + { + "epoch": 0.5872026459737947, + "ewc_loss": 0.030432848259806633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5216423889796715e-05, + "grad_norm": 23.787425994873047, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8579437732696533, + "num_tokens": 176210878.0, + "step": 4616 + }, + { + "epoch": 0.5873298562523852, + "ewc_loss": 0.030512360855937004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5256180631695315e-05, + "grad_norm": 23.730010986328125, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8489038944244385, + "num_tokens": 176255094.0, + "step": 4617 + }, + { + "epoch": 0.5874570665309757, + "ewc_loss": 0.03040817193686962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5204085684672464e-05, + "grad_norm": 23.75635528564453, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8514465093612671, + "num_tokens": 176293586.0, + "step": 4618 + }, + { + "epoch": 0.5875842768095663, + "ewc_loss": 0.030456751585006714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5228375559672713e-05, + "grad_norm": 23.72350311279297, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8576494455337524, + "num_tokens": 176329203.0, + "step": 4619 + }, + { + "epoch": 
0.5877114870881567, + "ewc_loss": 0.030416838824748993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5208419426926412e-05, + "grad_norm": 23.66551971435547, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.872259259223938, + "num_tokens": 176367984.0, + "step": 4620 + }, + { + "epoch": 0.5878386973667472, + "ewc_loss": 0.03040464036166668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5202320355456322e-05, + "grad_norm": 23.724241256713867, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8684929609298706, + "num_tokens": 176408949.0, + "step": 4621 + }, + { + "epoch": 0.5879659076453377, + "ewc_loss": 0.030400512740015984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5200256711978e-05, + "grad_norm": 23.770092010498047, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8704761266708374, + "num_tokens": 176451073.0, + "step": 4622 + }, + { + "epoch": 0.5880931179239283, + "ewc_loss": 0.03043448179960251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5217240616038907e-05, + "grad_norm": 23.733327865600586, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8625684976577759, + "num_tokens": 176490623.0, + "step": 4623 + }, + { + "epoch": 0.5882203282025188, + "ewc_loss": 0.030328210443258286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5164105207077228e-05, + "grad_norm": 23.71516990661621, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8418412804603577, + "num_tokens": 176525771.0, + "step": 4624 + }, + { + "epoch": 0.5883475384811093, + "ewc_loss": 0.03039281815290451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5196408639894798e-05, + "grad_norm": 23.757814407348633, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8742780685424805, + "num_tokens": 176565417.0, + "step": 4625 + }, + { + "epoch": 0.5884747487596997, + "ewc_loss": 0.030387552455067635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5193776562227868e-05, + "grad_norm": 23.586442947387695, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8513363599777222, + "num_tokens": 176599005.0, + "step": 4626 + }, + { + "epoch": 0.5886019590382903, + "ewc_loss": 0.030381031334400177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5190516023722012e-05, + "grad_norm": 23.833742141723633, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.857401967048645, + "num_tokens": 176637947.0, + "step": 4627 + }, + { + "epoch": 0.5887291693168808, + "ewc_loss": 0.030514433979988098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5257217455655336e-05, + "grad_norm": 23.739768981933594, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8455933928489685, + "num_tokens": 176675241.0, + "step": 4628 + }, + { + "epoch": 0.5888563795954713, + "ewc_loss": 0.030283229425549507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5141614312597085e-05, + "grad_norm": 23.799049377441406, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8495543003082275, + "num_tokens": 176715375.0, + "step": 4629 + }, + { + "epoch": 0.5889835898740619, + "ewc_loss": 0.030448399484157562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5224200069496874e-05, + "grad_norm": 23.933618545532227, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8548322916030884, + "num_tokens": 176750977.0, + "step": 4630 + }, + { + "epoch": 0.5891108001526524, + "ewc_loss": 0.030314411967992783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5157205780269578e-05, + "grad_norm": 23.769886016845703, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8582615852355957, + "num_tokens": 176788968.0, + "step": 4631 + }, + { + "epoch": 0.5892380104312428, + "ewc_loss": 0.03036515973508358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5182579772954341e-05, + "grad_norm": 23.8480167388916, + "learning_rate": 1e-06, + "loss": 0.4784, + 
"mean_token_accuracy": 0.8519518375396729, + "num_tokens": 176824399.0, + "step": 4632 + }, + { + "epoch": 0.5893652207098333, + "ewc_loss": 0.03033485822379589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5167429410212208e-05, + "grad_norm": 23.830120086669922, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8502275347709656, + "num_tokens": 176862053.0, + "step": 4633 + }, + { + "epoch": 0.5894924309884239, + "ewc_loss": 0.030325692147016525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5162846466409974e-05, + "grad_norm": 23.724130630493164, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8695806264877319, + "num_tokens": 176896943.0, + "step": 4634 + }, + { + "epoch": 0.5896196412670144, + "ewc_loss": 0.030321814119815826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5160907423705794e-05, + "grad_norm": 23.72549819946289, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.855546236038208, + "num_tokens": 176936263.0, + "step": 4635 + }, + { + "epoch": 0.5897468515456049, + "ewc_loss": 0.030363192781805992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5181596609181724e-05, + "grad_norm": 23.728849411010742, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8436325192451477, + "num_tokens": 176976632.0, + "step": 4636 + }, + { + "epoch": 0.5898740618241954, + "ewc_loss": 0.030362719669938087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5181360140559264e-05, + "grad_norm": 23.734506607055664, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8602432012557983, + "num_tokens": 177007036.0, + "step": 4637 + }, + { + "epoch": 0.5900012721027859, + "ewc_loss": 0.03038271889090538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5191359125310555e-05, + "grad_norm": 23.696109771728516, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8461998701095581, + "num_tokens": 177042189.0, + "step": 4638 + }, + { + "epoch": 
0.5901284823813764, + "ewc_loss": 0.030436452478170395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.521822650829563e-05, + "grad_norm": 23.792972564697266, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8498559594154358, + "num_tokens": 177082428.0, + "step": 4639 + }, + { + "epoch": 0.5902556926599669, + "ewc_loss": 0.03038027510046959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5190137673926074e-05, + "grad_norm": 23.67379379272461, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8580339550971985, + "num_tokens": 177124447.0, + "step": 4640 + }, + { + "epoch": 0.5903829029385574, + "ewc_loss": 0.03039710409939289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5198552318906877e-05, + "grad_norm": 23.6723575592041, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8708791732788086, + "num_tokens": 177160516.0, + "step": 4641 + }, + { + "epoch": 0.590510113217148, + "ewc_loss": 0.030473580583930016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5236790204653516e-05, + "grad_norm": 23.661523818969727, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8626866340637207, + "num_tokens": 177198100.0, + "step": 4642 + }, + { + "epoch": 0.5906373234957385, + "ewc_loss": 0.030419282615184784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5209640878310893e-05, + "grad_norm": 23.74228286743164, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8487774133682251, + "num_tokens": 177243170.0, + "step": 4643 + }, + { + "epoch": 0.5907645337743289, + "ewc_loss": 0.030473388731479645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.523669470770983e-05, + "grad_norm": 23.685449600219727, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.852160632610321, + "num_tokens": 177282976.0, + "step": 4644 + }, + { + "epoch": 0.5908917440529194, + "ewc_loss": 0.030333558097481728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.516677912150044e-05, + "grad_norm": 23.600067138671875, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8378117084503174, + "num_tokens": 177323725.0, + "step": 4645 + }, + { + "epoch": 0.59101895433151, + "ewc_loss": 0.03043845295906067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5219226042972878e-05, + "grad_norm": 23.712419509887695, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8549057245254517, + "num_tokens": 177352178.0, + "step": 4646 + }, + { + "epoch": 0.5911461646101005, + "ewc_loss": 0.030482878908514977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.524143954156898e-05, + "grad_norm": 23.872215270996094, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8525237441062927, + "num_tokens": 177392322.0, + "step": 4647 + }, + { + "epoch": 0.591273374888691, + "ewc_loss": 0.030418356880545616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5209178855002392e-05, + "grad_norm": 23.59838104248047, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8537549376487732, + "num_tokens": 177432013.0, + "step": 4648 + }, + { + "epoch": 0.5914005851672816, + "ewc_loss": 0.03042936511337757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.521468220744282e-05, + "grad_norm": 23.787254333496094, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8685237765312195, + "num_tokens": 177473542.0, + "step": 4649 + }, + { + "epoch": 0.591527795445872, + "ewc_loss": 0.030470073223114014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5235036698868498e-05, + "grad_norm": 23.748018264770508, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8521538376808167, + "num_tokens": 177516590.0, + "step": 4650 + }, + { + "epoch": 0.5916550057244625, + "ewc_loss": 0.030403446406126022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5201722817437258e-05, + "grad_norm": 23.813556671142578, + "learning_rate": 1e-06, + "loss": 0.4365, + 
"mean_token_accuracy": 0.8686283826828003, + "num_tokens": 177553750.0, + "step": 4651 + }, + { + "epoch": 0.591782216003053, + "ewc_loss": 0.030474429950118065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5237214938679244e-05, + "grad_norm": 23.860942840576172, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.848697304725647, + "num_tokens": 177586093.0, + "step": 4652 + }, + { + "epoch": 0.5919094262816436, + "ewc_loss": 0.03039836511015892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5199182598735206e-05, + "grad_norm": 23.73998260498047, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.85097336769104, + "num_tokens": 177626800.0, + "step": 4653 + }, + { + "epoch": 0.5920366365602341, + "ewc_loss": 0.030396806076169014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5198403161775786e-05, + "grad_norm": 23.733922958374023, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8543009161949158, + "num_tokens": 177662336.0, + "step": 4654 + }, + { + "epoch": 0.5921638468388246, + "ewc_loss": 0.030457885935902596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.522894308436662e-05, + "grad_norm": 23.749710083007812, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8638287782669067, + "num_tokens": 177706922.0, + "step": 4655 + }, + { + "epoch": 0.592291057117415, + "ewc_loss": 0.030412614345550537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5206307580228895e-05, + "grad_norm": 23.700868606567383, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.853775143623352, + "num_tokens": 177743922.0, + "step": 4656 + }, + { + "epoch": 0.5924182673960056, + "ewc_loss": 0.030424663797020912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5212332073133439e-05, + "grad_norm": 23.610496520996094, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.861139714717865, + "num_tokens": 177777069.0, + "step": 4657 + }, + { + "epoch": 
0.5925454776745961, + "ewc_loss": 0.030470602214336395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5235301361826714e-05, + "grad_norm": 23.698755264282227, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8632254004478455, + "num_tokens": 177810959.0, + "step": 4658 + }, + { + "epoch": 0.5926726879531866, + "ewc_loss": 0.030499977990984917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5249988791765645e-05, + "grad_norm": 23.730575561523438, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8469064831733704, + "num_tokens": 177846366.0, + "step": 4659 + }, + { + "epoch": 0.5927998982317771, + "ewc_loss": 0.03049815073609352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5249075659085065e-05, + "grad_norm": 23.777393341064453, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8674355149269104, + "num_tokens": 177881450.0, + "step": 4660 + }, + { + "epoch": 0.5929271085103677, + "ewc_loss": 0.030521761626005173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5260880900314078e-05, + "grad_norm": 23.785476684570312, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8558457493782043, + "num_tokens": 177922931.0, + "step": 4661 + }, + { + "epoch": 0.5930543187889582, + "ewc_loss": 0.030455444008111954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5227721632982139e-05, + "grad_norm": 23.646236419677734, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8388399481773376, + "num_tokens": 177962722.0, + "step": 4662 + }, + { + "epoch": 0.5931815290675486, + "ewc_loss": 0.03051839955151081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.52592001541052e-05, + "grad_norm": 23.77033042907715, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8542256355285645, + "num_tokens": 177999956.0, + "step": 4663 + }, + { + "epoch": 0.5933087393461391, + "ewc_loss": 0.03055485710501671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5277428246918134e-05, + "grad_norm": 23.741220474243164, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8450425267219543, + "num_tokens": 178032450.0, + "step": 4664 + }, + { + "epoch": 0.5934359496247297, + "ewc_loss": 0.03051598370075226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5257991435646545e-05, + "grad_norm": 23.764558792114258, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.856380820274353, + "num_tokens": 178075028.0, + "step": 4665 + }, + { + "epoch": 0.5935631599033202, + "ewc_loss": 0.030488643795251846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5244321730278898e-05, + "grad_norm": 23.71017074584961, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.851066529750824, + "num_tokens": 178108267.0, + "step": 4666 + }, + { + "epoch": 0.5936903701819107, + "ewc_loss": 0.03047686070203781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5238430023600813e-05, + "grad_norm": 23.613988876342773, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8550123572349548, + "num_tokens": 178148091.0, + "step": 4667 + }, + { + "epoch": 0.5938175804605013, + "ewc_loss": 0.03051932156085968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.52596603584243e-05, + "grad_norm": 23.926881790161133, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8691729307174683, + "num_tokens": 178185626.0, + "step": 4668 + }, + { + "epoch": 0.5939447907390917, + "ewc_loss": 0.030566353350877762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.528317625343334e-05, + "grad_norm": 23.696699142456055, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8626434206962585, + "num_tokens": 178224165.0, + "step": 4669 + }, + { + "epoch": 0.5940720010176822, + "ewc_loss": 0.03047974780201912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5239874301187228e-05, + "grad_norm": 23.779129028320312, + "learning_rate": 1e-06, + "loss": 0.4515, + 
"mean_token_accuracy": 0.8546155691146851, + "num_tokens": 178264118.0, + "step": 4670 + }, + { + "epoch": 0.5941992112962727, + "ewc_loss": 0.030585404485464096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.529270230093971e-05, + "grad_norm": 23.856958389282227, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8665863275527954, + "num_tokens": 178295634.0, + "step": 4671 + }, + { + "epoch": 0.5943264215748633, + "ewc_loss": 0.030487950891256332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5243975212797523e-05, + "grad_norm": 23.568574905395508, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8623391389846802, + "num_tokens": 178327076.0, + "step": 4672 + }, + { + "epoch": 0.5944536318534538, + "ewc_loss": 0.030499355867505074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5249677744577639e-05, + "grad_norm": 23.66152000427246, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8466416597366333, + "num_tokens": 178357469.0, + "step": 4673 + }, + { + "epoch": 0.5945808421320443, + "ewc_loss": 0.03062291257083416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5311456081690267e-05, + "grad_norm": 23.780683517456055, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8612849712371826, + "num_tokens": 178393830.0, + "step": 4674 + }, + { + "epoch": 0.5947080524106347, + "ewc_loss": 0.030548689886927605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5274345059879124e-05, + "grad_norm": 23.5739803314209, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.854494571685791, + "num_tokens": 178427560.0, + "step": 4675 + }, + { + "epoch": 0.5948352626892253, + "ewc_loss": 0.03063574992120266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.531787529529538e-05, + "grad_norm": 23.881690979003906, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.856874942779541, + "num_tokens": 178464666.0, + "step": 4676 + }, + { + "epoch": 
0.5949624729678158, + "ewc_loss": 0.030656909570097923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5328454537666403e-05, + "grad_norm": 23.71795654296875, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8454339504241943, + "num_tokens": 178508771.0, + "step": 4677 + }, + { + "epoch": 0.5950896832464063, + "ewc_loss": 0.03054128959774971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5270645235432312e-05, + "grad_norm": 23.763307571411133, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8593260049819946, + "num_tokens": 178548516.0, + "step": 4678 + }, + { + "epoch": 0.5952168935249968, + "ewc_loss": 0.030596554279327393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5298277503461577e-05, + "grad_norm": 23.791994094848633, + "learning_rate": 1e-06, + "loss": 0.5814, + "mean_token_accuracy": 0.8210119009017944, + "num_tokens": 178591276.0, + "step": 4679 + }, + { + "epoch": 0.5953441038035874, + "ewc_loss": 0.030606457963585854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.530322879261803e-05, + "grad_norm": 23.74225616455078, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8536086082458496, + "num_tokens": 178632580.0, + "step": 4680 + }, + { + "epoch": 0.5954713140821778, + "ewc_loss": 0.03058898076415062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5294490367523395e-05, + "grad_norm": 23.744646072387695, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8503720164299011, + "num_tokens": 178672858.0, + "step": 4681 + }, + { + "epoch": 0.5955985243607683, + "ewc_loss": 0.03060953877866268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5304769476642832e-05, + "grad_norm": 23.758771896362305, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8561346530914307, + "num_tokens": 178706435.0, + "step": 4682 + }, + { + "epoch": 0.5957257346393589, + "ewc_loss": 0.03051668405532837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5258341591106728e-05, + "grad_norm": 23.68695831298828, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8519766926765442, + "num_tokens": 178746309.0, + "step": 4683 + }, + { + "epoch": 0.5958529449179494, + "ewc_loss": 0.030687926337122917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5343963241321035e-05, + "grad_norm": 23.8643856048584, + "learning_rate": 1e-06, + "loss": 0.5155, + "mean_token_accuracy": 0.8373851180076599, + "num_tokens": 178787639.0, + "step": 4684 + }, + { + "epoch": 0.5959801551965399, + "ewc_loss": 0.030580712482333183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5290355804609135e-05, + "grad_norm": 23.63985824584961, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8602011203765869, + "num_tokens": 178828271.0, + "step": 4685 + }, + { + "epoch": 0.5961073654751304, + "ewc_loss": 0.0305642019957304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5282101230695844e-05, + "grad_norm": 23.806121826171875, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8761887550354004, + "num_tokens": 178862862.0, + "step": 4686 + }, + { + "epoch": 0.5962345757537209, + "ewc_loss": 0.030640341341495514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5320170859922655e-05, + "grad_norm": 23.83928871154785, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8670467138290405, + "num_tokens": 178897969.0, + "step": 4687 + }, + { + "epoch": 0.5963617860323114, + "ewc_loss": 0.03058725968003273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5293629985535517e-05, + "grad_norm": 23.660717010498047, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8606570959091187, + "num_tokens": 178942002.0, + "step": 4688 + }, + { + "epoch": 0.5964889963109019, + "ewc_loss": 0.03055211715400219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5276058547897264e-05, + "grad_norm": 23.644947052001953, + "learning_rate": 1e-06, + "loss": 0.4255, + 
"mean_token_accuracy": 0.8654248714447021, + "num_tokens": 178983473.0, + "step": 4689 + }, + { + "epoch": 0.5966162065894924, + "ewc_loss": 0.03068041428923607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5340207028202713e-05, + "grad_norm": 23.7193603515625, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8433448672294617, + "num_tokens": 179022370.0, + "step": 4690 + }, + { + "epoch": 0.596743416868083, + "ewc_loss": 0.030661528930068016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5330764654208906e-05, + "grad_norm": 23.72089385986328, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8698407411575317, + "num_tokens": 179064841.0, + "step": 4691 + }, + { + "epoch": 0.5968706271466735, + "ewc_loss": 0.030684372410178185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.534218608867377e-05, + "grad_norm": 23.824296951293945, + "learning_rate": 1e-06, + "loss": 0.5438, + "mean_token_accuracy": 0.8269956111907959, + "num_tokens": 179099399.0, + "step": 4692 + }, + { + "epoch": 0.5969978374252639, + "ewc_loss": 0.03063056245446205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5315281416405924e-05, + "grad_norm": 23.63814926147461, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8462023735046387, + "num_tokens": 179133969.0, + "step": 4693 + }, + { + "epoch": 0.5971250477038544, + "ewc_loss": 0.030688028782606125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5344014173024334e-05, + "grad_norm": 23.697967529296875, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8525711297988892, + "num_tokens": 179165456.0, + "step": 4694 + }, + { + "epoch": 0.597252257982445, + "ewc_loss": 0.030694980174303055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.534749026177451e-05, + "grad_norm": 23.803749084472656, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8581790924072266, + "num_tokens": 179198209.0, + "step": 4695 + }, + { + "epoch": 
0.5973794682610355, + "ewc_loss": 0.030730778351426125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.53653891175054e-05, + "grad_norm": 23.800825119018555, + "learning_rate": 1e-06, + "loss": 0.5064, + "mean_token_accuracy": 0.8417009115219116, + "num_tokens": 179233872.0, + "step": 4696 + }, + { + "epoch": 0.597506678539626, + "ewc_loss": 0.03065628372132778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5328141671488993e-05, + "grad_norm": 23.66425895690918, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8765266537666321, + "num_tokens": 179272721.0, + "step": 4697 + }, + { + "epoch": 0.5976338888182166, + "ewc_loss": 0.0307171531021595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5358577002189122e-05, + "grad_norm": 23.781848907470703, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8537139296531677, + "num_tokens": 179305781.0, + "step": 4698 + }, + { + "epoch": 0.597761099096807, + "ewc_loss": 0.03070630505681038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.535315277578775e-05, + "grad_norm": 23.688793182373047, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.859825849533081, + "num_tokens": 179350049.0, + "step": 4699 + }, + { + "epoch": 0.5978883093753975, + "ewc_loss": 0.0306987427175045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5349371096817777e-05, + "grad_norm": 23.65878677368164, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8556283116340637, + "num_tokens": 179388063.0, + "step": 4700 + }, + { + "epoch": 0.598015519653988, + "ewc_loss": 0.030676906928420067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5338453522417694e-05, + "grad_norm": 23.667814254760742, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8629246950149536, + "num_tokens": 179427607.0, + "step": 4701 + }, + { + "epoch": 0.5981427299325786, + "ewc_loss": 0.030750317499041557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5375158909591846e-05, + "grad_norm": 23.75527000427246, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.838297963142395, + "num_tokens": 179471456.0, + "step": 4702 + }, + { + "epoch": 0.5982699402111691, + "ewc_loss": 0.030726533383131027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5363266356871463e-05, + "grad_norm": 23.648488998413086, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8665475249290466, + "num_tokens": 179514318.0, + "step": 4703 + }, + { + "epoch": 0.5983971504897596, + "ewc_loss": 0.03074932098388672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5374660506495275e-05, + "grad_norm": 23.831329345703125, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8688815236091614, + "num_tokens": 179553307.0, + "step": 4704 + }, + { + "epoch": 0.59852436076835, + "ewc_loss": 0.030771473422646523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5385736332973465e-05, + "grad_norm": 23.723270416259766, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8474039435386658, + "num_tokens": 179597288.0, + "step": 4705 + }, + { + "epoch": 0.5986515710469406, + "ewc_loss": 0.03068370930850506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5341855032602325e-05, + "grad_norm": 23.729955673217773, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8561257123947144, + "num_tokens": 179629583.0, + "step": 4706 + }, + { + "epoch": 0.5987787813255311, + "ewc_loss": 0.03071068972349167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.535534465801902e-05, + "grad_norm": 23.77429962158203, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8496882915496826, + "num_tokens": 179672104.0, + "step": 4707 + }, + { + "epoch": 0.5989059916041216, + "ewc_loss": 0.03068324364721775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.534162220195867e-05, + "grad_norm": 23.71600914001465, + "learning_rate": 1e-06, + "loss": 0.5126, + 
"mean_token_accuracy": 0.8461217880249023, + "num_tokens": 179712951.0, + "step": 4708 + }, + { + "epoch": 0.5990332018827121, + "ewc_loss": 0.03068551793694496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5342759070335887e-05, + "grad_norm": 23.721298217773438, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8541975021362305, + "num_tokens": 179754626.0, + "step": 4709 + }, + { + "epoch": 0.5991604121613027, + "ewc_loss": 0.03072560951113701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5362804333562963e-05, + "grad_norm": 23.774133682250977, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8606493473052979, + "num_tokens": 179790068.0, + "step": 4710 + }, + { + "epoch": 0.5992876224398932, + "ewc_loss": 0.030653445050120354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5326722859754227e-05, + "grad_norm": 23.73212432861328, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.8360505700111389, + "num_tokens": 179823936.0, + "step": 4711 + }, + { + "epoch": 0.5994148327184836, + "ewc_loss": 0.030724553391337395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5362276826635934e-05, + "grad_norm": 23.896461486816406, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8569843769073486, + "num_tokens": 179860920.0, + "step": 4712 + }, + { + "epoch": 0.5995420429970741, + "ewc_loss": 0.03064468875527382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5322344552259892e-05, + "grad_norm": 23.829212188720703, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8453823328018188, + "num_tokens": 179899848.0, + "step": 4713 + }, + { + "epoch": 0.5996692532756647, + "ewc_loss": 0.030610058456659317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.530502959212754e-05, + "grad_norm": 23.76258659362793, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8727470636367798, + "num_tokens": 179934462.0, + "step": 4714 + }, + { + "epoch": 
0.5997964635542552, + "ewc_loss": 0.030581369996070862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5290685041691177e-05, + "grad_norm": 23.808422088623047, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8558081388473511, + "num_tokens": 179973955.0, + "step": 4715 + }, + { + "epoch": 0.5999236738328457, + "ewc_loss": 0.030691448599100113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5345724023063667e-05, + "grad_norm": 23.801870346069336, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8574218153953552, + "num_tokens": 180015256.0, + "step": 4716 + }, + { + "epoch": 0.6000508841114363, + "ewc_loss": 0.030632363632321358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.531618181616068e-05, + "grad_norm": 23.855772018432617, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8626418113708496, + "num_tokens": 180054145.0, + "step": 4717 + }, + { + "epoch": 0.6001780943900267, + "ewc_loss": 0.0307060144841671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5353007256635465e-05, + "grad_norm": 23.646678924560547, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.860239565372467, + "num_tokens": 180093411.0, + "step": 4718 + }, + { + "epoch": 0.6003053046686172, + "ewc_loss": 0.030620809644460678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5310404705815017e-05, + "grad_norm": 23.963930130004883, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8654606342315674, + "num_tokens": 180135799.0, + "step": 4719 + }, + { + "epoch": 0.6004325149472077, + "ewc_loss": 0.030748877674341202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5374438589788042e-05, + "grad_norm": 23.693824768066406, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.849621057510376, + "num_tokens": 180172623.0, + "step": 4720 + }, + { + "epoch": 0.6005597252257983, + "ewc_loss": 0.030596595257520676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5298297512345016e-05, + "grad_norm": 23.90122413635254, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8524349927902222, + "num_tokens": 180213833.0, + "step": 4721 + }, + { + "epoch": 0.6006869355043888, + "ewc_loss": 0.03070675954222679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.535338014946319e-05, + "grad_norm": 23.705331802368164, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8629828095436096, + "num_tokens": 180254982.0, + "step": 4722 + }, + { + "epoch": 0.6008141457829793, + "ewc_loss": 0.03060278296470642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5301391613320448e-05, + "grad_norm": 23.818004608154297, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8643795847892761, + "num_tokens": 180288544.0, + "step": 4723 + }, + { + "epoch": 0.6009413560615697, + "ewc_loss": 0.03071659430861473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5358296877820976e-05, + "grad_norm": 23.769258499145508, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8520324230194092, + "num_tokens": 180330233.0, + "step": 4724 + }, + { + "epoch": 0.6010685663401603, + "ewc_loss": 0.03056933544576168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5284667824744247e-05, + "grad_norm": 23.727468490600586, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8593413829803467, + "num_tokens": 180361516.0, + "step": 4725 + }, + { + "epoch": 0.6011957766187508, + "ewc_loss": 0.0306658074259758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5332903785747476e-05, + "grad_norm": 23.93732261657715, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8544480800628662, + "num_tokens": 180404380.0, + "step": 4726 + }, + { + "epoch": 0.6013229868973413, + "ewc_loss": 0.030666319653391838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5333160263253376e-05, + "grad_norm": 23.763286590576172, + "learning_rate": 1e-06, + "loss": 0.4819, + 
"mean_token_accuracy": 0.8486570119857788, + "num_tokens": 180436916.0, + "step": 4727 + }, + { + "epoch": 0.6014501971759318, + "ewc_loss": 0.03054683841764927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.527341919427272e-05, + "grad_norm": 23.75271224975586, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8389633297920227, + "num_tokens": 180476895.0, + "step": 4728 + }, + { + "epoch": 0.6015774074545224, + "ewc_loss": 0.03066888451576233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5334442650782876e-05, + "grad_norm": 23.859085083007812, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8558401465415955, + "num_tokens": 180515248.0, + "step": 4729 + }, + { + "epoch": 0.6017046177331128, + "ewc_loss": 0.030655184760689735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5327592336689122e-05, + "grad_norm": 23.799030303955078, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8626513481140137, + "num_tokens": 180554575.0, + "step": 4730 + }, + { + "epoch": 0.6018318280117033, + "ewc_loss": 0.03061066009104252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5305329725379124e-05, + "grad_norm": 23.790569305419922, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8461233973503113, + "num_tokens": 180594485.0, + "step": 4731 + }, + { + "epoch": 0.6019590382902938, + "ewc_loss": 0.030651306733489037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5325653293984942e-05, + "grad_norm": 23.824411392211914, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8412461280822754, + "num_tokens": 180634746.0, + "step": 4732 + }, + { + "epoch": 0.6020862485688844, + "ewc_loss": 0.030639387667179108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5319694284698926e-05, + "grad_norm": 23.734609603881836, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8462757468223572, + "num_tokens": 180674946.0, + "step": 4733 + }, + { + "epoch": 
0.6022134588474749, + "ewc_loss": 0.030613504350185394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5306752175092697e-05, + "grad_norm": 23.813932418823242, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8549906015396118, + "num_tokens": 180715538.0, + "step": 4734 + }, + { + "epoch": 0.6023406691260654, + "ewc_loss": 0.030639993026852608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5319996236939915e-05, + "grad_norm": 23.792400360107422, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8539804220199585, + "num_tokens": 180755298.0, + "step": 4735 + }, + { + "epoch": 0.6024678794046558, + "ewc_loss": 0.030615566298365593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5307783542084508e-05, + "grad_norm": 23.763931274414062, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8491643071174622, + "num_tokens": 180797104.0, + "step": 4736 + }, + { + "epoch": 0.6025950896832464, + "ewc_loss": 0.03060944750905037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5304724001907744e-05, + "grad_norm": 23.78101921081543, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8663079142570496, + "num_tokens": 180839845.0, + "step": 4737 + }, + { + "epoch": 0.6027222999618369, + "ewc_loss": 0.03066129982471466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5330650057876483e-05, + "grad_norm": 23.68630599975586, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8692004680633545, + "num_tokens": 180878849.0, + "step": 4738 + }, + { + "epoch": 0.6028495102404274, + "ewc_loss": 0.030666084960103035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5333042028942145e-05, + "grad_norm": 23.811298370361328, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8540085554122925, + "num_tokens": 180918427.0, + "step": 4739 + }, + { + "epoch": 0.602976720519018, + "ewc_loss": 0.03066542185842991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.53327109728707e-05, + "grad_norm": 23.91626739501953, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8482261896133423, + "num_tokens": 180956206.0, + "step": 4740 + }, + { + "epoch": 0.6031039307976085, + "ewc_loss": 0.030636360868811607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5318180885515176e-05, + "grad_norm": 23.692790985107422, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8610726594924927, + "num_tokens": 180996945.0, + "step": 4741 + }, + { + "epoch": 0.6032311410761989, + "ewc_loss": 0.03056836500763893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.52841821545735e-05, + "grad_norm": 23.840621948242188, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8605664968490601, + "num_tokens": 181031763.0, + "step": 4742 + }, + { + "epoch": 0.6033583513547894, + "ewc_loss": 0.030610401183366776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5305200577131473e-05, + "grad_norm": 23.66683006286621, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8638923764228821, + "num_tokens": 181070661.0, + "step": 4743 + }, + { + "epoch": 0.60348556163338, + "ewc_loss": 0.030589552596211433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.529477594885975e-05, + "grad_norm": 23.80513572692871, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8423140645027161, + "num_tokens": 181112769.0, + "step": 4744 + }, + { + "epoch": 0.6036127719119705, + "ewc_loss": 0.030654963105916977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5327481378335506e-05, + "grad_norm": 23.784141540527344, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8560006022453308, + "num_tokens": 181151155.0, + "step": 4745 + }, + { + "epoch": 0.603739982190561, + "ewc_loss": 0.03058956004679203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.529477958683856e-05, + "grad_norm": 23.72331428527832, + "learning_rate": 1e-06, + "loss": 0.461, + 
"mean_token_accuracy": 0.855652391910553, + "num_tokens": 181188689.0, + "step": 4746 + }, + { + "epoch": 0.6038671924691515, + "ewc_loss": 0.03059942089021206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.529971086711157e-05, + "grad_norm": 23.703079223632812, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8606383800506592, + "num_tokens": 181230031.0, + "step": 4747 + }, + { + "epoch": 0.603994402747742, + "ewc_loss": 0.0306649561971426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5332478142227046e-05, + "grad_norm": 23.823225021362305, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8549792170524597, + "num_tokens": 181271464.0, + "step": 4748 + }, + { + "epoch": 0.6041216130263325, + "ewc_loss": 0.030652886256575584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.532644273538608e-05, + "grad_norm": 23.64998435974121, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8503737449645996, + "num_tokens": 181312607.0, + "step": 4749 + }, + { + "epoch": 0.604248823304923, + "ewc_loss": 0.030616389587521553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.530819463368971e-05, + "grad_norm": 23.829219818115234, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8641179800033569, + "num_tokens": 181348628.0, + "step": 4750 + }, + { + "epoch": 0.6043760335835135, + "ewc_loss": 0.03073720633983612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5368603271781467e-05, + "grad_norm": 23.751615524291992, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8484801054000854, + "num_tokens": 181387975.0, + "step": 4751 + }, + { + "epoch": 0.6045032438621041, + "ewc_loss": 0.030607877299189568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5303938198485412e-05, + "grad_norm": 23.823427200317383, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8690565228462219, + "num_tokens": 181424163.0, + "step": 4752 + }, + { + "epoch": 
0.6046304541406946, + "ewc_loss": 0.030725529417395592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5362764315796085e-05, + "grad_norm": 23.81037712097168, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8459177017211914, + "num_tokens": 181457002.0, + "step": 4753 + }, + { + "epoch": 0.604757664419285, + "ewc_loss": 0.03064224123954773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5321120372391306e-05, + "grad_norm": 23.708885192871094, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8517624139785767, + "num_tokens": 181494410.0, + "step": 4754 + }, + { + "epoch": 0.6048848746978756, + "ewc_loss": 0.030685674399137497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.534283728688024e-05, + "grad_norm": 23.76959800720215, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8531942963600159, + "num_tokens": 181534595.0, + "step": 4755 + }, + { + "epoch": 0.6050120849764661, + "ewc_loss": 0.03070615604519844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5353078197222203e-05, + "grad_norm": 23.728239059448242, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8618717789649963, + "num_tokens": 181570654.0, + "step": 4756 + }, + { + "epoch": 0.6051392952550566, + "ewc_loss": 0.030647795647382736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.532389796921052e-05, + "grad_norm": 23.905506134033203, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8500403165817261, + "num_tokens": 181608366.0, + "step": 4757 + }, + { + "epoch": 0.6052665055336471, + "ewc_loss": 0.03076382912695408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5381914636236615e-05, + "grad_norm": 23.666851043701172, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8433193564414978, + "num_tokens": 181647168.0, + "step": 4758 + }, + { + "epoch": 0.6053937158122377, + "ewc_loss": 0.03065459616482258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5327297660405748e-05, + "grad_norm": 23.88243865966797, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8589780926704407, + "num_tokens": 181689177.0, + "step": 4759 + }, + { + "epoch": 0.6055209260908282, + "ewc_loss": 0.030718790367245674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5359395547420718e-05, + "grad_norm": 23.712121963500977, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8603368997573853, + "num_tokens": 181723742.0, + "step": 4760 + }, + { + "epoch": 0.6056481363694186, + "ewc_loss": 0.030639398843050003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5319699741667137e-05, + "grad_norm": 23.717655181884766, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8649383783340454, + "num_tokens": 181761539.0, + "step": 4761 + }, + { + "epoch": 0.6057753466480091, + "ewc_loss": 0.03079552762210369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.539776349090971e-05, + "grad_norm": 23.808473587036133, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8547090291976929, + "num_tokens": 181799674.0, + "step": 4762 + }, + { + "epoch": 0.6059025569265997, + "ewc_loss": 0.030691998079419136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5345998690463603e-05, + "grad_norm": 23.715835571289062, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8440937995910645, + "num_tokens": 181838784.0, + "step": 4763 + }, + { + "epoch": 0.6060297672051902, + "ewc_loss": 0.03076687827706337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5383438949356787e-05, + "grad_norm": 23.820817947387695, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8516618013381958, + "num_tokens": 181875669.0, + "step": 4764 + }, + { + "epoch": 0.6061569774837807, + "ewc_loss": 0.030769307166337967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5384654034278356e-05, + "grad_norm": 23.747220993041992, + "learning_rate": 1e-06, + "loss": 0.5138, + 
"mean_token_accuracy": 0.8386988639831543, + "num_tokens": 181912450.0, + "step": 4765 + }, + { + "epoch": 0.6062841877623713, + "ewc_loss": 0.03073861449956894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.536930722068064e-05, + "grad_norm": 23.780651092529297, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8563375473022461, + "num_tokens": 181947691.0, + "step": 4766 + }, + { + "epoch": 0.6064113980409617, + "ewc_loss": 0.03078722208738327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5393610738101415e-05, + "grad_norm": 23.84657096862793, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8604981899261475, + "num_tokens": 181982364.0, + "step": 4767 + }, + { + "epoch": 0.6065386083195522, + "ewc_loss": 0.030764130875468254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.538206561235711e-05, + "grad_norm": 23.78881072998047, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8557631969451904, + "num_tokens": 182017071.0, + "step": 4768 + }, + { + "epoch": 0.6066658185981427, + "ewc_loss": 0.030771370977163315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5385685401270166e-05, + "grad_norm": 23.923768997192383, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8553215265274048, + "num_tokens": 182057234.0, + "step": 4769 + }, + { + "epoch": 0.6067930288767333, + "ewc_loss": 0.03074600361287594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.537300158815924e-05, + "grad_norm": 23.758724212646484, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8664658069610596, + "num_tokens": 182092450.0, + "step": 4770 + }, + { + "epoch": 0.6069202391553238, + "ewc_loss": 0.030763020738959312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5381510820589028e-05, + "grad_norm": 23.9937801361084, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8485895395278931, + "num_tokens": 182126398.0, + "step": 4771 + }, + { + "epoch": 
0.6070474494339143, + "ewc_loss": 0.03074074350297451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5370371329481713e-05, + "grad_norm": 23.682662963867188, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8695158958435059, + "num_tokens": 182160848.0, + "step": 4772 + }, + { + "epoch": 0.6071746597125047, + "ewc_loss": 0.030755287036299706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.537764364911709e-05, + "grad_norm": 23.786623001098633, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8644120693206787, + "num_tokens": 182202596.0, + "step": 4773 + }, + { + "epoch": 0.6073018699910953, + "ewc_loss": 0.030832571908831596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5416286260006018e-05, + "grad_norm": 23.772905349731445, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8547238707542419, + "num_tokens": 182239235.0, + "step": 4774 + }, + { + "epoch": 0.6074290802696858, + "ewc_loss": 0.03075251169502735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5376255760202184e-05, + "grad_norm": 23.819931030273438, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8680213689804077, + "num_tokens": 182273662.0, + "step": 4775 + }, + { + "epoch": 0.6075562905482763, + "ewc_loss": 0.030818501487374306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5409250408993103e-05, + "grad_norm": 23.759342193603516, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8408982753753662, + "num_tokens": 182311991.0, + "step": 4776 + }, + { + "epoch": 0.6076835008268668, + "ewc_loss": 0.030781256034970284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.53906275954796e-05, + "grad_norm": 23.84480094909668, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8575747013092041, + "num_tokens": 182350722.0, + "step": 4777 + }, + { + "epoch": 0.6078107111054574, + "ewc_loss": 0.030866798013448715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5433399312314577e-05, + "grad_norm": 23.67959213256836, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8586714267730713, + "num_tokens": 182386755.0, + "step": 4778 + }, + { + "epoch": 0.6079379213840478, + "ewc_loss": 0.03078053891658783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.53902692545671e-05, + "grad_norm": 23.84141731262207, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8724658489227295, + "num_tokens": 182423128.0, + "step": 4779 + }, + { + "epoch": 0.6080651316626383, + "ewc_loss": 0.03093845583498478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.546922794659622e-05, + "grad_norm": 23.9422607421875, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8423342704772949, + "num_tokens": 182461518.0, + "step": 4780 + }, + { + "epoch": 0.6081923419412288, + "ewc_loss": 0.030801162123680115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5400581105495803e-05, + "grad_norm": 23.787334442138672, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8553354740142822, + "num_tokens": 182495707.0, + "step": 4781 + }, + { + "epoch": 0.6083195522198194, + "ewc_loss": 0.030849631875753403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5424815501319245e-05, + "grad_norm": 23.922094345092773, + "learning_rate": 1e-06, + "loss": 0.4907, + "mean_token_accuracy": 0.845862865447998, + "num_tokens": 182539058.0, + "step": 4782 + }, + { + "epoch": 0.6084467624984099, + "ewc_loss": 0.030832478776574135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5416238966281526e-05, + "grad_norm": 23.873289108276367, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8423184752464294, + "num_tokens": 182572482.0, + "step": 4783 + }, + { + "epoch": 0.6085739727770004, + "ewc_loss": 0.030764561146497726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5382280253106728e-05, + "grad_norm": 23.759212493896484, + "learning_rate": 1e-06, + "loss": 0.4422, + 
"mean_token_accuracy": 0.8601049184799194, + "num_tokens": 182607665.0, + "step": 4784 + }, + { + "epoch": 0.6087011830555908, + "ewc_loss": 0.03089243918657303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.544621954963077e-05, + "grad_norm": 23.894527435302734, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.858122706413269, + "num_tokens": 182650503.0, + "step": 4785 + }, + { + "epoch": 0.6088283933341814, + "ewc_loss": 0.030830273404717445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5415136658702977e-05, + "grad_norm": 23.797584533691406, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8673731088638306, + "num_tokens": 182681662.0, + "step": 4786 + }, + { + "epoch": 0.6089556036127719, + "ewc_loss": 0.030804570764303207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5402285498566926e-05, + "grad_norm": 23.798675537109375, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8430790901184082, + "num_tokens": 182714912.0, + "step": 4787 + }, + { + "epoch": 0.6090828138913624, + "ewc_loss": 0.030904939398169518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5452469597221352e-05, + "grad_norm": 23.927610397338867, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8524420261383057, + "num_tokens": 182748813.0, + "step": 4788 + }, + { + "epoch": 0.609210024169953, + "ewc_loss": 0.030874798074364662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5437399270012975e-05, + "grad_norm": 23.803668975830078, + "learning_rate": 1e-06, + "loss": 0.5134, + "mean_token_accuracy": 0.8405290842056274, + "num_tokens": 182786123.0, + "step": 4789 + }, + { + "epoch": 0.6093372344485435, + "ewc_loss": 0.03084401786327362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5422008800669573e-05, + "grad_norm": 23.759061813354492, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8687238693237305, + "num_tokens": 182820550.0, + "step": 4790 + }, + { + "epoch": 
0.6094644447271339, + "ewc_loss": 0.030933380126953125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5466690456378274e-05, + "grad_norm": 23.810142517089844, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8574995398521423, + "num_tokens": 182863135.0, + "step": 4791 + }, + { + "epoch": 0.6095916550057244, + "ewc_loss": 0.03092142939567566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5460715076187626e-05, + "grad_norm": 23.81153678894043, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8583804368972778, + "num_tokens": 182905193.0, + "step": 4792 + }, + { + "epoch": 0.609718865284315, + "ewc_loss": 0.030913343653082848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5456671462743543e-05, + "grad_norm": 23.78605842590332, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8559662103652954, + "num_tokens": 182944934.0, + "step": 4793 + }, + { + "epoch": 0.6098460755629055, + "ewc_loss": 0.03086564876139164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5432824511663057e-05, + "grad_norm": 23.799116134643555, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8428787589073181, + "num_tokens": 182984604.0, + "step": 4794 + }, + { + "epoch": 0.609973285841496, + "ewc_loss": 0.030946219339966774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5473109669983387e-05, + "grad_norm": 23.722700119018555, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8538612127304077, + "num_tokens": 183020421.0, + "step": 4795 + }, + { + "epoch": 0.6101004961200865, + "ewc_loss": 0.030939768999814987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.54698846017709e-05, + "grad_norm": 23.849416732788086, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8480699062347412, + "num_tokens": 183057635.0, + "step": 4796 + }, + { + "epoch": 0.610227706398677, + "ewc_loss": 0.030959906056523323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5479952708119527e-05, + "grad_norm": 23.805925369262695, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.843469500541687, + "num_tokens": 183094689.0, + "step": 4797 + }, + { + "epoch": 0.6103549166772675, + "ewc_loss": 0.0308841485530138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5442074072780088e-05, + "grad_norm": 23.811864852905273, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8625339269638062, + "num_tokens": 183129118.0, + "step": 4798 + }, + { + "epoch": 0.610482126955858, + "ewc_loss": 0.030955856665968895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.547792817291338e-05, + "grad_norm": 23.813720703125, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8533563017845154, + "num_tokens": 183163365.0, + "step": 4799 + }, + { + "epoch": 0.6106093372344485, + "ewc_loss": 0.030914776027202606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.545738814456854e-05, + "grad_norm": 23.799169540405273, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8601779937744141, + "num_tokens": 183199271.0, + "step": 4800 + }, + { + "epoch": 0.6107365475130391, + "ewc_loss": 0.03090745583176613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5453728337888606e-05, + "grad_norm": 23.752408981323242, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8759514093399048, + "num_tokens": 183234523.0, + "step": 4801 + }, + { + "epoch": 0.6108637577916296, + "ewc_loss": 0.030931536108255386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5465768228750676e-05, + "grad_norm": 23.7838077545166, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8569126129150391, + "num_tokens": 183267098.0, + "step": 4802 + }, + { + "epoch": 0.61099096807022, + "ewc_loss": 0.03101094253361225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.550547131046187e-05, + "grad_norm": 23.811803817749023, + "learning_rate": 1e-06, + "loss": 0.4695, + 
"mean_token_accuracy": 0.8563886880874634, + "num_tokens": 183306825.0, + "step": 4803 + }, + { + "epoch": 0.6111181783488105, + "ewc_loss": 0.030972681939601898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.548634099890478e-05, + "grad_norm": 23.80918312072754, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8560612201690674, + "num_tokens": 183348712.0, + "step": 4804 + }, + { + "epoch": 0.6112453886274011, + "ewc_loss": 0.030995212495326996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.549760600028094e-05, + "grad_norm": 23.780048370361328, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8687665462493896, + "num_tokens": 183383797.0, + "step": 4805 + }, + { + "epoch": 0.6113725989059916, + "ewc_loss": 0.03093370422720909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.546685234643519e-05, + "grad_norm": 23.70372772216797, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8680733442306519, + "num_tokens": 183417484.0, + "step": 4806 + }, + { + "epoch": 0.6114998091845821, + "ewc_loss": 0.030943218618631363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.547160900372546e-05, + "grad_norm": 23.716205596923828, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8628696203231812, + "num_tokens": 183453808.0, + "step": 4807 + }, + { + "epoch": 0.6116270194631727, + "ewc_loss": 0.03101656399667263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.550828164909035e-05, + "grad_norm": 23.773883819580078, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8635939359664917, + "num_tokens": 183496693.0, + "step": 4808 + }, + { + "epoch": 0.6117542297417632, + "ewc_loss": 0.030977025628089905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5488512872252613e-05, + "grad_norm": 23.77338409423828, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8595080375671387, + "num_tokens": 183536727.0, + "step": 4809 + }, + { + "epoch": 
0.6118814400203536, + "ewc_loss": 0.03098306432366371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.54915323946625e-05, + "grad_norm": 23.754621505737305, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8580471873283386, + "num_tokens": 183571809.0, + "step": 4810 + }, + { + "epoch": 0.6120086502989441, + "ewc_loss": 0.030957195907831192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5478597561013885e-05, + "grad_norm": 23.745853424072266, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8552441596984863, + "num_tokens": 183610761.0, + "step": 4811 + }, + { + "epoch": 0.6121358605775347, + "ewc_loss": 0.03098417818546295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5492089005419984e-05, + "grad_norm": 23.776031494140625, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8555482029914856, + "num_tokens": 183648304.0, + "step": 4812 + }, + { + "epoch": 0.6122630708561252, + "ewc_loss": 0.031007962301373482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5503981558140367e-05, + "grad_norm": 23.84618377685547, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8514008522033691, + "num_tokens": 183684201.0, + "step": 4813 + }, + { + "epoch": 0.6123902811347157, + "ewc_loss": 0.030972223728895187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5486111806239933e-05, + "grad_norm": 23.712203979492188, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8498595952987671, + "num_tokens": 183726595.0, + "step": 4814 + }, + { + "epoch": 0.6125174914133062, + "ewc_loss": 0.03093203529715538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5466017430298962e-05, + "grad_norm": 23.817642211914062, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.865302324295044, + "num_tokens": 183762222.0, + "step": 4815 + }, + { + "epoch": 0.6126447016918967, + "ewc_loss": 0.031008053570985794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5504027032875456e-05, + "grad_norm": 23.729190826416016, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.870303213596344, + "num_tokens": 183797167.0, + "step": 4816 + }, + { + "epoch": 0.6127719119704872, + "ewc_loss": 0.030938947573304176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.54694735101657e-05, + "grad_norm": 23.81224822998047, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8555757403373718, + "num_tokens": 183839591.0, + "step": 4817 + }, + { + "epoch": 0.6128991222490777, + "ewc_loss": 0.03095286153256893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5476431144634262e-05, + "grad_norm": 23.772279739379883, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8548619747161865, + "num_tokens": 183877862.0, + "step": 4818 + }, + { + "epoch": 0.6130263325276682, + "ewc_loss": 0.03094135969877243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.547067950014025e-05, + "grad_norm": 23.846940994262695, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8614961504936218, + "num_tokens": 183915964.0, + "step": 4819 + }, + { + "epoch": 0.6131535428062588, + "ewc_loss": 0.030950384214520454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5475192412850447e-05, + "grad_norm": 23.804737091064453, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8533948659896851, + "num_tokens": 183952823.0, + "step": 4820 + }, + { + "epoch": 0.6132807530848493, + "ewc_loss": 0.03088853880763054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5444269592990167e-05, + "grad_norm": 23.760976791381836, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8467863202095032, + "num_tokens": 183991650.0, + "step": 4821 + }, + { + "epoch": 0.6134079633634397, + "ewc_loss": 0.030940188094973564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5470093785552308e-05, + "grad_norm": 23.808364868164062, + "learning_rate": 1e-06, + "loss": 0.4457, + 
"mean_token_accuracy": 0.858586311340332, + "num_tokens": 184027768.0, + "step": 4822 + }, + { + "epoch": 0.6135351736420303, + "ewc_loss": 0.0309009850025177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5450492355739698e-05, + "grad_norm": 23.731962203979492, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8677446246147156, + "num_tokens": 184063754.0, + "step": 4823 + }, + { + "epoch": 0.6136623839206208, + "ewc_loss": 0.03096199408173561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5480996808037162e-05, + "grad_norm": 23.78792381286621, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8503943681716919, + "num_tokens": 184105286.0, + "step": 4824 + }, + { + "epoch": 0.6137895941992113, + "ewc_loss": 0.030945848673582077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5472924133064225e-05, + "grad_norm": 23.85194969177246, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8609205484390259, + "num_tokens": 184139312.0, + "step": 4825 + }, + { + "epoch": 0.6139168044778018, + "ewc_loss": 0.030897993594408035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5448997146449983e-05, + "grad_norm": 23.71949577331543, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8564692735671997, + "num_tokens": 184181541.0, + "step": 4826 + }, + { + "epoch": 0.6140440147563924, + "ewc_loss": 0.03092750534415245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5463752788491547e-05, + "grad_norm": 23.922178268432617, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8559998273849487, + "num_tokens": 184216857.0, + "step": 4827 + }, + { + "epoch": 0.6141712250349828, + "ewc_loss": 0.030968399718403816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5484200048376806e-05, + "grad_norm": 23.860063552856445, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8596531748771667, + "num_tokens": 184253318.0, + "step": 4828 + }, + { + "epoch": 
0.6142984353135733, + "ewc_loss": 0.030901402235031128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5450701539521106e-05, + "grad_norm": 23.791757583618164, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8677316308021545, + "num_tokens": 184298625.0, + "step": 4829 + }, + { + "epoch": 0.6144256455921638, + "ewc_loss": 0.03093481808900833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5467408957192674e-05, + "grad_norm": 23.9188289642334, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.854602038860321, + "num_tokens": 184334569.0, + "step": 4830 + }, + { + "epoch": 0.6145528558707544, + "ewc_loss": 0.030899878591299057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.544993938296102e-05, + "grad_norm": 23.808319091796875, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8393838405609131, + "num_tokens": 184377477.0, + "step": 4831 + }, + { + "epoch": 0.6146800661493449, + "ewc_loss": 0.030882755294442177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.544137739983853e-05, + "grad_norm": 23.82052230834961, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.84395432472229, + "num_tokens": 184416671.0, + "step": 4832 + }, + { + "epoch": 0.6148072764279354, + "ewc_loss": 0.03091329149901867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5456645996891893e-05, + "grad_norm": 23.819406509399414, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8510964512825012, + "num_tokens": 184453174.0, + "step": 4833 + }, + { + "epoch": 0.6149344867065258, + "ewc_loss": 0.03089756891131401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5448784324689768e-05, + "grad_norm": 23.807037353515625, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8633372783660889, + "num_tokens": 184488313.0, + "step": 4834 + }, + { + "epoch": 0.6150616969851164, + "ewc_loss": 0.030999042093753815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5499521396122873e-05, + "grad_norm": 23.858619689941406, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8375498056411743, + "num_tokens": 184523733.0, + "step": 4835 + }, + { + "epoch": 0.6151889072637069, + "ewc_loss": 0.030926326289772987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5463163435924798e-05, + "grad_norm": 23.885751724243164, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8537552356719971, + "num_tokens": 184568399.0, + "step": 4836 + }, + { + "epoch": 0.6153161175422974, + "ewc_loss": 0.030987713485956192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.549385706312023e-05, + "grad_norm": 23.80692481994629, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8556982278823853, + "num_tokens": 184603302.0, + "step": 4837 + }, + { + "epoch": 0.615443327820888, + "ewc_loss": 0.030990147963166237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5495073967031203e-05, + "grad_norm": 23.871137619018555, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8755119442939758, + "num_tokens": 184637346.0, + "step": 4838 + }, + { + "epoch": 0.6155705380994785, + "ewc_loss": 0.03098287247121334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.549143598822411e-05, + "grad_norm": 23.896041870117188, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.86228346824646, + "num_tokens": 184671687.0, + "step": 4839 + }, + { + "epoch": 0.6156977483780689, + "ewc_loss": 0.030960075557231903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5480038200621493e-05, + "grad_norm": 23.857881546020508, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8655581474304199, + "num_tokens": 184706867.0, + "step": 4840 + }, + { + "epoch": 0.6158249586566594, + "ewc_loss": 0.030961383134126663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5480691217817366e-05, + "grad_norm": 23.79141616821289, + "learning_rate": 1e-06, + "loss": 0.4694, + 
"mean_token_accuracy": 0.8535308837890625, + "num_tokens": 184743916.0, + "step": 4841 + }, + { + "epoch": 0.61595216893525, + "ewc_loss": 0.030970754101872444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.54853769345209e-05, + "grad_norm": 23.87197494506836, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8647468090057373, + "num_tokens": 184783247.0, + "step": 4842 + }, + { + "epoch": 0.6160793792138405, + "ewc_loss": 0.031012794002890587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5506397176068276e-05, + "grad_norm": 23.876266479492188, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8566190600395203, + "num_tokens": 184826501.0, + "step": 4843 + }, + { + "epoch": 0.616206589492431, + "ewc_loss": 0.03092050552368164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5460253052879125e-05, + "grad_norm": 23.726778030395508, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8550131320953369, + "num_tokens": 184868743.0, + "step": 4844 + }, + { + "epoch": 0.6163337997710215, + "ewc_loss": 0.03102492354810238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5512461686739698e-05, + "grad_norm": 23.940553665161133, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8535726070404053, + "num_tokens": 184908992.0, + "step": 4845 + }, + { + "epoch": 0.616461010049612, + "ewc_loss": 0.030979910865426064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5489955330849625e-05, + "grad_norm": 23.84833335876465, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8449472188949585, + "num_tokens": 184942456.0, + "step": 4846 + }, + { + "epoch": 0.6165882203282025, + "ewc_loss": 0.030922386795282364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5461193470400758e-05, + "grad_norm": 23.85470199584961, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8457062244415283, + "num_tokens": 184975834.0, + "step": 4847 + }, + { + "epoch": 
0.616715430606793, + "ewc_loss": 0.031044790521264076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5522395187872462e-05, + "grad_norm": 23.932086944580078, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8790360689163208, + "num_tokens": 185012295.0, + "step": 4848 + }, + { + "epoch": 0.6168426408853835, + "ewc_loss": 0.031023340299725533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5511670426349156e-05, + "grad_norm": 23.92193031311035, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8606958985328674, + "num_tokens": 185049272.0, + "step": 4849 + }, + { + "epoch": 0.6169698511639741, + "ewc_loss": 0.030946265906095505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5473133316845633e-05, + "grad_norm": 23.84349822998047, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8683507442474365, + "num_tokens": 185081054.0, + "step": 4850 + }, + { + "epoch": 0.6170970614425646, + "ewc_loss": 0.030955955386161804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5477977285627276e-05, + "grad_norm": 24.001245498657227, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8520196080207825, + "num_tokens": 185119809.0, + "step": 4851 + }, + { + "epoch": 0.617224271721155, + "ewc_loss": 0.030946258455514908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5473129678866826e-05, + "grad_norm": 23.791763305664062, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8676378726959229, + "num_tokens": 185161462.0, + "step": 4852 + }, + { + "epoch": 0.6173514819997455, + "ewc_loss": 0.030965056270360947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5482528397114947e-05, + "grad_norm": 24.02321434020996, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8481028079986572, + "num_tokens": 185196238.0, + "step": 4853 + }, + { + "epoch": 0.6174786922783361, + "ewc_loss": 0.031003128737211227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5501564121223055e-05, + "grad_norm": 23.964515686035156, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8551048636436462, + "num_tokens": 185239317.0, + "step": 4854 + }, + { + "epoch": 0.6176059025569266, + "ewc_loss": 0.03091459348797798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5457297195098363e-05, + "grad_norm": 23.866262435913086, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8659101724624634, + "num_tokens": 185273555.0, + "step": 4855 + }, + { + "epoch": 0.6177331128355171, + "ewc_loss": 0.030976945534348488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5488472854485735e-05, + "grad_norm": 23.967546463012695, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8644335269927979, + "num_tokens": 185309721.0, + "step": 4856 + }, + { + "epoch": 0.6178603231141077, + "ewc_loss": 0.030949518084526062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5474759493372403e-05, + "grad_norm": 23.916427612304688, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.848407506942749, + "num_tokens": 185350846.0, + "step": 4857 + }, + { + "epoch": 0.6179875333926982, + "ewc_loss": 0.030897535383701324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5448767953785136e-05, + "grad_norm": 23.887887954711914, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8492674827575684, + "num_tokens": 185392472.0, + "step": 4858 + }, + { + "epoch": 0.6181147436712886, + "ewc_loss": 0.0309324711561203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5466235709027387e-05, + "grad_norm": 23.9482479095459, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8537673950195312, + "num_tokens": 185438250.0, + "step": 4859 + }, + { + "epoch": 0.6182419539498791, + "ewc_loss": 0.030953390523791313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5476694898097776e-05, + "grad_norm": 23.801694869995117, + "learning_rate": 1e-06, + "loss": 0.4739, + 
"mean_token_accuracy": 0.8536101579666138, + "num_tokens": 185479455.0, + "step": 4860 + }, + { + "epoch": 0.6183691642284697, + "ewc_loss": 0.03092990629374981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5464953321497887e-05, + "grad_norm": 24.016918182373047, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8530703186988831, + "num_tokens": 185518719.0, + "step": 4861 + }, + { + "epoch": 0.6184963745070602, + "ewc_loss": 0.030976658686995506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5488329154322855e-05, + "grad_norm": 23.897125244140625, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8418878316879272, + "num_tokens": 185556671.0, + "step": 4862 + }, + { + "epoch": 0.6186235847856507, + "ewc_loss": 0.030891042202711105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5445521057699807e-05, + "grad_norm": 23.959592819213867, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8536820411682129, + "num_tokens": 185595063.0, + "step": 4863 + }, + { + "epoch": 0.6187507950642412, + "ewc_loss": 0.03094550222158432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.547275132907089e-05, + "grad_norm": 23.80840301513672, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8468519449234009, + "num_tokens": 185635626.0, + "step": 4864 + }, + { + "epoch": 0.6188780053428317, + "ewc_loss": 0.030896339565515518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.544816950627137e-05, + "grad_norm": 23.92496681213379, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8448549509048462, + "num_tokens": 185671687.0, + "step": 4865 + }, + { + "epoch": 0.6190052156214222, + "ewc_loss": 0.03097335807979107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.548667933093384e-05, + "grad_norm": 23.851688385009766, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.858818769454956, + "num_tokens": 185706459.0, + "step": 4866 + }, + { + "epoch": 
0.6191324259000127, + "ewc_loss": 0.03095158189535141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5475790860364214e-05, + "grad_norm": 23.838306427001953, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8748784065246582, + "num_tokens": 185743162.0, + "step": 4867 + }, + { + "epoch": 0.6192596361786032, + "ewc_loss": 0.030964231118559837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5482115486520343e-05, + "grad_norm": 23.893430709838867, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.8406549692153931, + "num_tokens": 185784036.0, + "step": 4868 + }, + { + "epoch": 0.6193868464571938, + "ewc_loss": 0.03094971552491188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5474857718800195e-05, + "grad_norm": 23.85505485534668, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8535993099212646, + "num_tokens": 185821407.0, + "step": 4869 + }, + { + "epoch": 0.6195140567357843, + "ewc_loss": 0.030914489179849625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.545724444440566e-05, + "grad_norm": 23.783832550048828, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8607332110404968, + "num_tokens": 185857789.0, + "step": 4870 + }, + { + "epoch": 0.6196412670143747, + "ewc_loss": 0.03103497065603733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5517485735472292e-05, + "grad_norm": 23.969411849975586, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8668527603149414, + "num_tokens": 185896989.0, + "step": 4871 + }, + { + "epoch": 0.6197684772929652, + "ewc_loss": 0.03105274587869644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5526373317698017e-05, + "grad_norm": 23.91020393371582, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8709989786148071, + "num_tokens": 185938123.0, + "step": 4872 + }, + { + "epoch": 0.6198956875715558, + "ewc_loss": 0.030928196385502815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.546409839647822e-05, + "grad_norm": 24.011011123657227, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8557142019271851, + "num_tokens": 185973440.0, + "step": 4873 + }, + { + "epoch": 0.6200228978501463, + "ewc_loss": 0.030996907502412796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.549845364934299e-05, + "grad_norm": 23.837488174438477, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8584247827529907, + "num_tokens": 186010069.0, + "step": 4874 + }, + { + "epoch": 0.6201501081287368, + "ewc_loss": 0.030894974246621132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.544748738524504e-05, + "grad_norm": 24.065221786499023, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8500815629959106, + "num_tokens": 186048365.0, + "step": 4875 + }, + { + "epoch": 0.6202773184073274, + "ewc_loss": 0.031059324741363525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5529662050539628e-05, + "grad_norm": 23.904565811157227, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8623022437095642, + "num_tokens": 186085358.0, + "step": 4876 + }, + { + "epoch": 0.6204045286859178, + "ewc_loss": 0.03091897815465927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5459489077329636e-05, + "grad_norm": 24.04808807373047, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8606116771697998, + "num_tokens": 186118476.0, + "step": 4877 + }, + { + "epoch": 0.6205317389645083, + "ewc_loss": 0.031020259484648705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5510129742324352e-05, + "grad_norm": 23.901113510131836, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8530957102775574, + "num_tokens": 186155487.0, + "step": 4878 + }, + { + "epoch": 0.6206589492430988, + "ewc_loss": 0.030935822054743767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5467910998268053e-05, + "grad_norm": 24.00575828552246, + "learning_rate": 1e-06, + "loss": 0.4341, + 
"mean_token_accuracy": 0.8622699975967407, + "num_tokens": 186192941.0, + "step": 4879 + }, + { + "epoch": 0.6207861595216894, + "ewc_loss": 0.030989712104201317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5494855688302778e-05, + "grad_norm": 23.874448776245117, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8591663837432861, + "num_tokens": 186233860.0, + "step": 4880 + }, + { + "epoch": 0.6209133698002799, + "ewc_loss": 0.030912058427929878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.545602935948409e-05, + "grad_norm": 23.846328735351562, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8453313112258911, + "num_tokens": 186273081.0, + "step": 4881 + }, + { + "epoch": 0.6210405800788704, + "ewc_loss": 0.030984602868556976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.54923018271802e-05, + "grad_norm": 23.85198211669922, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8618607521057129, + "num_tokens": 186310596.0, + "step": 4882 + }, + { + "epoch": 0.6211677903574608, + "ewc_loss": 0.031000396236777306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5500198060180992e-05, + "grad_norm": 23.827531814575195, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8525910377502441, + "num_tokens": 186346262.0, + "step": 4883 + }, + { + "epoch": 0.6212950006360514, + "ewc_loss": 0.031023697927594185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5511848687310703e-05, + "grad_norm": 23.902009963989258, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8595490455627441, + "num_tokens": 186386508.0, + "step": 4884 + }, + { + "epoch": 0.6214222109146419, + "ewc_loss": 0.031029358506202698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.551467903482262e-05, + "grad_norm": 24.048662185668945, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8521875143051147, + "num_tokens": 186425184.0, + "step": 4885 + }, + { + "epoch": 
0.6215494211932324, + "ewc_loss": 0.031088413670659065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.554420668981038e-05, + "grad_norm": 24.082624435424805, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8548349738121033, + "num_tokens": 186463072.0, + "step": 4886 + }, + { + "epoch": 0.621676631471823, + "ewc_loss": 0.030954917892813683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5477458873647265e-05, + "grad_norm": 23.88322639465332, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8674004077911377, + "num_tokens": 186504062.0, + "step": 4887 + }, + { + "epoch": 0.6218038417504135, + "ewc_loss": 0.030994433909654617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.549721673654858e-05, + "grad_norm": 24.128721237182617, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8470357656478882, + "num_tokens": 186544564.0, + "step": 4888 + }, + { + "epoch": 0.6219310520290039, + "ewc_loss": 0.030978091061115265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5489045836147852e-05, + "grad_norm": 23.932884216308594, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8429896831512451, + "num_tokens": 186587196.0, + "step": 4889 + }, + { + "epoch": 0.6220582623075944, + "ewc_loss": 0.030882660299539566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5441330106114037e-05, + "grad_norm": 23.91434097290039, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8570408821105957, + "num_tokens": 186620673.0, + "step": 4890 + }, + { + "epoch": 0.622185472586185, + "ewc_loss": 0.030991116538643837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5495557818212546e-05, + "grad_norm": 24.075332641601562, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8588576316833496, + "num_tokens": 186654028.0, + "step": 4891 + }, + { + "epoch": 0.6223126828647755, + "ewc_loss": 0.03096063993871212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5480320143979043e-05, + "grad_norm": 23.812137603759766, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.854038417339325, + "num_tokens": 186693395.0, + "step": 4892 + }, + { + "epoch": 0.622439893143366, + "ewc_loss": 0.03095315769314766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.547657848277595e-05, + "grad_norm": 24.069080352783203, + "learning_rate": 1e-06, + "loss": 0.5185, + "mean_token_accuracy": 0.8390786051750183, + "num_tokens": 186734440.0, + "step": 4893 + }, + { + "epoch": 0.6225671034219565, + "ewc_loss": 0.03097144141793251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.548572072351817e-05, + "grad_norm": 23.945287704467773, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8587924242019653, + "num_tokens": 186769637.0, + "step": 4894 + }, + { + "epoch": 0.622694313700547, + "ewc_loss": 0.03089098446071148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.544549195386935e-05, + "grad_norm": 24.00619888305664, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8519362211227417, + "num_tokens": 186810188.0, + "step": 4895 + }, + { + "epoch": 0.6228215239791375, + "ewc_loss": 0.03098447248339653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.549223634356167e-05, + "grad_norm": 23.972034454345703, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.846676230430603, + "num_tokens": 186847512.0, + "step": 4896 + }, + { + "epoch": 0.622948734257728, + "ewc_loss": 0.030962590128183365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5481295122299343e-05, + "grad_norm": 23.861034393310547, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8660434484481812, + "num_tokens": 186878721.0, + "step": 4897 + }, + { + "epoch": 0.6230759445363185, + "ewc_loss": 0.030921658501029015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.546082967252005e-05, + "grad_norm": 23.927734375, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 
0.8614075779914856, + "num_tokens": 186920239.0, + "step": 4898 + }, + { + "epoch": 0.6232031548149091, + "ewc_loss": 0.031015640124678612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.550781962578185e-05, + "grad_norm": 23.883140563964844, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.849014937877655, + "num_tokens": 186964158.0, + "step": 4899 + }, + { + "epoch": 0.6233303650934996, + "ewc_loss": 0.030945127829909325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5472563973162323e-05, + "grad_norm": 23.997058868408203, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.872035026550293, + "num_tokens": 187003176.0, + "step": 4900 + }, + { + "epoch": 0.62345757537209, + "ewc_loss": 0.03104853816330433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5524268746958114e-05, + "grad_norm": 23.83965492248535, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8551722764968872, + "num_tokens": 187046152.0, + "step": 4901 + }, + { + "epoch": 0.6235847856506805, + "ewc_loss": 0.030984243378043175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5492121747229248e-05, + "grad_norm": 24.0504207611084, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8681339025497437, + "num_tokens": 187086629.0, + "step": 4902 + }, + { + "epoch": 0.6237119959292711, + "ewc_loss": 0.03103267215192318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.551633613416925e-05, + "grad_norm": 23.855342864990234, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.851448118686676, + "num_tokens": 187122856.0, + "step": 4903 + }, + { + "epoch": 0.6238392062078616, + "ewc_loss": 0.030961070209741592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.548053478472866e-05, + "grad_norm": 24.030317306518555, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8711941242218018, + "num_tokens": 187161568.0, + "step": 4904 + }, + { + "epoch": 0.6239664164864521, + "ewc_loss": 
0.031070368364453316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5535184502368793e-05, + "grad_norm": 23.78663444519043, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8564755916595459, + "num_tokens": 187202831.0, + "step": 4905 + }, + { + "epoch": 0.6240936267650427, + "ewc_loss": 0.030954986810684204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5477493434445933e-05, + "grad_norm": 23.9740047454834, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8640637993812561, + "num_tokens": 187239059.0, + "step": 4906 + }, + { + "epoch": 0.6242208370436332, + "ewc_loss": 0.031038401648402214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5519201042479835e-05, + "grad_norm": 23.905426025390625, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8513907194137573, + "num_tokens": 187272940.0, + "step": 4907 + }, + { + "epoch": 0.6243480473222236, + "ewc_loss": 0.030990323051810265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5495161278522573e-05, + "grad_norm": 23.917367935180664, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8692847490310669, + "num_tokens": 187310963.0, + "step": 4908 + }, + { + "epoch": 0.6244752576008141, + "ewc_loss": 0.031047005206346512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.552350295241922e-05, + "grad_norm": 23.88381576538086, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8757061958312988, + "num_tokens": 187345698.0, + "step": 4909 + }, + { + "epoch": 0.6246024678794047, + "ewc_loss": 0.031076328828930855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.55381640070118e-05, + "grad_norm": 23.964345932006836, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8554545640945435, + "num_tokens": 187384762.0, + "step": 4910 + }, + { + "epoch": 0.6247296781579952, + "ewc_loss": 0.031043652445077896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5521825844189152e-05, + "grad_norm": 
23.961687088012695, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.845679521560669, + "num_tokens": 187424288.0, + "step": 4911 + }, + { + "epoch": 0.6248568884365857, + "ewc_loss": 0.03106345236301422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5531726603512652e-05, + "grad_norm": 23.999147415161133, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8658263087272644, + "num_tokens": 187461481.0, + "step": 4912 + }, + { + "epoch": 0.6249840987151762, + "ewc_loss": 0.0310093741863966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5504687326028943e-05, + "grad_norm": 23.858545303344727, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8724803924560547, + "num_tokens": 187497690.0, + "step": 4913 + }, + { + "epoch": 0.6251113089937667, + "ewc_loss": 0.031062103807926178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5531051758443937e-05, + "grad_norm": 24.011716842651367, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8504576086997986, + "num_tokens": 187537825.0, + "step": 4914 + }, + { + "epoch": 0.6252385192723572, + "ewc_loss": 0.03097992204129696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5489960787817836e-05, + "grad_norm": 23.73383903503418, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8671870827674866, + "num_tokens": 187583998.0, + "step": 4915 + }, + { + "epoch": 0.6253657295509477, + "ewc_loss": 0.031035054475069046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5517527572228573e-05, + "grad_norm": 23.945697784423828, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8561648726463318, + "num_tokens": 187621962.0, + "step": 4916 + }, + { + "epoch": 0.6254929398295382, + "ewc_loss": 0.031091364100575447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5545681890216656e-05, + "grad_norm": 23.835494995117188, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8628242611885071, + 
"num_tokens": 187655913.0, + "step": 4917 + }, + { + "epoch": 0.6256201501081288, + "ewc_loss": 0.031065478920936584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5532739780610427e-05, + "grad_norm": 23.97016143798828, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8484315276145935, + "num_tokens": 187692433.0, + "step": 4918 + }, + { + "epoch": 0.6257473603867193, + "ewc_loss": 0.031132815405726433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5566407455480658e-05, + "grad_norm": 23.938426971435547, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8684881925582886, + "num_tokens": 187724311.0, + "step": 4919 + }, + { + "epoch": 0.6258745706653097, + "ewc_loss": 0.03102649562060833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.551324749016203e-05, + "grad_norm": 23.930282592773438, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8658921718597412, + "num_tokens": 187766614.0, + "step": 4920 + }, + { + "epoch": 0.6260017809439002, + "ewc_loss": 0.031090447679162025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5545223504886962e-05, + "grad_norm": 23.949857711791992, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8678699731826782, + "num_tokens": 187804847.0, + "step": 4921 + }, + { + "epoch": 0.6261289912224908, + "ewc_loss": 0.03102751635015011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5513758626184426e-05, + "grad_norm": 23.960872650146484, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8577139377593994, + "num_tokens": 187842791.0, + "step": 4922 + }, + { + "epoch": 0.6262562015010813, + "ewc_loss": 0.03106156922876835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5530784367001615e-05, + "grad_norm": 23.913328170776367, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8465935587882996, + "num_tokens": 187878304.0, + "step": 4923 + }, + { + "epoch": 0.6263834117796718, + "ewc_loss": 
0.031099792569875717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5549896488664672e-05, + "grad_norm": 23.927234649658203, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8590186834335327, + "num_tokens": 187915022.0, + "step": 4924 + }, + { + "epoch": 0.6265106220582624, + "ewc_loss": 0.03111242689192295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5556213838863187e-05, + "grad_norm": 23.963958740234375, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8576194643974304, + "num_tokens": 187950533.0, + "step": 4925 + }, + { + "epoch": 0.6266378323368528, + "ewc_loss": 0.03111676499247551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5558382074232213e-05, + "grad_norm": 23.915660858154297, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8529055118560791, + "num_tokens": 187989893.0, + "step": 4926 + }, + { + "epoch": 0.6267650426154433, + "ewc_loss": 0.031083622947335243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.554181108076591e-05, + "grad_norm": 23.871789932250977, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8499354124069214, + "num_tokens": 188025031.0, + "step": 4927 + }, + { + "epoch": 0.6268922528940338, + "ewc_loss": 0.03114236891269684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.557118412165437e-05, + "grad_norm": 24.04551887512207, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8495720028877258, + "num_tokens": 188060797.0, + "step": 4928 + }, + { + "epoch": 0.6270194631726244, + "ewc_loss": 0.031190920621156693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5595460354234092e-05, + "grad_norm": 23.915023803710938, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8506168723106384, + "num_tokens": 188096271.0, + "step": 4929 + }, + { + "epoch": 0.6271466734512149, + "ewc_loss": 0.0310859102755785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.554295522510074e-05, + "grad_norm": 
23.92232322692871, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8580896854400635, + "num_tokens": 188136697.0, + "step": 4930 + }, + { + "epoch": 0.6272738837298054, + "ewc_loss": 0.031230440363287926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561522003612481e-05, + "grad_norm": 23.93329620361328, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8555424809455872, + "num_tokens": 188174715.0, + "step": 4931 + }, + { + "epoch": 0.6274010940083958, + "ewc_loss": 0.03114219754934311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5571098629152402e-05, + "grad_norm": 23.934537887573242, + "learning_rate": 1e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8356770277023315, + "num_tokens": 188216379.0, + "step": 4932 + }, + { + "epoch": 0.6275283042869864, + "ewc_loss": 0.03112567402422428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5562836779281497e-05, + "grad_norm": 23.834228515625, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.861530601978302, + "num_tokens": 188252183.0, + "step": 4933 + }, + { + "epoch": 0.6276555145655769, + "ewc_loss": 0.0311364084482193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.556820461701136e-05, + "grad_norm": 23.932058334350586, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8585968017578125, + "num_tokens": 188291128.0, + "step": 4934 + }, + { + "epoch": 0.6277827248441674, + "ewc_loss": 0.031197967007756233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.559898373670876e-05, + "grad_norm": 23.88020133972168, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8405709266662598, + "num_tokens": 188334256.0, + "step": 4935 + }, + { + "epoch": 0.627909935122758, + "ewc_loss": 0.03116072528064251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.558036274218466e-05, + "grad_norm": 24.060110092163086, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8490036725997925, + "num_tokens": 
188373207.0, + "step": 4936 + }, + { + "epoch": 0.6280371454013485, + "ewc_loss": 0.031244955956935883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.562247780384496e-05, + "grad_norm": 23.964143753051758, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8526852130889893, + "num_tokens": 188414937.0, + "step": 4937 + }, + { + "epoch": 0.6281643556799389, + "ewc_loss": 0.0310966856777668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5548343071714044e-05, + "grad_norm": 23.940866470336914, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8534560203552246, + "num_tokens": 188455398.0, + "step": 4938 + }, + { + "epoch": 0.6282915659585294, + "ewc_loss": 0.031140051782131195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.557002542540431e-05, + "grad_norm": 23.963340759277344, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8565741181373596, + "num_tokens": 188496561.0, + "step": 4939 + }, + { + "epoch": 0.62841877623712, + "ewc_loss": 0.031163254752755165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5581626939820126e-05, + "grad_norm": 24.02088737487793, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8588714003562927, + "num_tokens": 188531318.0, + "step": 4940 + }, + { + "epoch": 0.6285459865157105, + "ewc_loss": 0.031138163059949875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.556908136990387e-05, + "grad_norm": 23.933021545410156, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.847562313079834, + "num_tokens": 188566807.0, + "step": 4941 + }, + { + "epoch": 0.628673196794301, + "ewc_loss": 0.03114764392375946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.557382165628951e-05, + "grad_norm": 23.95029067993164, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8530034422874451, + "num_tokens": 188608062.0, + "step": 4942 + }, + { + "epoch": 0.6288004070728915, + "ewc_loss": 0.03111044131219387, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 1.555522067064885e-05, + "grad_norm": 23.967817306518555, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8674455881118774, + "num_tokens": 188644953.0, + "step": 4943 + }, + { + "epoch": 0.628927617351482, + "ewc_loss": 0.031171802431344986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.558590156491846e-05, + "grad_norm": 24.047229766845703, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8717015385627747, + "num_tokens": 188687720.0, + "step": 4944 + }, + { + "epoch": 0.6290548276300725, + "ewc_loss": 0.03110996074974537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5554980564047582e-05, + "grad_norm": 23.99228286743164, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8494110107421875, + "num_tokens": 188730077.0, + "step": 4945 + }, + { + "epoch": 0.629182037908663, + "ewc_loss": 0.031102914363145828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5551457181572914e-05, + "grad_norm": 23.906831741333008, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8623757362365723, + "num_tokens": 188768306.0, + "step": 4946 + }, + { + "epoch": 0.6293092481872535, + "ewc_loss": 0.03105398640036583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5526993593084626e-05, + "grad_norm": 23.96145248413086, + "learning_rate": 1e-06, + "loss": 0.5261, + "mean_token_accuracy": 0.8361923694610596, + "num_tokens": 188807480.0, + "step": 4947 + }, + { + "epoch": 0.6294364584658441, + "ewc_loss": 0.031099673360586166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5549836462014355e-05, + "grad_norm": 23.83673095703125, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8462810516357422, + "num_tokens": 188836383.0, + "step": 4948 + }, + { + "epoch": 0.6295636687444346, + "ewc_loss": 0.03116881661117077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5584408174618147e-05, + "grad_norm": 24.0031681060791, + "learning_rate": 1e-06, + "loss": 
0.4468, + "mean_token_accuracy": 0.8597289323806763, + "num_tokens": 188876079.0, + "step": 4949 + }, + { + "epoch": 0.629690879023025, + "ewc_loss": 0.031150247901678085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.557512405270245e-05, + "grad_norm": 23.846561431884766, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8532611727714539, + "num_tokens": 188912936.0, + "step": 4950 + }, + { + "epoch": 0.6298180893016155, + "ewc_loss": 0.031123662367463112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5561830878141336e-05, + "grad_norm": 23.891414642333984, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.854217529296875, + "num_tokens": 188951475.0, + "step": 4951 + }, + { + "epoch": 0.6299452995802061, + "ewc_loss": 0.031187398359179497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.559369957249146e-05, + "grad_norm": 23.830509185791016, + "learning_rate": 1e-06, + "loss": 0.513, + "mean_token_accuracy": 0.8422162532806396, + "num_tokens": 188996972.0, + "step": 4952 + }, + { + "epoch": 0.6300725098587966, + "ewc_loss": 0.031163305044174194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5581652405671775e-05, + "grad_norm": 23.952205657958984, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8763812780380249, + "num_tokens": 189030706.0, + "step": 4953 + }, + { + "epoch": 0.6301997201373871, + "ewc_loss": 0.031227486208081245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561374301672913e-05, + "grad_norm": 23.870128631591797, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8785629272460938, + "num_tokens": 189065638.0, + "step": 4954 + }, + { + "epoch": 0.6303269304159776, + "ewc_loss": 0.031226973980665207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561348653922323e-05, + "grad_norm": 23.983722686767578, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.869457483291626, + "num_tokens": 189101747.0, + "step": 4955 + }, + { + "epoch": 
0.6304541406945681, + "ewc_loss": 0.0312255397439003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5612769857398234e-05, + "grad_norm": 23.919307708740234, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8532546758651733, + "num_tokens": 189142965.0, + "step": 4956 + }, + { + "epoch": 0.6305813509731586, + "ewc_loss": 0.031223153695464134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5611576600349508e-05, + "grad_norm": 23.998519897460938, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8614245057106018, + "num_tokens": 189177822.0, + "step": 4957 + }, + { + "epoch": 0.6307085612517491, + "ewc_loss": 0.031178470700979233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5589235772495158e-05, + "grad_norm": 23.843904495239258, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8506603240966797, + "num_tokens": 189215710.0, + "step": 4958 + }, + { + "epoch": 0.6308357715303397, + "ewc_loss": 0.03117598406970501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5587991583743133e-05, + "grad_norm": 23.92668342590332, + "learning_rate": 1e-06, + "loss": 0.5514, + "mean_token_accuracy": 0.829802930355072, + "num_tokens": 189250788.0, + "step": 4959 + }, + { + "epoch": 0.6309629818089302, + "ewc_loss": 0.03126179799437523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5630899724783376e-05, + "grad_norm": 23.965576171875, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8539690971374512, + "num_tokens": 189278451.0, + "step": 4960 + }, + { + "epoch": 0.6310901920875207, + "ewc_loss": 0.031229985877871513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5614992662449367e-05, + "grad_norm": 23.832744598388672, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8603297472000122, + "num_tokens": 189316857.0, + "step": 4961 + }, + { + "epoch": 0.6312174023661112, + "ewc_loss": 0.03126721829175949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.563361001899466e-05, + "grad_norm": 23.8718204498291, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8675674796104431, + "num_tokens": 189356600.0, + "step": 4962 + }, + { + "epoch": 0.6313446126447017, + "ewc_loss": 0.03129599243402481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5647996406187303e-05, + "grad_norm": 23.853635787963867, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8510671854019165, + "num_tokens": 189393134.0, + "step": 4963 + }, + { + "epoch": 0.6314718229232922, + "ewc_loss": 0.031287387013435364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5643692677258514e-05, + "grad_norm": 23.87059783935547, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8408204317092896, + "num_tokens": 189433431.0, + "step": 4964 + }, + { + "epoch": 0.6315990332018827, + "ewc_loss": 0.0313030444085598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5651521607651375e-05, + "grad_norm": 23.861513137817383, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8428701162338257, + "num_tokens": 189473738.0, + "step": 4965 + }, + { + "epoch": 0.6317262434804732, + "ewc_loss": 0.03131738305091858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5658692063880153e-05, + "grad_norm": 23.969087600708008, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8710131645202637, + "num_tokens": 189509930.0, + "step": 4966 + }, + { + "epoch": 0.6318534537590638, + "ewc_loss": 0.031341612339019775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.567080653330777e-05, + "grad_norm": 23.884445190429688, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8612263798713684, + "num_tokens": 189547801.0, + "step": 4967 + }, + { + "epoch": 0.6319806640376543, + "ewc_loss": 0.03128695487976074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5643478036508895e-05, + "grad_norm": 23.975767135620117, + "learning_rate": 1e-06, + "loss": 0.419, + 
"mean_token_accuracy": 0.8698028922080994, + "num_tokens": 189582226.0, + "step": 4968 + }, + { + "epoch": 0.6321078743162447, + "ewc_loss": 0.03130268678069115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5651343346689828e-05, + "grad_norm": 23.844982147216797, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8558905124664307, + "num_tokens": 189622915.0, + "step": 4969 + }, + { + "epoch": 0.6322350845948352, + "ewc_loss": 0.03128952905535698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5644764062017202e-05, + "grad_norm": 23.99120330810547, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8650285005569458, + "num_tokens": 189658182.0, + "step": 4970 + }, + { + "epoch": 0.6323622948734258, + "ewc_loss": 0.03133616968989372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5668085325160064e-05, + "grad_norm": 23.855609893798828, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8583946228027344, + "num_tokens": 189696627.0, + "step": 4971 + }, + { + "epoch": 0.6324895051520163, + "ewc_loss": 0.03121541440486908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5607707609888166e-05, + "grad_norm": 23.93939781188965, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8551846146583557, + "num_tokens": 189729826.0, + "step": 4972 + }, + { + "epoch": 0.6326167154306068, + "ewc_loss": 0.03136462718248367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.56823134602746e-05, + "grad_norm": 23.916606903076172, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.863118052482605, + "num_tokens": 189771504.0, + "step": 4973 + }, + { + "epoch": 0.6327439257091974, + "ewc_loss": 0.03128228709101677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5641144273104146e-05, + "grad_norm": 23.9460391998291, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8732613325119019, + "num_tokens": 189809220.0, + "step": 4974 + }, + { + "epoch": 
0.6328711359877878, + "ewc_loss": 0.03126422315835953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5632111171726137e-05, + "grad_norm": 23.815689086914062, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8648965358734131, + "num_tokens": 189849586.0, + "step": 4975 + }, + { + "epoch": 0.6329983462663783, + "ewc_loss": 0.03128373995423317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.564187004987616e-05, + "grad_norm": 23.97612762451172, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8592568039894104, + "num_tokens": 189885054.0, + "step": 4976 + }, + { + "epoch": 0.6331255565449688, + "ewc_loss": 0.031302064657211304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.565103229950182e-05, + "grad_norm": 23.924455642700195, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8503029942512512, + "num_tokens": 189918286.0, + "step": 4977 + }, + { + "epoch": 0.6332527668235594, + "ewc_loss": 0.031269390136003494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5634695955668576e-05, + "grad_norm": 23.968402862548828, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8382885456085205, + "num_tokens": 189956845.0, + "step": 4978 + }, + { + "epoch": 0.6333799771021499, + "ewc_loss": 0.031259916722774506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.562995748827234e-05, + "grad_norm": 23.875205993652344, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8739327192306519, + "num_tokens": 189994827.0, + "step": 4979 + }, + { + "epoch": 0.6335071873807404, + "ewc_loss": 0.03131091222167015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5655456081731245e-05, + "grad_norm": 24.007261276245117, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8519213199615479, + "num_tokens": 190036066.0, + "step": 4980 + }, + { + "epoch": 0.6336343976593308, + "ewc_loss": 0.03131336718797684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.565668389957864e-05, + "grad_norm": 24.005022048950195, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8685765266418457, + "num_tokens": 190073326.0, + "step": 4981 + }, + { + "epoch": 0.6337616079379214, + "ewc_loss": 0.031236056238412857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5618028555763885e-05, + "grad_norm": 23.99028968811035, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8552848100662231, + "num_tokens": 190109125.0, + "step": 4982 + }, + { + "epoch": 0.6338888182165119, + "ewc_loss": 0.031271353363990784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5635676390957087e-05, + "grad_norm": 23.990821838378906, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8634695410728455, + "num_tokens": 190148520.0, + "step": 4983 + }, + { + "epoch": 0.6340160284951024, + "ewc_loss": 0.03122405894100666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561202952871099e-05, + "grad_norm": 23.97517967224121, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8602319955825806, + "num_tokens": 190184272.0, + "step": 4984 + }, + { + "epoch": 0.6341432387736929, + "ewc_loss": 0.031255874782800674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5627936591045e-05, + "grad_norm": 23.999921798706055, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8561400175094604, + "num_tokens": 190221483.0, + "step": 4985 + }, + { + "epoch": 0.6342704490522835, + "ewc_loss": 0.031235937029123306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561796852911357e-05, + "grad_norm": 23.900487899780273, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8691843748092651, + "num_tokens": 190266430.0, + "step": 4986 + }, + { + "epoch": 0.6343976593308739, + "ewc_loss": 0.031260982155799866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5630490452167578e-05, + "grad_norm": 23.987525939941406, + "learning_rate": 1e-06, + "loss": 0.4276, + 
"mean_token_accuracy": 0.8690545558929443, + "num_tokens": 190308575.0, + "step": 4987 + }, + { + "epoch": 0.6345248696094644, + "ewc_loss": 0.03125371038913727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5626856111339293e-05, + "grad_norm": 24.018646240234375, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.867007851600647, + "num_tokens": 190344584.0, + "step": 4988 + }, + { + "epoch": 0.6346520798880549, + "ewc_loss": 0.031242547556757927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.562127363285981e-05, + "grad_norm": 24.02496337890625, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8637088537216187, + "num_tokens": 190376497.0, + "step": 4989 + }, + { + "epoch": 0.6347792901666455, + "ewc_loss": 0.031277261674404144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5638630429748446e-05, + "grad_norm": 24.088695526123047, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8461779356002808, + "num_tokens": 190416879.0, + "step": 4990 + }, + { + "epoch": 0.634906500445236, + "ewc_loss": 0.03122010827064514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561005410621874e-05, + "grad_norm": 23.876840591430664, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8776200413703918, + "num_tokens": 190454684.0, + "step": 4991 + }, + { + "epoch": 0.6350337107238265, + "ewc_loss": 0.031187810003757477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.559390511829406e-05, + "grad_norm": 24.013090133666992, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8511248826980591, + "num_tokens": 190490524.0, + "step": 4992 + }, + { + "epoch": 0.635160921002417, + "ewc_loss": 0.031246814876794815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.562340730743017e-05, + "grad_norm": 23.924341201782227, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.864147424697876, + "num_tokens": 190526558.0, + "step": 4993 + }, + { + "epoch": 
0.6352881312810075, + "ewc_loss": 0.03117987886071205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.558993972139433e-05, + "grad_norm": 24.018264770507812, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8577101230621338, + "num_tokens": 190571206.0, + "step": 4994 + }, + { + "epoch": 0.635415341559598, + "ewc_loss": 0.03121940977871418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.560970486025326e-05, + "grad_norm": 23.927047729492188, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8457852005958557, + "num_tokens": 190611480.0, + "step": 4995 + }, + { + "epoch": 0.6355425518381885, + "ewc_loss": 0.031216241419315338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.560812052048277e-05, + "grad_norm": 24.007638931274414, + "learning_rate": 1e-06, + "loss": 0.5368, + "mean_token_accuracy": 0.8294505476951599, + "num_tokens": 190647699.0, + "step": 4996 + }, + { + "epoch": 0.6356697621167791, + "ewc_loss": 0.03126918524503708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5634592273272574e-05, + "grad_norm": 24.031957626342773, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8578073382377625, + "num_tokens": 190684863.0, + "step": 4997 + }, + { + "epoch": 0.6357969723953696, + "ewc_loss": 0.03113345429301262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.556672759761568e-05, + "grad_norm": 23.899198532104492, + "learning_rate": 1e-06, + "loss": 0.5029, + "mean_token_accuracy": 0.8429096937179565, + "num_tokens": 190725006.0, + "step": 4998 + }, + { + "epoch": 0.63592418267396, + "ewc_loss": 0.0312295313924551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5614765288773924e-05, + "grad_norm": 23.963802337646484, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8553293943405151, + "num_tokens": 190757431.0, + "step": 4999 + }, + { + "epoch": 0.6360513929525505, + "ewc_loss": 0.031178634613752365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5589317627018318e-05, + "grad_norm": 23.96087074279785, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8620824813842773, + "num_tokens": 190793118.0, + "step": 5000 + }, + { + "epoch": 0.6361786032311411, + "ewc_loss": 0.03122953698039055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.561476892675273e-05, + "grad_norm": 23.97376251220703, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8668568134307861, + "num_tokens": 190837105.0, + "step": 5001 + }, + { + "epoch": 0.6363058135097316, + "ewc_loss": 0.03122911974787712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5614559742971323e-05, + "grad_norm": 23.90501594543457, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8518484830856323, + "num_tokens": 190875372.0, + "step": 5002 + }, + { + "epoch": 0.6364330237883221, + "ewc_loss": 0.03129452094435692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5647259715478867e-05, + "grad_norm": 24.053152084350586, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8627498149871826, + "num_tokens": 190921307.0, + "step": 5003 + }, + { + "epoch": 0.6365602340669126, + "ewc_loss": 0.03125962242484093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5629811969120055e-05, + "grad_norm": 23.92913818359375, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8576589226722717, + "num_tokens": 190967919.0, + "step": 5004 + }, + { + "epoch": 0.6366874443455031, + "ewc_loss": 0.031232815235853195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5616407836205326e-05, + "grad_norm": 23.92098045349121, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8559994101524353, + "num_tokens": 191005668.0, + "step": 5005 + }, + { + "epoch": 0.6368146546240936, + "ewc_loss": 0.03129732608795166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5648663975298405e-05, + "grad_norm": 24.061492919921875, + "learning_rate": 1e-06, + "loss": 0.4803, + 
"mean_token_accuracy": 0.8469898104667664, + "num_tokens": 191044455.0, + "step": 5006 + }, + { + "epoch": 0.6369418649026841, + "ewc_loss": 0.031227296218276024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5613648429280147e-05, + "grad_norm": 23.842283248901367, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8629313707351685, + "num_tokens": 191086369.0, + "step": 5007 + }, + { + "epoch": 0.6370690751812746, + "ewc_loss": 0.03124338388442993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5621692000422627e-05, + "grad_norm": 24.003053665161133, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8536500930786133, + "num_tokens": 191127070.0, + "step": 5008 + }, + { + "epoch": 0.6371962854598652, + "ewc_loss": 0.03125043958425522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5625219020876102e-05, + "grad_norm": 23.945478439331055, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8719225525856018, + "num_tokens": 191163623.0, + "step": 5009 + }, + { + "epoch": 0.6373234957384557, + "ewc_loss": 0.031248124316334724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5624062143615447e-05, + "grad_norm": 24.02222442626953, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8530675172805786, + "num_tokens": 191196790.0, + "step": 5010 + }, + { + "epoch": 0.6374507060170462, + "ewc_loss": 0.031238272786140442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5619136320310645e-05, + "grad_norm": 23.986833572387695, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8539279699325562, + "num_tokens": 191230265.0, + "step": 5011 + }, + { + "epoch": 0.6375779162956366, + "ewc_loss": 0.031247850507497787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.562392571941018e-05, + "grad_norm": 23.95592498779297, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8492085933685303, + "num_tokens": 191266781.0, + "step": 5012 + }, + { + "epoch": 
0.6377051265742272, + "ewc_loss": 0.03124246746301651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5621233615092933e-05, + "grad_norm": 24.037845611572266, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.847021758556366, + "num_tokens": 191305869.0, + "step": 5013 + }, + { + "epoch": 0.6378323368528177, + "ewc_loss": 0.03129372373223305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.564686135679949e-05, + "grad_norm": 23.955904006958008, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8455854654312134, + "num_tokens": 191343498.0, + "step": 5014 + }, + { + "epoch": 0.6379595471314082, + "ewc_loss": 0.03128346800804138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5641733625670895e-05, + "grad_norm": 24.123266220092773, + "learning_rate": 1e-06, + "loss": 0.51, + "mean_token_accuracy": 0.8421561121940613, + "num_tokens": 191383260.0, + "step": 5015 + }, + { + "epoch": 0.6380867574099988, + "ewc_loss": 0.0312860906124115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.564304511703085e-05, + "grad_norm": 23.9357852935791, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8486015796661377, + "num_tokens": 191417350.0, + "step": 5016 + }, + { + "epoch": 0.6382139676885893, + "ewc_loss": 0.03124798648059368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5623993022018112e-05, + "grad_norm": 24.058757781982422, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8584722280502319, + "num_tokens": 191454336.0, + "step": 5017 + }, + { + "epoch": 0.6383411779671797, + "ewc_loss": 0.031313274055719376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5656636605854146e-05, + "grad_norm": 24.01294708251953, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8584732413291931, + "num_tokens": 191489977.0, + "step": 5018 + }, + { + "epoch": 0.6384683882457702, + "ewc_loss": 0.031243307515978813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5621653801645152e-05, + "grad_norm": 24.09733772277832, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8468022346496582, + "num_tokens": 191524622.0, + "step": 5019 + }, + { + "epoch": 0.6385955985243608, + "ewc_loss": 0.03127403184771538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5637015167158097e-05, + "grad_norm": 23.992643356323242, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8463548421859741, + "num_tokens": 191565389.0, + "step": 5020 + }, + { + "epoch": 0.6387228088029513, + "ewc_loss": 0.031211860477924347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5605930457240902e-05, + "grad_norm": 24.015079498291016, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8761987090110779, + "num_tokens": 191604640.0, + "step": 5021 + }, + { + "epoch": 0.6388500190815418, + "ewc_loss": 0.031311530619859695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5655765309929848e-05, + "grad_norm": 23.96043586730957, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8569563627243042, + "num_tokens": 191651029.0, + "step": 5022 + }, + { + "epoch": 0.6389772293601323, + "ewc_loss": 0.031267814338207245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5633906514267437e-05, + "grad_norm": 24.045997619628906, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8582901358604431, + "num_tokens": 191690524.0, + "step": 5023 + }, + { + "epoch": 0.6391044396387228, + "ewc_loss": 0.031331051141023636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5665526007069275e-05, + "grad_norm": 23.90455436706543, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8522830605506897, + "num_tokens": 191733992.0, + "step": 5024 + }, + { + "epoch": 0.6392316499173133, + "ewc_loss": 0.031239628791809082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5619814803358167e-05, + "grad_norm": 24.041576385498047, + "learning_rate": 1e-06, + "loss": 0.4184, + 
"mean_token_accuracy": 0.8672682046890259, + "num_tokens": 191769717.0, + "step": 5025 + }, + { + "epoch": 0.6393588601959038, + "ewc_loss": 0.031372178345918655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.568608968227636e-05, + "grad_norm": 23.995941162109375, + "learning_rate": 1e-06, + "loss": 0.505, + "mean_token_accuracy": 0.8423081636428833, + "num_tokens": 191813334.0, + "step": 5026 + }, + { + "epoch": 0.6394860704744944, + "ewc_loss": 0.03133179992437363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5665900718886405e-05, + "grad_norm": 23.98094367980957, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8593312501907349, + "num_tokens": 191848295.0, + "step": 5027 + }, + { + "epoch": 0.6396132807530849, + "ewc_loss": 0.03136911615729332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5684558093198575e-05, + "grad_norm": 24.10426902770996, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8499690294265747, + "num_tokens": 191885329.0, + "step": 5028 + }, + { + "epoch": 0.6397404910316754, + "ewc_loss": 0.03130531311035156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5652656657039188e-05, + "grad_norm": 23.91408348083496, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8702547550201416, + "num_tokens": 191918070.0, + "step": 5029 + }, + { + "epoch": 0.6398677013102658, + "ewc_loss": 0.03134666383266449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.567333129059989e-05, + "grad_norm": 24.017961502075195, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8617247343063354, + "num_tokens": 191966084.0, + "step": 5030 + }, + { + "epoch": 0.6399949115888564, + "ewc_loss": 0.031337492167949677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.566874561831355e-05, + "grad_norm": 23.975658416748047, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8564010262489319, + "num_tokens": 192004907.0, + "step": 5031 + }, + { + "epoch": 
0.6401221218674469, + "ewc_loss": 0.031314749270677567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5657375115551986e-05, + "grad_norm": 23.963855743408203, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8397293090820312, + "num_tokens": 192042942.0, + "step": 5032 + }, + { + "epoch": 0.6402493321460374, + "ewc_loss": 0.031347669661045074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5673835150664672e-05, + "grad_norm": 24.028606414794922, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8572680950164795, + "num_tokens": 192081133.0, + "step": 5033 + }, + { + "epoch": 0.6403765424246279, + "ewc_loss": 0.03133280575275421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5666402759961784e-05, + "grad_norm": 23.999109268188477, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.85782790184021, + "num_tokens": 192120171.0, + "step": 5034 + }, + { + "epoch": 0.6405037527032185, + "ewc_loss": 0.03128661960363388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5643308870494366e-05, + "grad_norm": 23.934978485107422, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8581559062004089, + "num_tokens": 192159583.0, + "step": 5035 + }, + { + "epoch": 0.6406309629818089, + "ewc_loss": 0.03133022040128708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5665109458495863e-05, + "grad_norm": 23.992496490478516, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8547649383544922, + "num_tokens": 192199513.0, + "step": 5036 + }, + { + "epoch": 0.6407581732603994, + "ewc_loss": 0.03133787587285042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5668938431190327e-05, + "grad_norm": 23.866558074951172, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8599933385848999, + "num_tokens": 192228352.0, + "step": 5037 + }, + { + "epoch": 0.6408853835389899, + "ewc_loss": 0.03132397308945656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5661986253689975e-05, + "grad_norm": 23.9942626953125, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8523609638214111, + "num_tokens": 192264480.0, + "step": 5038 + }, + { + "epoch": 0.6410125938175805, + "ewc_loss": 0.0314425528049469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5721276213298552e-05, + "grad_norm": 24.012510299682617, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.855718195438385, + "num_tokens": 192303307.0, + "step": 5039 + }, + { + "epoch": 0.641139804096171, + "ewc_loss": 0.03134169429540634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5670846551074646e-05, + "grad_norm": 24.011241912841797, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8589766025543213, + "num_tokens": 192342064.0, + "step": 5040 + }, + { + "epoch": 0.6412670143747615, + "ewc_loss": 0.03137432411313057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5687162886024453e-05, + "grad_norm": 23.964736938476562, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8648604154586792, + "num_tokens": 192379297.0, + "step": 5041 + }, + { + "epoch": 0.6413942246533519, + "ewc_loss": 0.03132941573858261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.566470746183768e-05, + "grad_norm": 24.02816390991211, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8608652353286743, + "num_tokens": 192416087.0, + "step": 5042 + }, + { + "epoch": 0.6415214349319425, + "ewc_loss": 0.03143007308244705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.571503707964439e-05, + "grad_norm": 24.008930206298828, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8627464771270752, + "num_tokens": 192454421.0, + "step": 5043 + }, + { + "epoch": 0.641648645210533, + "ewc_loss": 0.03138645738363266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5693229215685278e-05, + "grad_norm": 23.93657875061035, + "learning_rate": 1e-06, + "loss": 0.4612, + 
"mean_token_accuracy": 0.8536920547485352, + "num_tokens": 192493860.0, + "step": 5044 + }, + { + "epoch": 0.6417758554891235, + "ewc_loss": 0.03144235163927078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5721176168881357e-05, + "grad_norm": 24.078073501586914, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8481253385543823, + "num_tokens": 192533672.0, + "step": 5045 + }, + { + "epoch": 0.641903065767714, + "ewc_loss": 0.03141838312149048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5709190847701393e-05, + "grad_norm": 23.95387077331543, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8456570506095886, + "num_tokens": 192569569.0, + "step": 5046 + }, + { + "epoch": 0.6420302760463046, + "ewc_loss": 0.03145868703722954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5729343431303278e-05, + "grad_norm": 24.071449279785156, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8629704713821411, + "num_tokens": 192610716.0, + "step": 5047 + }, + { + "epoch": 0.642157486324895, + "ewc_loss": 0.03141593560576439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.570796848682221e-05, + "grad_norm": 23.9335994720459, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8514363765716553, + "num_tokens": 192648040.0, + "step": 5048 + }, + { + "epoch": 0.6422846966034855, + "ewc_loss": 0.03145955502986908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5729778169770725e-05, + "grad_norm": 24.077316284179688, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8555974364280701, + "num_tokens": 192690063.0, + "step": 5049 + }, + { + "epoch": 0.6424119068820761, + "ewc_loss": 0.03140566870570183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5702833479736e-05, + "grad_norm": 23.942733764648438, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8519853949546814, + "num_tokens": 192732067.0, + "step": 5050 + }, + { + "epoch": 
0.6425391171606666, + "ewc_loss": 0.03135668858885765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5678344425396062e-05, + "grad_norm": 24.050148010253906, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.866970956325531, + "num_tokens": 192768598.0, + "step": 5051 + }, + { + "epoch": 0.6426663274392571, + "ewc_loss": 0.03141195699572563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.570597851241473e-05, + "grad_norm": 23.980449676513672, + "learning_rate": 1e-06, + "loss": 0.5512, + "mean_token_accuracy": 0.8288270235061646, + "num_tokens": 192805420.0, + "step": 5052 + }, + { + "epoch": 0.6427935377178476, + "ewc_loss": 0.0313410647213459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5670531865907833e-05, + "grad_norm": 23.91740608215332, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8522558212280273, + "num_tokens": 192852082.0, + "step": 5053 + }, + { + "epoch": 0.6429207479964381, + "ewc_loss": 0.031438130885362625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5719066141173244e-05, + "grad_norm": 24.00706672668457, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8577059507369995, + "num_tokens": 192892846.0, + "step": 5054 + }, + { + "epoch": 0.6430479582750286, + "ewc_loss": 0.03140418231487274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5702091332059354e-05, + "grad_norm": 23.930049896240234, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8752027153968811, + "num_tokens": 192932091.0, + "step": 5055 + }, + { + "epoch": 0.6431751685536191, + "ewc_loss": 0.03144952282309532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5724761396995746e-05, + "grad_norm": 24.032798767089844, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 192960628.0, + "step": 5056 + }, + { + "epoch": 0.6433023788322096, + "ewc_loss": 0.03145059198141098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5725296179880388e-05, + "grad_norm": 24.048952102661133, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8501793742179871, + "num_tokens": 193002075.0, + "step": 5057 + }, + { + "epoch": 0.6434295891108002, + "ewc_loss": 0.03137965500354767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5689827705500647e-05, + "grad_norm": 23.978946685791016, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8465881943702698, + "num_tokens": 193039720.0, + "step": 5058 + }, + { + "epoch": 0.6435567993893907, + "ewc_loss": 0.031378235667943954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5689118299633265e-05, + "grad_norm": 24.024742126464844, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.856316328048706, + "num_tokens": 193072806.0, + "step": 5059 + }, + { + "epoch": 0.6436840096679812, + "ewc_loss": 0.0313873291015625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5693663954152726e-05, + "grad_norm": 24.043546676635742, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8653690218925476, + "num_tokens": 193108478.0, + "step": 5060 + }, + { + "epoch": 0.6438112199465716, + "ewc_loss": 0.03141620755195618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5708103092038073e-05, + "grad_norm": 24.039289474487305, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8597331047058105, + "num_tokens": 193150663.0, + "step": 5061 + }, + { + "epoch": 0.6439384302251622, + "ewc_loss": 0.03134649619460106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5673247617087327e-05, + "grad_norm": 23.98574447631836, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8600764274597168, + "num_tokens": 193191499.0, + "step": 5062 + }, + { + "epoch": 0.6440656405037527, + "ewc_loss": 0.03137994557619095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.568997322465293e-05, + "grad_norm": 23.984291076660156, + "learning_rate": 1e-06, + "loss": 0.4991, + 
"mean_token_accuracy": 0.8480278849601746, + "num_tokens": 193236103.0, + "step": 5063 + }, + { + "epoch": 0.6441928507823432, + "ewc_loss": 0.031370434910058975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.568521656736266e-05, + "grad_norm": 24.120101928710938, + "learning_rate": 1e-06, + "loss": 0.5281, + "mean_token_accuracy": 0.8358449935913086, + "num_tokens": 193274095.0, + "step": 5064 + }, + { + "epoch": 0.6443200610609338, + "ewc_loss": 0.031460970640182495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5730485756648704e-05, + "grad_norm": 23.988489151000977, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8626030683517456, + "num_tokens": 193316988.0, + "step": 5065 + }, + { + "epoch": 0.6444472713395243, + "ewc_loss": 0.0313652828335762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5682640878367238e-05, + "grad_norm": 24.04191017150879, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8550583124160767, + "num_tokens": 193357430.0, + "step": 5066 + }, + { + "epoch": 0.6445744816181147, + "ewc_loss": 0.03141232207417488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5706160411355086e-05, + "grad_norm": 23.996004104614258, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8647139668464661, + "num_tokens": 193390392.0, + "step": 5067 + }, + { + "epoch": 0.6447016918967052, + "ewc_loss": 0.03136489540338516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5682448065490462e-05, + "grad_norm": 24.0600643157959, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8537560701370239, + "num_tokens": 193430961.0, + "step": 5068 + }, + { + "epoch": 0.6448289021752958, + "ewc_loss": 0.03143218159675598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5716090274509043e-05, + "grad_norm": 24.089231491088867, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8549978733062744, + "num_tokens": 193473767.0, + "step": 5069 + }, + { + "epoch": 
0.6449561124538863, + "ewc_loss": 0.03136463090777397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5682315279264003e-05, + "grad_norm": 23.985267639160156, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8456113338470459, + "num_tokens": 193511375.0, + "step": 5070 + }, + { + "epoch": 0.6450833227324768, + "ewc_loss": 0.031405601650476456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5702800737926736e-05, + "grad_norm": 23.967737197875977, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8673773407936096, + "num_tokens": 193551296.0, + "step": 5071 + }, + { + "epoch": 0.6452105330110673, + "ewc_loss": 0.031397927552461624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5698964489274658e-05, + "grad_norm": 23.99934196472168, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8613196015357971, + "num_tokens": 193596484.0, + "step": 5072 + }, + { + "epoch": 0.6453377432896578, + "ewc_loss": 0.03143458440899849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5717292626504786e-05, + "grad_norm": 23.992570877075195, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8624683618545532, + "num_tokens": 193637513.0, + "step": 5073 + }, + { + "epoch": 0.6454649535682483, + "ewc_loss": 0.03141658008098602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5708290447946638e-05, + "grad_norm": 24.14814567565918, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8505957126617432, + "num_tokens": 193672808.0, + "step": 5074 + }, + { + "epoch": 0.6455921638468388, + "ewc_loss": 0.03138328343629837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5691641237935983e-05, + "grad_norm": 24.02834129333496, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8719334602355957, + "num_tokens": 193713018.0, + "step": 5075 + }, + { + "epoch": 0.6457193741254293, + "ewc_loss": 0.03137996047735214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5689980500610545e-05, + "grad_norm": 24.0878849029541, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8593075275421143, + "num_tokens": 193757649.0, + "step": 5076 + }, + { + "epoch": 0.6458465844040199, + "ewc_loss": 0.031357426196336746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5678713680244982e-05, + "grad_norm": 24.063161849975586, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8538510799407959, + "num_tokens": 193792390.0, + "step": 5077 + }, + { + "epoch": 0.6459737946826104, + "ewc_loss": 0.0312696248292923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5634812370990403e-05, + "grad_norm": 23.910354614257812, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8645310997962952, + "num_tokens": 193827721.0, + "step": 5078 + }, + { + "epoch": 0.6461010049612008, + "ewc_loss": 0.03137654438614845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5688272469560616e-05, + "grad_norm": 24.09893798828125, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8478907942771912, + "num_tokens": 193867217.0, + "step": 5079 + }, + { + "epoch": 0.6462282152397913, + "ewc_loss": 0.03143230080604553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.571615030115936e-05, + "grad_norm": 24.01060676574707, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8638051152229309, + "num_tokens": 193900128.0, + "step": 5080 + }, + { + "epoch": 0.6463554255183819, + "ewc_loss": 0.03132417052984238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5662084479117766e-05, + "grad_norm": 24.01409912109375, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8682124614715576, + "num_tokens": 193935744.0, + "step": 5081 + }, + { + "epoch": 0.6464826357969724, + "ewc_loss": 0.031377170234918594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5688585335738026e-05, + "grad_norm": 24.012989044189453, + "learning_rate": 1e-06, + "loss": 0.4661, + 
"mean_token_accuracy": 0.853092610836029, + "num_tokens": 193977344.0, + "step": 5082 + }, + { + "epoch": 0.6466098460755629, + "ewc_loss": 0.0313970223069191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5698511560913175e-05, + "grad_norm": 24.057321548461914, + "learning_rate": 1e-06, + "loss": 0.5427, + "mean_token_accuracy": 0.8296394944190979, + "num_tokens": 194014295.0, + "step": 5083 + }, + { + "epoch": 0.6467370563541535, + "ewc_loss": 0.03134515881538391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5672580047976226e-05, + "grad_norm": 23.977834701538086, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8413709402084351, + "num_tokens": 194055245.0, + "step": 5084 + }, + { + "epoch": 0.6468642666327439, + "ewc_loss": 0.03147631138563156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5738156434963457e-05, + "grad_norm": 24.051942825317383, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8493444919586182, + "num_tokens": 194089313.0, + "step": 5085 + }, + { + "epoch": 0.6469914769113344, + "ewc_loss": 0.03143208846449852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5716044799773954e-05, + "grad_norm": 23.993640899658203, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8633397221565247, + "num_tokens": 194128293.0, + "step": 5086 + }, + { + "epoch": 0.6471186871899249, + "ewc_loss": 0.031388625502586365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.569431333336979e-05, + "grad_norm": 23.943103790283203, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.849888026714325, + "num_tokens": 194166632.0, + "step": 5087 + }, + { + "epoch": 0.6472458974685155, + "ewc_loss": 0.031467240303754807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5733619875391014e-05, + "grad_norm": 24.027135848999023, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8426178693771362, + "num_tokens": 194209176.0, + "step": 5088 + }, + { + "epoch": 
0.647373107747106, + "ewc_loss": 0.03143484517931938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5717421774752438e-05, + "grad_norm": 24.046972274780273, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8462890386581421, + "num_tokens": 194245617.0, + "step": 5089 + }, + { + "epoch": 0.6475003180256965, + "ewc_loss": 0.03144311532378197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.57215581566561e-05, + "grad_norm": 23.989349365234375, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8615020513534546, + "num_tokens": 194285983.0, + "step": 5090 + }, + { + "epoch": 0.6476275283042869, + "ewc_loss": 0.031449880450963974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5724939657957293e-05, + "grad_norm": 24.076215744018555, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8529044389724731, + "num_tokens": 194317088.0, + "step": 5091 + }, + { + "epoch": 0.6477547385828775, + "ewc_loss": 0.031457580626010895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.57287904585246e-05, + "grad_norm": 23.986238479614258, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.862085223197937, + "num_tokens": 194354025.0, + "step": 5092 + }, + { + "epoch": 0.647881948861468, + "ewc_loss": 0.03150288760662079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.575144415255636e-05, + "grad_norm": 24.10273551940918, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.860098123550415, + "num_tokens": 194388539.0, + "step": 5093 + }, + { + "epoch": 0.6480091591400585, + "ewc_loss": 0.03151234984397888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5756175343994983e-05, + "grad_norm": 24.080078125, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8578576445579529, + "num_tokens": 194424518.0, + "step": 5094 + }, + { + "epoch": 0.648136369418649, + "ewc_loss": 0.0314580537378788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.572902692714706e-05, + 
"grad_norm": 24.09801483154297, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8531371355056763, + "num_tokens": 194463310.0, + "step": 5095 + }, + { + "epoch": 0.6482635796972396, + "ewc_loss": 0.03146367892622948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5731839084764943e-05, + "grad_norm": 23.988086700439453, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8569107055664062, + "num_tokens": 194499911.0, + "step": 5096 + }, + { + "epoch": 0.64839078997583, + "ewc_loss": 0.03150526061654091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.575263013364747e-05, + "grad_norm": 24.099327087402344, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8466002941131592, + "num_tokens": 194536731.0, + "step": 5097 + }, + { + "epoch": 0.6485180002544205, + "ewc_loss": 0.03150034323334694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5750170859973878e-05, + "grad_norm": 23.917125701904297, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.846933901309967, + "num_tokens": 194574014.0, + "step": 5098 + }, + { + "epoch": 0.648645210533011, + "ewc_loss": 0.031456008553504944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5728004655102268e-05, + "grad_norm": 23.939302444458008, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8486714363098145, + "num_tokens": 194611944.0, + "step": 5099 + }, + { + "epoch": 0.6487724208116016, + "ewc_loss": 0.03159680590033531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.57984031829983e-05, + "grad_norm": 24.016281127929688, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8669697046279907, + "num_tokens": 194649734.0, + "step": 5100 + }, + { + "epoch": 0.6488996310901921, + "ewc_loss": 0.03157695010304451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5788475138833746e-05, + "grad_norm": 24.04994773864746, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8647940158843994, + 
"num_tokens": 194688344.0, + "step": 5101 + }, + { + "epoch": 0.6490268413687826, + "ewc_loss": 0.03157120943069458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.578560477355495e-05, + "grad_norm": 24.107566833496094, + "learning_rate": 1e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.843825101852417, + "num_tokens": 194728582.0, + "step": 5102 + }, + { + "epoch": 0.649154051647373, + "ewc_loss": 0.031584929674863815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5792464182595722e-05, + "grad_norm": 24.10969352722168, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8569790720939636, + "num_tokens": 194762753.0, + "step": 5103 + }, + { + "epoch": 0.6492812619259636, + "ewc_loss": 0.03150457143783569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.57522863446502e-05, + "grad_norm": 24.084009170532227, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8694312572479248, + "num_tokens": 194802325.0, + "step": 5104 + }, + { + "epoch": 0.6494084722045541, + "ewc_loss": 0.031582608819007874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.579130366735626e-05, + "grad_norm": 24.145198822021484, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.85575270652771, + "num_tokens": 194846349.0, + "step": 5105 + }, + { + "epoch": 0.6495356824831446, + "ewc_loss": 0.031553592532873154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.577679540787358e-05, + "grad_norm": 24.18882942199707, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8570843935012817, + "num_tokens": 194880334.0, + "step": 5106 + }, + { + "epoch": 0.6496628927617352, + "ewc_loss": 0.031467948108911514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5733974578324705e-05, + "grad_norm": 24.049041748046875, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.847665548324585, + "num_tokens": 194919343.0, + "step": 5107 + }, + { + "epoch": 0.6497901030403257, + "ewc_loss": 0.03149745985865593, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.574873022036627e-05, + "grad_norm": 24.034543991088867, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8582439422607422, + "num_tokens": 194958800.0, + "step": 5108 + }, + { + "epoch": 0.6499173133189162, + "ewc_loss": 0.031559161841869354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5779580280650407e-05, + "grad_norm": 24.30122947692871, + "learning_rate": 1e-06, + "loss": 0.5239, + "mean_token_accuracy": 0.8304349184036255, + "num_tokens": 194997249.0, + "step": 5109 + }, + { + "epoch": 0.6500445235975066, + "ewc_loss": 0.03149621561169624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5748108125990257e-05, + "grad_norm": 24.017614364624023, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8590317964553833, + "num_tokens": 195035879.0, + "step": 5110 + }, + { + "epoch": 0.6501717338760972, + "ewc_loss": 0.03141409158706665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5707046259194613e-05, + "grad_norm": 24.09876823425293, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8518251180648804, + "num_tokens": 195075925.0, + "step": 5111 + }, + { + "epoch": 0.6502989441546877, + "ewc_loss": 0.03150032088160515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5750159946037456e-05, + "grad_norm": 24.091432571411133, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8582136631011963, + "num_tokens": 195110185.0, + "step": 5112 + }, + { + "epoch": 0.6504261544332782, + "ewc_loss": 0.03150185942649841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5750929378555156e-05, + "grad_norm": 24.094249725341797, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8491662740707397, + "num_tokens": 195154666.0, + "step": 5113 + }, + { + "epoch": 0.6505533647118688, + "ewc_loss": 0.03146078437566757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5730392988189124e-05, + "grad_norm": 24.123231887817383, + 
"learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8689523935317993, + "num_tokens": 195188998.0, + "step": 5114 + }, + { + "epoch": 0.6506805749904593, + "ewc_loss": 0.031460873782634735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.573043664393481e-05, + "grad_norm": 24.023025512695312, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8635064363479614, + "num_tokens": 195224124.0, + "step": 5115 + }, + { + "epoch": 0.6508077852690497, + "ewc_loss": 0.03142506629228592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5712532331235707e-05, + "grad_norm": 24.034706115722656, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8576619029045105, + "num_tokens": 195253601.0, + "step": 5116 + }, + { + "epoch": 0.6509349955476402, + "ewc_loss": 0.031538862735033035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5769432138768025e-05, + "grad_norm": 24.196754455566406, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8481696248054504, + "num_tokens": 195289045.0, + "step": 5117 + }, + { + "epoch": 0.6510622058262308, + "ewc_loss": 0.031442150473594666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5721074305474758e-05, + "grad_norm": 23.91173553466797, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8579353094100952, + "num_tokens": 195326983.0, + "step": 5118 + }, + { + "epoch": 0.6511894161048213, + "ewc_loss": 0.031500596553087234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5750298189232126e-05, + "grad_norm": 24.189044952392578, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8685532212257385, + "num_tokens": 195360425.0, + "step": 5119 + }, + { + "epoch": 0.6513166263834118, + "ewc_loss": 0.031599260866642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5799631000845693e-05, + "grad_norm": 24.099103927612305, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8452637195587158, + "num_tokens": 195402648.0, + 
"step": 5120 + }, + { + "epoch": 0.6514438366620023, + "ewc_loss": 0.03151440992951393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.575720489199739e-05, + "grad_norm": 24.126163482666016, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.873985767364502, + "num_tokens": 195440945.0, + "step": 5121 + }, + { + "epoch": 0.6515710469405928, + "ewc_loss": 0.03155241906642914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5776209693285637e-05, + "grad_norm": 24.018857955932617, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8591645956039429, + "num_tokens": 195481872.0, + "step": 5122 + }, + { + "epoch": 0.6516982572191833, + "ewc_loss": 0.03149925172328949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5749625163152814e-05, + "grad_norm": 24.161895751953125, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8626159429550171, + "num_tokens": 195525460.0, + "step": 5123 + }, + { + "epoch": 0.6518254674977738, + "ewc_loss": 0.03155843913555145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5779220120748505e-05, + "grad_norm": 24.066225051879883, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8591810464859009, + "num_tokens": 195563522.0, + "step": 5124 + }, + { + "epoch": 0.6519526777763643, + "ewc_loss": 0.03151152655482292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5755762433400378e-05, + "grad_norm": 24.168916702270508, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8664568662643433, + "num_tokens": 195598114.0, + "step": 5125 + }, + { + "epoch": 0.6520798880549549, + "ewc_loss": 0.03152924403548241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.576462273078505e-05, + "grad_norm": 24.047170639038086, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8526840806007385, + "num_tokens": 195639826.0, + "step": 5126 + }, + { + "epoch": 0.6522070983335454, + "ewc_loss": 0.03153027594089508, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.5765137504786253e-05, + "grad_norm": 24.156648635864258, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8639256954193115, + "num_tokens": 195682047.0, + "step": 5127 + }, + { + "epoch": 0.6523343086121358, + "ewc_loss": 0.0315251462161541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5762572729727253e-05, + "grad_norm": 24.124723434448242, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8611739873886108, + "num_tokens": 195722453.0, + "step": 5128 + }, + { + "epoch": 0.6524615188907263, + "ewc_loss": 0.03148161619901657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5740808521513827e-05, + "grad_norm": 24.17352294921875, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8523459434509277, + "num_tokens": 195759182.0, + "step": 5129 + }, + { + "epoch": 0.6525887291693169, + "ewc_loss": 0.03145209327340126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5726047422504053e-05, + "grad_norm": 24.073139190673828, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8598047494888306, + "num_tokens": 195795641.0, + "step": 5130 + }, + { + "epoch": 0.6527159394479074, + "ewc_loss": 0.03145642578601837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5728212019894272e-05, + "grad_norm": 23.96431541442871, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.865766704082489, + "num_tokens": 195831200.0, + "step": 5131 + }, + { + "epoch": 0.6528431497264979, + "ewc_loss": 0.031535785645246506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5767893273732625e-05, + "grad_norm": 24.09632110595703, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8598431348800659, + "num_tokens": 195875110.0, + "step": 5132 + }, + { + "epoch": 0.6529703600050885, + "ewc_loss": 0.0315527617931366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.577638067828957e-05, + "grad_norm": 24.069700241088867, + "learning_rate": 1e-06, + "loss": 
0.4407, + "mean_token_accuracy": 0.8586746454238892, + "num_tokens": 195916724.0, + "step": 5133 + }, + { + "epoch": 0.6530975702836789, + "ewc_loss": 0.031520381569862366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5760191672598012e-05, + "grad_norm": 24.152162551879883, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8508245348930359, + "num_tokens": 195956219.0, + "step": 5134 + }, + { + "epoch": 0.6532247805622694, + "ewc_loss": 0.03151861950755119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5759309462737292e-05, + "grad_norm": 24.07811164855957, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8603708744049072, + "num_tokens": 195991563.0, + "step": 5135 + }, + { + "epoch": 0.6533519908408599, + "ewc_loss": 0.03146480396389961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5732401152490638e-05, + "grad_norm": 24.047243118286133, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8637862205505371, + "num_tokens": 196031864.0, + "step": 5136 + }, + { + "epoch": 0.6534792011194505, + "ewc_loss": 0.03152329474687576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.576164686412085e-05, + "grad_norm": 24.070192337036133, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8500654697418213, + "num_tokens": 196067208.0, + "step": 5137 + }, + { + "epoch": 0.653606411398041, + "ewc_loss": 0.03157144412398338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5785721188876778e-05, + "grad_norm": 24.079282760620117, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8610712289810181, + "num_tokens": 196103934.0, + "step": 5138 + }, + { + "epoch": 0.6537336216766315, + "ewc_loss": 0.031527988612651825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5763995179440826e-05, + "grad_norm": 24.121435165405273, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8528889417648315, + "num_tokens": 196140886.0, + "step": 5139 + }, + { + "epoch": 
0.6538608319552219, + "ewc_loss": 0.031531013548374176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5765506759635173e-05, + "grad_norm": 24.087778091430664, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8658199310302734, + "num_tokens": 196178940.0, + "step": 5140 + }, + { + "epoch": 0.6539880422338125, + "ewc_loss": 0.031544025987386703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5772013284731656e-05, + "grad_norm": 24.111244201660156, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8599985837936401, + "num_tokens": 196229461.0, + "step": 5141 + }, + { + "epoch": 0.654115252512403, + "ewc_loss": 0.031538546085357666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5769273886689916e-05, + "grad_norm": 24.03925895690918, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8709243535995483, + "num_tokens": 196265793.0, + "step": 5142 + }, + { + "epoch": 0.6542424627909935, + "ewc_loss": 0.03155091404914856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.577545663167257e-05, + "grad_norm": 24.079591751098633, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8632974624633789, + "num_tokens": 196304919.0, + "step": 5143 + }, + { + "epoch": 0.654369673069584, + "ewc_loss": 0.03156642988324165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5783214621478692e-05, + "grad_norm": 24.034303665161133, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8718185424804688, + "num_tokens": 196344761.0, + "step": 5144 + }, + { + "epoch": 0.6544968833481746, + "ewc_loss": 0.03155597671866417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5777988664922304e-05, + "grad_norm": 23.978689193725586, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8719444274902344, + "num_tokens": 196382625.0, + "step": 5145 + }, + { + "epoch": 0.654624093626765, + "ewc_loss": 0.03160148859024048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5800744222360663e-05, + "grad_norm": 24.17656898498535, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8683653473854065, + "num_tokens": 196422279.0, + "step": 5146 + }, + { + "epoch": 0.6547513039053555, + "ewc_loss": 0.03159402310848236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5797011656104587e-05, + "grad_norm": 23.97544288635254, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8528095483779907, + "num_tokens": 196463684.0, + "step": 5147 + }, + { + "epoch": 0.654878514183946, + "ewc_loss": 0.03155697137117386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5778485249029472e-05, + "grad_norm": 24.14207649230957, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8615070581436157, + "num_tokens": 196506319.0, + "step": 5148 + }, + { + "epoch": 0.6550057244625366, + "ewc_loss": 0.031623851507902145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.581192555022426e-05, + "grad_norm": 24.060903549194336, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8521379232406616, + "num_tokens": 196546367.0, + "step": 5149 + }, + { + "epoch": 0.6551329347411271, + "ewc_loss": 0.0315331406891346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5766570868436247e-05, + "grad_norm": 24.111053466796875, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8423500061035156, + "num_tokens": 196576598.0, + "step": 5150 + }, + { + "epoch": 0.6552601450197176, + "ewc_loss": 0.03162267059087753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5811334378668107e-05, + "grad_norm": 24.11739158630371, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8514766693115234, + "num_tokens": 196610633.0, + "step": 5151 + }, + { + "epoch": 0.655387355298308, + "ewc_loss": 0.03155757486820221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.577878720127046e-05, + "grad_norm": 24.172395706176758, + "learning_rate": 1e-06, + "loss": 0.479, + 
"mean_token_accuracy": 0.8502874374389648, + "num_tokens": 196646183.0, + "step": 5152 + }, + { + "epoch": 0.6555145655768986, + "ewc_loss": 0.03159012272953987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5795061699463986e-05, + "grad_norm": 23.995309829711914, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8670458197593689, + "num_tokens": 196689454.0, + "step": 5153 + }, + { + "epoch": 0.6556417758554891, + "ewc_loss": 0.03152807429432869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5764037016197108e-05, + "grad_norm": 24.147762298583984, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8421492576599121, + "num_tokens": 196731820.0, + "step": 5154 + }, + { + "epoch": 0.6557689861340796, + "ewc_loss": 0.031603604555130005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5801802874193527e-05, + "grad_norm": 24.101255416870117, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8479042053222656, + "num_tokens": 196768542.0, + "step": 5155 + }, + { + "epoch": 0.6558961964126702, + "ewc_loss": 0.03153982385993004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.576991235197056e-05, + "grad_norm": 24.199005126953125, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8425201177597046, + "num_tokens": 196810300.0, + "step": 5156 + }, + { + "epoch": 0.6560234066912607, + "ewc_loss": 0.03158749267458916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5793746570125222e-05, + "grad_norm": 24.05195426940918, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8413398861885071, + "num_tokens": 196847271.0, + "step": 5157 + }, + { + "epoch": 0.6561506169698512, + "ewc_loss": 0.031627167016267776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.581358264957089e-05, + "grad_norm": 24.329408645629883, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8720139265060425, + "num_tokens": 196880439.0, + "step": 5158 + }, + { + "epoch": 
0.6562778272484416, + "ewc_loss": 0.03158705681562424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5793528291396797e-05, + "grad_norm": 24.110572814941406, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8505734205245972, + "num_tokens": 196914911.0, + "step": 5159 + }, + { + "epoch": 0.6564050375270322, + "ewc_loss": 0.03150736540555954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5753683328512125e-05, + "grad_norm": 24.10096549987793, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8584780693054199, + "num_tokens": 196949419.0, + "step": 5160 + }, + { + "epoch": 0.6565322478056227, + "ewc_loss": 0.03163390979170799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5816955055925064e-05, + "grad_norm": 24.229801177978516, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8499730825424194, + "num_tokens": 196989401.0, + "step": 5161 + }, + { + "epoch": 0.6566594580842132, + "ewc_loss": 0.03157021477818489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5785108189447783e-05, + "grad_norm": 24.085508346557617, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8491860628128052, + "num_tokens": 197028377.0, + "step": 5162 + }, + { + "epoch": 0.6567866683628037, + "ewc_loss": 0.03158004581928253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5790023098816164e-05, + "grad_norm": 24.197999954223633, + "learning_rate": 1e-06, + "loss": 0.5198, + "mean_token_accuracy": 0.836844801902771, + "num_tokens": 197068334.0, + "step": 5163 + }, + { + "epoch": 0.6569138786413943, + "ewc_loss": 0.031610846519470215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5805422663106583e-05, + "grad_norm": 24.121612548828125, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8595376014709473, + "num_tokens": 197105978.0, + "step": 5164 + }, + { + "epoch": 0.6570410889199847, + "ewc_loss": 0.03153086453676224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5765432181069627e-05, + "grad_norm": 24.064956665039062, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8400466442108154, + "num_tokens": 197148641.0, + "step": 5165 + }, + { + "epoch": 0.6571682991985752, + "ewc_loss": 0.03162175789475441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.581087963131722e-05, + "grad_norm": 24.18938636779785, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8378579616546631, + "num_tokens": 197188820.0, + "step": 5166 + }, + { + "epoch": 0.6572955094771658, + "ewc_loss": 0.03160112351179123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.580056232342031e-05, + "grad_norm": 24.087297439575195, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8450106382369995, + "num_tokens": 197227109.0, + "step": 5167 + }, + { + "epoch": 0.6574227197557563, + "ewc_loss": 0.03160790726542473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5803952919668518e-05, + "grad_norm": 24.163585662841797, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8635501861572266, + "num_tokens": 197264083.0, + "step": 5168 + }, + { + "epoch": 0.6575499300343468, + "ewc_loss": 0.03161248937249184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5806244846316986e-05, + "grad_norm": 24.134794235229492, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.853438138961792, + "num_tokens": 197301349.0, + "step": 5169 + }, + { + "epoch": 0.6576771403129373, + "ewc_loss": 0.03155508637428284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5777543012518436e-05, + "grad_norm": 24.122486114501953, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8466758728027344, + "num_tokens": 197348176.0, + "step": 5170 + }, + { + "epoch": 0.6578043505915278, + "ewc_loss": 0.03164852038025856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5824260117369704e-05, + "grad_norm": 24.173892974853516, + "learning_rate": 1e-06, + "loss": 0.4715, + 
"mean_token_accuracy": 0.8485339879989624, + "num_tokens": 197380490.0, + "step": 5171 + }, + { + "epoch": 0.6579315608701183, + "ewc_loss": 0.0316123403608799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.580617026775144e-05, + "grad_norm": 24.277957916259766, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8443697690963745, + "num_tokens": 197423075.0, + "step": 5172 + }, + { + "epoch": 0.6580587711487088, + "ewc_loss": 0.03158655762672424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.579327908984851e-05, + "grad_norm": 24.1780948638916, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.860925555229187, + "num_tokens": 197461730.0, + "step": 5173 + }, + { + "epoch": 0.6581859814272993, + "ewc_loss": 0.03151695430278778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5758476365590468e-05, + "grad_norm": 24.148941040039062, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8457474708557129, + "num_tokens": 197505801.0, + "step": 5174 + }, + { + "epoch": 0.6583131917058899, + "ewc_loss": 0.031540751457214355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5770376194268465e-05, + "grad_norm": 24.147993087768555, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8746466040611267, + "num_tokens": 197534643.0, + "step": 5175 + }, + { + "epoch": 0.6584404019844804, + "ewc_loss": 0.03157063573598862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.578531737322919e-05, + "grad_norm": 24.101619720458984, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8618327379226685, + "num_tokens": 197576490.0, + "step": 5176 + }, + { + "epoch": 0.6585676122630708, + "ewc_loss": 0.03157274052500725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5786370568093844e-05, + "grad_norm": 24.08645248413086, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8694728016853333, + "num_tokens": 197612745.0, + "step": 5177 + }, + { + "epoch": 
0.6586948225416613, + "ewc_loss": 0.03165848180651665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5829240510356613e-05, + "grad_norm": 24.1427059173584, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8517917394638062, + "num_tokens": 197653756.0, + "step": 5178 + }, + { + "epoch": 0.6588220328202519, + "ewc_loss": 0.03163359314203262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5816796803846955e-05, + "grad_norm": 24.160707473754883, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8813153505325317, + "num_tokens": 197694417.0, + "step": 5179 + }, + { + "epoch": 0.6589492430988424, + "ewc_loss": 0.031587153673172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5793577404110692e-05, + "grad_norm": 24.13433074951172, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8519976139068604, + "num_tokens": 197732738.0, + "step": 5180 + }, + { + "epoch": 0.6590764533774329, + "ewc_loss": 0.031621865928173065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5810932382009923e-05, + "grad_norm": 24.159988403320312, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8474768400192261, + "num_tokens": 197759408.0, + "step": 5181 + }, + { + "epoch": 0.6592036636560235, + "ewc_loss": 0.031678881496191025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5839441402931698e-05, + "grad_norm": 24.20005989074707, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8595055341720581, + "num_tokens": 197794334.0, + "step": 5182 + }, + { + "epoch": 0.6593308739346139, + "ewc_loss": 0.031665753573179245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.58328766701743e-05, + "grad_norm": 24.207378387451172, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.865079939365387, + "num_tokens": 197832497.0, + "step": 5183 + }, + { + "epoch": 0.6594580842132044, + "ewc_loss": 0.03165917098522186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5829586118343286e-05, + "grad_norm": 24.1467342376709, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8480081558227539, + "num_tokens": 197869021.0, + "step": 5184 + }, + { + "epoch": 0.6595852944917949, + "ewc_loss": 0.03164621815085411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.582310869707726e-05, + "grad_norm": 24.19637107849121, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8431549072265625, + "num_tokens": 197906365.0, + "step": 5185 + }, + { + "epoch": 0.6597125047703855, + "ewc_loss": 0.031668439507484436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5834219084354118e-05, + "grad_norm": 24.130220413208008, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8600757718086243, + "num_tokens": 197950725.0, + "step": 5186 + }, + { + "epoch": 0.659839715048976, + "ewc_loss": 0.031647004187107086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5823501598788425e-05, + "grad_norm": 24.070953369140625, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8650572299957275, + "num_tokens": 197991178.0, + "step": 5187 + }, + { + "epoch": 0.6599669253275665, + "ewc_loss": 0.031653013080358505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5826506569283083e-05, + "grad_norm": 24.065841674804688, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8479835391044617, + "num_tokens": 198030176.0, + "step": 5188 + }, + { + "epoch": 0.6600941356061569, + "ewc_loss": 0.03166603296995163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5833016732358374e-05, + "grad_norm": 24.1413516998291, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8517035245895386, + "num_tokens": 198067169.0, + "step": 5189 + }, + { + "epoch": 0.6602213458847475, + "ewc_loss": 0.031648650765419006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.582432560098823e-05, + "grad_norm": 24.04195213317871, + "learning_rate": 1e-06, + "loss": 0.4944, + 
"mean_token_accuracy": 0.8466887474060059, + "num_tokens": 198104994.0, + "step": 5190 + }, + { + "epoch": 0.660348556163338, + "ewc_loss": 0.03173358365893364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5866791727603413e-05, + "grad_norm": 24.15382957458496, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8614717125892639, + "num_tokens": 198142791.0, + "step": 5191 + }, + { + "epoch": 0.6604757664419285, + "ewc_loss": 0.03172387182712555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.586193684488535e-05, + "grad_norm": 24.194625854492188, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.851993203163147, + "num_tokens": 198179300.0, + "step": 5192 + }, + { + "epoch": 0.660602976720519, + "ewc_loss": 0.03174450248479843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5872250514803454e-05, + "grad_norm": 24.067935943603516, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8716393709182739, + "num_tokens": 198220230.0, + "step": 5193 + }, + { + "epoch": 0.6607301869991096, + "ewc_loss": 0.03169840946793556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5849203919060528e-05, + "grad_norm": 24.166189193725586, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8460308909416199, + "num_tokens": 198251067.0, + "step": 5194 + }, + { + "epoch": 0.6608573972777, + "ewc_loss": 0.03172497823834419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5862489817664027e-05, + "grad_norm": 24.099199295043945, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8654128909111023, + "num_tokens": 198282130.0, + "step": 5195 + }, + { + "epoch": 0.6609846075562905, + "ewc_loss": 0.03171775862574577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.585887912369799e-05, + "grad_norm": 24.204483032226562, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8678992390632629, + "num_tokens": 198325609.0, + "step": 5196 + }, + { + "epoch": 0.661111817834881, 
+ "ewc_loss": 0.03176086023449898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.58804305101512e-05, + "grad_norm": 24.10797691345215, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8599701523780823, + "num_tokens": 198365116.0, + "step": 5197 + }, + { + "epoch": 0.6612390281134716, + "ewc_loss": 0.0317254364490509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5862719010328874e-05, + "grad_norm": 24.265779495239258, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8573516607284546, + "num_tokens": 198403467.0, + "step": 5198 + }, + { + "epoch": 0.6613662383920621, + "ewc_loss": 0.03168918937444687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5844594599911943e-05, + "grad_norm": 24.043107986450195, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8635232448577881, + "num_tokens": 198440676.0, + "step": 5199 + }, + { + "epoch": 0.6614934486706526, + "ewc_loss": 0.031682319939136505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.584116034791805e-05, + "grad_norm": 24.331912994384766, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8594021797180176, + "num_tokens": 198479082.0, + "step": 5200 + }, + { + "epoch": 0.661620658949243, + "ewc_loss": 0.03172216936945915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.586108555784449e-05, + "grad_norm": 24.101993560791016, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8533391952514648, + "num_tokens": 198521741.0, + "step": 5201 + }, + { + "epoch": 0.6617478692278336, + "ewc_loss": 0.03160063177347183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.580031675985083e-05, + "grad_norm": 24.163026809692383, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8500734567642212, + "num_tokens": 198562177.0, + "step": 5202 + }, + { + "epoch": 0.6618750795064241, + "ewc_loss": 0.031693726778030396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5846862879698165e-05, + "grad_norm": 
24.127506256103516, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8585607409477234, + "num_tokens": 198600726.0, + "step": 5203 + }, + { + "epoch": 0.6620022897850146, + "ewc_loss": 0.03167879581451416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5839397747186013e-05, + "grad_norm": 24.038585662841797, + "learning_rate": 1e-06, + "loss": 0.5473, + "mean_token_accuracy": 0.8289728164672852, + "num_tokens": 198638156.0, + "step": 5204 + }, + { + "epoch": 0.6621295000636052, + "ewc_loss": 0.03167201206088066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.58360053319484e-05, + "grad_norm": 24.08612632751465, + "learning_rate": 1e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8384262323379517, + "num_tokens": 198680312.0, + "step": 5205 + }, + { + "epoch": 0.6622567103421957, + "ewc_loss": 0.03171567991375923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5857840480748564e-05, + "grad_norm": 24.203109741210938, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8749243021011353, + "num_tokens": 198723886.0, + "step": 5206 + }, + { + "epoch": 0.6623839206207861, + "ewc_loss": 0.03175093233585358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5875466488068923e-05, + "grad_norm": 24.165864944458008, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8573984503746033, + "num_tokens": 198765331.0, + "step": 5207 + }, + { + "epoch": 0.6625111308993766, + "ewc_loss": 0.03165549039840698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5827745301066898e-05, + "grad_norm": 24.044591903686523, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8603156805038452, + "num_tokens": 198802421.0, + "step": 5208 + }, + { + "epoch": 0.6626383411779672, + "ewc_loss": 0.03166218101978302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.583109042258002e-05, + "grad_norm": 24.202821731567383, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8672386407852173, + 
"num_tokens": 198843335.0, + "step": 5209 + }, + { + "epoch": 0.6627655514565577, + "ewc_loss": 0.03165528550744057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.58276434376603e-05, + "grad_norm": 24.024307250976562, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.864045262336731, + "num_tokens": 198880925.0, + "step": 5210 + }, + { + "epoch": 0.6628927617351482, + "ewc_loss": 0.031658027321100235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.582901313668117e-05, + "grad_norm": 24.19244956970215, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8557900190353394, + "num_tokens": 198915448.0, + "step": 5211 + }, + { + "epoch": 0.6630199720137387, + "ewc_loss": 0.03174315392971039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5871577488724142e-05, + "grad_norm": 24.09457015991211, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8673437237739563, + "num_tokens": 198952254.0, + "step": 5212 + }, + { + "epoch": 0.6631471822923293, + "ewc_loss": 0.03163756802678108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5818783140275627e-05, + "grad_norm": 24.030241012573242, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8622391223907471, + "num_tokens": 198990841.0, + "step": 5213 + }, + { + "epoch": 0.6632743925709197, + "ewc_loss": 0.03177341818809509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.588670966157224e-05, + "grad_norm": 24.19599151611328, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.860138475894928, + "num_tokens": 199029383.0, + "step": 5214 + }, + { + "epoch": 0.6634016028495102, + "ewc_loss": 0.03174148127436638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.587074075359851e-05, + "grad_norm": 24.044422149658203, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.856842577457428, + "num_tokens": 199066731.0, + "step": 5215 + }, + { + "epoch": 0.6635288131281007, + "ewc_loss": 0.03170705586671829, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5853527656872757e-05, + "grad_norm": 24.123367309570312, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.869045615196228, + "num_tokens": 199108752.0, + "step": 5216 + }, + { + "epoch": 0.6636560234066913, + "ewc_loss": 0.03170114383101463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5850571799091995e-05, + "grad_norm": 24.08750343322754, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8614569902420044, + "num_tokens": 199146438.0, + "step": 5217 + }, + { + "epoch": 0.6637832336852818, + "ewc_loss": 0.03170726075768471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5853629520279355e-05, + "grad_norm": 24.123403549194336, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8593205213546753, + "num_tokens": 199186367.0, + "step": 5218 + }, + { + "epoch": 0.6639104439638723, + "ewc_loss": 0.031713008880615234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5856503523536958e-05, + "grad_norm": 24.05130386352539, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8706054091453552, + "num_tokens": 199225785.0, + "step": 5219 + }, + { + "epoch": 0.6640376542424627, + "ewc_loss": 0.03175745904445648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5878729755058885e-05, + "grad_norm": 24.09832191467285, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8617849349975586, + "num_tokens": 199262574.0, + "step": 5220 + }, + { + "epoch": 0.6641648645210533, + "ewc_loss": 0.031775761395692825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5887881090748124e-05, + "grad_norm": 24.175817489624023, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8630062937736511, + "num_tokens": 199299093.0, + "step": 5221 + }, + { + "epoch": 0.6642920747996438, + "ewc_loss": 0.03174309432506561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5871546565904282e-05, + "grad_norm": 24.0106143951416, + 
"learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8639322519302368, + "num_tokens": 199337446.0, + "step": 5222 + }, + { + "epoch": 0.6644192850782343, + "ewc_loss": 0.031712740659713745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.58563707373105e-05, + "grad_norm": 24.19144630432129, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8595056533813477, + "num_tokens": 199377321.0, + "step": 5223 + }, + { + "epoch": 0.6645464953568249, + "ewc_loss": 0.0317445769906044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.587228871358093e-05, + "grad_norm": 24.035480499267578, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8687163591384888, + "num_tokens": 199413057.0, + "step": 5224 + }, + { + "epoch": 0.6646737056354154, + "ewc_loss": 0.031731463968753815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5865731256781146e-05, + "grad_norm": 24.19159698486328, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8651487231254578, + "num_tokens": 199452730.0, + "step": 5225 + }, + { + "epoch": 0.6648009159140058, + "ewc_loss": 0.031784482300281525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5892241208348423e-05, + "grad_norm": 24.085901260375977, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8684283494949341, + "num_tokens": 199487860.0, + "step": 5226 + }, + { + "epoch": 0.6649281261925963, + "ewc_loss": 0.03173597902059555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5867988622630946e-05, + "grad_norm": 24.163448333740234, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8649404048919678, + "num_tokens": 199532790.0, + "step": 5227 + }, + { + "epoch": 0.6650553364711869, + "ewc_loss": 0.03171669319272041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.585834615980275e-05, + "grad_norm": 24.06093978881836, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8777308464050293, + "num_tokens": 199561072.0, + 
"step": 5228 + }, + { + "epoch": 0.6651825467497774, + "ewc_loss": 0.03174680098891258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5873400116106495e-05, + "grad_norm": 24.21324348449707, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8637824058532715, + "num_tokens": 199595820.0, + "step": 5229 + }, + { + "epoch": 0.6653097570283679, + "ewc_loss": 0.03172456473112106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5862282452872023e-05, + "grad_norm": 24.01786994934082, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8448116779327393, + "num_tokens": 199640101.0, + "step": 5230 + }, + { + "epoch": 0.6654369673069584, + "ewc_loss": 0.03165967017412186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5829835319891572e-05, + "grad_norm": 24.106754302978516, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8541727066040039, + "num_tokens": 199680287.0, + "step": 5231 + }, + { + "epoch": 0.6655641775855489, + "ewc_loss": 0.031762465834617615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5881232684478164e-05, + "grad_norm": 24.14398956298828, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8572705984115601, + "num_tokens": 199721958.0, + "step": 5232 + }, + { + "epoch": 0.6656913878641394, + "ewc_loss": 0.03170885518193245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5854428056627512e-05, + "grad_norm": 24.147430419921875, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8639093637466431, + "num_tokens": 199756021.0, + "step": 5233 + }, + { + "epoch": 0.6658185981427299, + "ewc_loss": 0.031701236963272095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5850619092816487e-05, + "grad_norm": 24.095304489135742, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8627874851226807, + "num_tokens": 199793586.0, + "step": 5234 + }, + { + "epoch": 0.6659458084213205, + "ewc_loss": 0.03172392398118973, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.5861962310737e-05, + "grad_norm": 24.11191749572754, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8456419110298157, + "num_tokens": 199835728.0, + "step": 5235 + }, + { + "epoch": 0.666073018699911, + "ewc_loss": 0.031752150505781174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.587607584951911e-05, + "grad_norm": 24.118223190307617, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8680856823921204, + "num_tokens": 199873348.0, + "step": 5236 + }, + { + "epoch": 0.6662002289785015, + "ewc_loss": 0.03176531568169594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.588265695318114e-05, + "grad_norm": 24.15261459350586, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8581874370574951, + "num_tokens": 199908223.0, + "step": 5237 + }, + { + "epoch": 0.6663274392570919, + "ewc_loss": 0.031728774309158325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5864387023611926e-05, + "grad_norm": 24.08329963684082, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8542284965515137, + "num_tokens": 199947017.0, + "step": 5238 + }, + { + "epoch": 0.6664546495356825, + "ewc_loss": 0.03177900239825249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5889501810306683e-05, + "grad_norm": 24.164278030395508, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.866621732711792, + "num_tokens": 199984806.0, + "step": 5239 + }, + { + "epoch": 0.666581859814273, + "ewc_loss": 0.03179076686501503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5895382603048347e-05, + "grad_norm": 24.13737678527832, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8519877195358276, + "num_tokens": 200022600.0, + "step": 5240 + }, + { + "epoch": 0.6667090700928635, + "ewc_loss": 0.03170132264494896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.585066092957277e-05, + "grad_norm": 24.065711975097656, + "learning_rate": 1e-06, + "loss": 0.4117, + 
"mean_token_accuracy": 0.8678411841392517, + "num_tokens": 200058407.0, + "step": 5241 + }, + { + "epoch": 0.666836280371454, + "ewc_loss": 0.03176417946815491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5882089428487234e-05, + "grad_norm": 24.227760314941406, + "learning_rate": 1e-06, + "loss": 0.5373, + "mean_token_accuracy": 0.8314600586891174, + "num_tokens": 200101871.0, + "step": 5242 + }, + { + "epoch": 0.6669634906500446, + "ewc_loss": 0.031803883612155914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5901941878837533e-05, + "grad_norm": 24.12421226501465, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.859198808670044, + "num_tokens": 200138501.0, + "step": 5243 + }, + { + "epoch": 0.667090700928635, + "ewc_loss": 0.0317520946264267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5876046745688654e-05, + "grad_norm": 24.13520622253418, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8572604656219482, + "num_tokens": 200176639.0, + "step": 5244 + }, + { + "epoch": 0.6672179112072255, + "ewc_loss": 0.03176511079072952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.588255508977454e-05, + "grad_norm": 24.076946258544922, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.854356586933136, + "num_tokens": 200211873.0, + "step": 5245 + }, + { + "epoch": 0.667345121485816, + "ewc_loss": 0.031786710023880005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5893354429863393e-05, + "grad_norm": 24.17449951171875, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8542320728302002, + "num_tokens": 200246596.0, + "step": 5246 + }, + { + "epoch": 0.6674723317644066, + "ewc_loss": 0.03180224448442459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5901121514616534e-05, + "grad_norm": 24.08709716796875, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8739621043205261, + "num_tokens": 200282761.0, + "step": 5247 + }, + { + "epoch": 
0.6675995420429971, + "ewc_loss": 0.03178224340081215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5891122529865243e-05, + "grad_norm": 24.078630447387695, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.842532217502594, + "num_tokens": 200321821.0, + "step": 5248 + }, + { + "epoch": 0.6677267523215876, + "ewc_loss": 0.031861670315265656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5930834706523456e-05, + "grad_norm": 24.198408126831055, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8415943384170532, + "num_tokens": 200369066.0, + "step": 5249 + }, + { + "epoch": 0.667853962600178, + "ewc_loss": 0.031870774924755096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.593538763700053e-05, + "grad_norm": 24.140460968017578, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8501483201980591, + "num_tokens": 200408067.0, + "step": 5250 + }, + { + "epoch": 0.6679811728787686, + "ewc_loss": 0.031796932220458984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5898465790087357e-05, + "grad_norm": 24.164104461669922, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8744940757751465, + "num_tokens": 200446693.0, + "step": 5251 + }, + { + "epoch": 0.6681083831573591, + "ewc_loss": 0.031859952956438065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5929976143524982e-05, + "grad_norm": 24.1345272064209, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8387751579284668, + "num_tokens": 200485340.0, + "step": 5252 + }, + { + "epoch": 0.6682355934359496, + "ewc_loss": 0.03186899051070213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.593449451320339e-05, + "grad_norm": 24.238351821899414, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8549742698669434, + "num_tokens": 200521978.0, + "step": 5253 + }, + { + "epoch": 0.6683628037145402, + "ewc_loss": 0.03186716511845589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5933583199512213e-05, + "grad_norm": 24.128385543823242, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8597860336303711, + "num_tokens": 200560604.0, + "step": 5254 + }, + { + "epoch": 0.6684900139931307, + "ewc_loss": 0.03188055008649826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.594027526152786e-05, + "grad_norm": 24.129526138305664, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8538552522659302, + "num_tokens": 200601718.0, + "step": 5255 + }, + { + "epoch": 0.6686172242717211, + "ewc_loss": 0.03188652917742729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5943263861117885e-05, + "grad_norm": 24.151580810546875, + "learning_rate": 1e-06, + "loss": 0.5275, + "mean_token_accuracy": 0.833737850189209, + "num_tokens": 200646186.0, + "step": 5256 + }, + { + "epoch": 0.6687444345503116, + "ewc_loss": 0.03188219666481018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.594109744473826e-05, + "grad_norm": 24.272123336791992, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.860968291759491, + "num_tokens": 200684849.0, + "step": 5257 + }, + { + "epoch": 0.6688716448289022, + "ewc_loss": 0.03189685195684433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5948426153045148e-05, + "grad_norm": 24.098312377929688, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8634727597236633, + "num_tokens": 200719542.0, + "step": 5258 + }, + { + "epoch": 0.6689988551074927, + "ewc_loss": 0.031944915652275085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5972458641044796e-05, + "grad_norm": 24.3142147064209, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8594207167625427, + "num_tokens": 200756212.0, + "step": 5259 + }, + { + "epoch": 0.6691260653860832, + "ewc_loss": 0.031902872025966644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5951436580508016e-05, + "grad_norm": 24.105499267578125, + "learning_rate": 1e-06, + "loss": 0.4444, + 
"mean_token_accuracy": 0.8605101704597473, + "num_tokens": 200791184.0, + "step": 5260 + }, + { + "epoch": 0.6692532756646737, + "ewc_loss": 0.031881995499134064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5940997400321066e-05, + "grad_norm": 24.281137466430664, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8411358594894409, + "num_tokens": 200823966.0, + "step": 5261 + }, + { + "epoch": 0.6693804859432643, + "ewc_loss": 0.03190474212169647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.595237154106144e-05, + "grad_norm": 24.21538734436035, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8626874685287476, + "num_tokens": 200862718.0, + "step": 5262 + }, + { + "epoch": 0.6695076962218547, + "ewc_loss": 0.031866248697042465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.593312481418252e-05, + "grad_norm": 24.105018615722656, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8530234098434448, + "num_tokens": 200901755.0, + "step": 5263 + }, + { + "epoch": 0.6696349065004452, + "ewc_loss": 0.031851425766944885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.592571243236307e-05, + "grad_norm": 24.30560302734375, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8697168827056885, + "num_tokens": 200935788.0, + "step": 5264 + }, + { + "epoch": 0.6697621167790357, + "ewc_loss": 0.031870704144239426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.593535125721246e-05, + "grad_norm": 24.07206916809082, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8721848130226135, + "num_tokens": 200975159.0, + "step": 5265 + }, + { + "epoch": 0.6698893270576263, + "ewc_loss": 0.03182506561279297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5912532035144977e-05, + "grad_norm": 24.294536590576172, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8700944185256958, + "num_tokens": 201010813.0, + "step": 5266 + }, + { + "epoch": 
0.6700165373362168, + "ewc_loss": 0.03185448423027992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5927242202451453e-05, + "grad_norm": 24.159055709838867, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8562003374099731, + "num_tokens": 201051937.0, + "step": 5267 + }, + { + "epoch": 0.6701437476148073, + "ewc_loss": 0.03186948597431183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5934743714751676e-05, + "grad_norm": 24.38612174987793, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8646367192268372, + "num_tokens": 201090926.0, + "step": 5268 + }, + { + "epoch": 0.6702709578933977, + "ewc_loss": 0.03181556984782219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5907784472801723e-05, + "grad_norm": 24.366968154907227, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.837714433670044, + "num_tokens": 201126086.0, + "step": 5269 + }, + { + "epoch": 0.6703981681719883, + "ewc_loss": 0.03178735822439194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5893678209977224e-05, + "grad_norm": 24.273059844970703, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.847158670425415, + "num_tokens": 201165933.0, + "step": 5270 + }, + { + "epoch": 0.6705253784505788, + "ewc_loss": 0.031792037189006805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5896019249339588e-05, + "grad_norm": 24.412193298339844, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.861178994178772, + "num_tokens": 201198794.0, + "step": 5271 + }, + { + "epoch": 0.6706525887291693, + "ewc_loss": 0.03169562667608261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.584781421115622e-05, + "grad_norm": 24.05706787109375, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8493581414222717, + "num_tokens": 201234081.0, + "step": 5272 + }, + { + "epoch": 0.6707797990077599, + "ewc_loss": 0.03169381618499756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5846908354433253e-05, + "grad_norm": 24.248958587646484, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8544603586196899, + "num_tokens": 201271796.0, + "step": 5273 + }, + { + "epoch": 0.6709070092863504, + "ewc_loss": 0.03181898593902588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5909492503851652e-05, + "grad_norm": 24.234586715698242, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8404721021652222, + "num_tokens": 201309526.0, + "step": 5274 + }, + { + "epoch": 0.6710342195649408, + "ewc_loss": 0.03173952177166939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.586976031831e-05, + "grad_norm": 24.098003387451172, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8653165102005005, + "num_tokens": 201344015.0, + "step": 5275 + }, + { + "epoch": 0.6711614298435313, + "ewc_loss": 0.03178102895617485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5890514987404458e-05, + "grad_norm": 24.266395568847656, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8579517602920532, + "num_tokens": 201385201.0, + "step": 5276 + }, + { + "epoch": 0.6712886401221219, + "ewc_loss": 0.03177141770720482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.588570921740029e-05, + "grad_norm": 24.043170928955078, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8630765676498413, + "num_tokens": 201422370.0, + "step": 5277 + }, + { + "epoch": 0.6714158504007124, + "ewc_loss": 0.03175140172243118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.587570113770198e-05, + "grad_norm": 24.09648895263672, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8428789973258972, + "num_tokens": 201467085.0, + "step": 5278 + }, + { + "epoch": 0.6715430606793029, + "ewc_loss": 0.03180303797125816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.590151987329591e-05, + "grad_norm": 24.105356216430664, + "learning_rate": 1e-06, + "loss": 0.4034, + 
"mean_token_accuracy": 0.8724610805511475, + "num_tokens": 201503432.0, + "step": 5279 + }, + { + "epoch": 0.6716702709578934, + "ewc_loss": 0.03183049336075783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5915245967335068e-05, + "grad_norm": 24.153797149658203, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.865371584892273, + "num_tokens": 201541358.0, + "step": 5280 + }, + { + "epoch": 0.6717974812364839, + "ewc_loss": 0.03184634447097778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.592317312315572e-05, + "grad_norm": 24.15873146057129, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8549901843070984, + "num_tokens": 201580235.0, + "step": 5281 + }, + { + "epoch": 0.6719246915150744, + "ewc_loss": 0.031915102154016495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5957550203893334e-05, + "grad_norm": 24.179182052612305, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8597468137741089, + "num_tokens": 201627703.0, + "step": 5282 + }, + { + "epoch": 0.6720519017936649, + "ewc_loss": 0.03185258060693741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5926290870993398e-05, + "grad_norm": 24.273256301879883, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8599615097045898, + "num_tokens": 201667182.0, + "step": 5283 + }, + { + "epoch": 0.6721791120722554, + "ewc_loss": 0.031866732984781265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.593336673977319e-05, + "grad_norm": 24.26299476623535, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8602389693260193, + "num_tokens": 201699876.0, + "step": 5284 + }, + { + "epoch": 0.672306322350846, + "ewc_loss": 0.03183235228061676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.591617547092028e-05, + "grad_norm": 24.190017700195312, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8477141857147217, + "num_tokens": 201732770.0, + "step": 5285 + }, + { + "epoch": 
0.6724335326294365, + "ewc_loss": 0.03186793997883797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.593397064425517e-05, + "grad_norm": 24.204410552978516, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8595917224884033, + "num_tokens": 201771388.0, + "step": 5286 + }, + { + "epoch": 0.6725607429080269, + "ewc_loss": 0.03188671916723251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5943360267556272e-05, + "grad_norm": 24.428741455078125, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8523534536361694, + "num_tokens": 201809283.0, + "step": 5287 + }, + { + "epoch": 0.6726879531866174, + "ewc_loss": 0.03182410076260567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5912050002953038e-05, + "grad_norm": 24.240318298339844, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8643741607666016, + "num_tokens": 201854950.0, + "step": 5288 + }, + { + "epoch": 0.672815163465208, + "ewc_loss": 0.03179647773504257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5898238416411914e-05, + "grad_norm": 24.241119384765625, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8619544506072998, + "num_tokens": 201895638.0, + "step": 5289 + }, + { + "epoch": 0.6729423737437985, + "ewc_loss": 0.03177833557128906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.588916711625643e-05, + "grad_norm": 24.192811965942383, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8441683053970337, + "num_tokens": 201928243.0, + "step": 5290 + }, + { + "epoch": 0.673069584022389, + "ewc_loss": 0.03179166093468666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.589583007444162e-05, + "grad_norm": 24.358449935913086, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8578009605407715, + "num_tokens": 201964687.0, + "step": 5291 + }, + { + "epoch": 0.6731967943009796, + "ewc_loss": 0.03185680881142616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.592840453668032e-05, + "grad_norm": 24.2279109954834, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8700617551803589, + "num_tokens": 201997166.0, + "step": 5292 + }, + { + "epoch": 0.67332400457957, + "ewc_loss": 0.0318068228662014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.59034116222756e-05, + "grad_norm": 24.13294219970703, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.860472559928894, + "num_tokens": 202030771.0, + "step": 5293 + }, + { + "epoch": 0.6734512148581605, + "ewc_loss": 0.03185359016060829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.592679473105818e-05, + "grad_norm": 24.24446678161621, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8509819507598877, + "num_tokens": 202077280.0, + "step": 5294 + }, + { + "epoch": 0.673578425136751, + "ewc_loss": 0.03183233365416527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5916166375973262e-05, + "grad_norm": 24.138490676879883, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8499456644058228, + "num_tokens": 202115273.0, + "step": 5295 + }, + { + "epoch": 0.6737056354153416, + "ewc_loss": 0.031829770654439926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5914885807433166e-05, + "grad_norm": 24.290250778198242, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8685382604598999, + "num_tokens": 202145347.0, + "step": 5296 + }, + { + "epoch": 0.6738328456939321, + "ewc_loss": 0.03188741207122803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5943705875542946e-05, + "grad_norm": 24.165287017822266, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8627166152000427, + "num_tokens": 202184621.0, + "step": 5297 + }, + { + "epoch": 0.6739600559725226, + "ewc_loss": 0.031832050532102585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5916024494799785e-05, + "grad_norm": 24.195009231567383, + "learning_rate": 1e-06, + "loss": 0.4489, + 
"mean_token_accuracy": 0.8616315722465515, + "num_tokens": 202230099.0, + "step": 5298 + }, + { + "epoch": 0.674087266251113, + "ewc_loss": 0.03188970312476158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.594485183886718e-05, + "grad_norm": 24.20513153076172, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8520165681838989, + "num_tokens": 202267872.0, + "step": 5299 + }, + { + "epoch": 0.6742144765297036, + "ewc_loss": 0.03186531737446785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5932659152895212e-05, + "grad_norm": 24.206897735595703, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8560813665390015, + "num_tokens": 202305517.0, + "step": 5300 + }, + { + "epoch": 0.6743416868082941, + "ewc_loss": 0.03182728961110115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5913645256659947e-05, + "grad_norm": 24.099655151367188, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.8441821336746216, + "num_tokens": 202340671.0, + "step": 5301 + }, + { + "epoch": 0.6744688970868846, + "ewc_loss": 0.03187105432152748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5935527699184604e-05, + "grad_norm": 24.11859130859375, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8656362891197205, + "num_tokens": 202379552.0, + "step": 5302 + }, + { + "epoch": 0.6745961073654752, + "ewc_loss": 0.03191996365785599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5959982192725874e-05, + "grad_norm": 24.381732940673828, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8521926403045654, + "num_tokens": 202421404.0, + "step": 5303 + }, + { + "epoch": 0.6747233176440657, + "ewc_loss": 0.031919192522764206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5959596566972323e-05, + "grad_norm": 24.180286407470703, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8441993594169617, + "num_tokens": 202463765.0, + "step": 5304 + }, + { + "epoch": 
0.6748505279226561, + "ewc_loss": 0.03177858516573906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5889292626525275e-05, + "grad_norm": 24.261783599853516, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8533123731613159, + "num_tokens": 202505872.0, + "step": 5305 + }, + { + "epoch": 0.6749777382012466, + "ewc_loss": 0.03193534165620804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5967671060934663e-05, + "grad_norm": 24.294418334960938, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8560693264007568, + "num_tokens": 202544085.0, + "step": 5306 + }, + { + "epoch": 0.6751049484798372, + "ewc_loss": 0.03179067373275757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5895337128313258e-05, + "grad_norm": 24.290212631225586, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8545545339584351, + "num_tokens": 202588237.0, + "step": 5307 + }, + { + "epoch": 0.6752321587584277, + "ewc_loss": 0.03182294964790344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5911475202301517e-05, + "grad_norm": 24.250244140625, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8519832491874695, + "num_tokens": 202628462.0, + "step": 5308 + }, + { + "epoch": 0.6753593690370182, + "ewc_loss": 0.03178875148296356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5894374882918783e-05, + "grad_norm": 24.205354690551758, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.859926700592041, + "num_tokens": 202668149.0, + "step": 5309 + }, + { + "epoch": 0.6754865793156087, + "ewc_loss": 0.03180795907974243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5903979146969505e-05, + "grad_norm": 24.14157485961914, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8731687664985657, + "num_tokens": 202708651.0, + "step": 5310 + }, + { + "epoch": 0.6756137895941993, + "ewc_loss": 0.03180510550737381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.5902553059277125e-05, + "grad_norm": 24.199460983276367, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8625850081443787, + "num_tokens": 202738613.0, + "step": 5311 + }, + { + "epoch": 0.6757409998727897, + "ewc_loss": 0.03183218091726303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5916089978418313e-05, + "grad_norm": 24.21949577331543, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8540753722190857, + "num_tokens": 202779279.0, + "step": 5312 + }, + { + "epoch": 0.6758682101513802, + "ewc_loss": 0.031906045973300934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5953022739267908e-05, + "grad_norm": 24.28439712524414, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8582957983016968, + "num_tokens": 202812095.0, + "step": 5313 + }, + { + "epoch": 0.6759954204299707, + "ewc_loss": 0.03181315213441849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5906576663837768e-05, + "grad_norm": 24.11725616455078, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8533365726470947, + "num_tokens": 202855705.0, + "step": 5314 + }, + { + "epoch": 0.6761226307085613, + "ewc_loss": 0.03189347684383392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5946738130878657e-05, + "grad_norm": 24.173053741455078, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8407764434814453, + "num_tokens": 202894180.0, + "step": 5315 + }, + { + "epoch": 0.6762498409871518, + "ewc_loss": 0.03188320994377136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.594160494278185e-05, + "grad_norm": 24.186965942382812, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.864326000213623, + "num_tokens": 202925842.0, + "step": 5316 + }, + { + "epoch": 0.6763770512657423, + "ewc_loss": 0.031846001744270325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5923000319162384e-05, + "grad_norm": 24.142515182495117, + "learning_rate": 1e-06, + "loss": 0.4207, + 
"mean_token_accuracy": 0.86418616771698, + "num_tokens": 202959103.0, + "step": 5317 + }, + { + "epoch": 0.6765042615443327, + "ewc_loss": 0.031906161457300186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.595308094692882e-05, + "grad_norm": 24.094358444213867, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8577936291694641, + "num_tokens": 203000393.0, + "step": 5318 + }, + { + "epoch": 0.6766314718229233, + "ewc_loss": 0.031948309391736984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.59741539391689e-05, + "grad_norm": 24.200700759887695, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8589617609977722, + "num_tokens": 203041613.0, + "step": 5319 + }, + { + "epoch": 0.6767586821015138, + "ewc_loss": 0.032008010894060135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.600400537427049e-05, + "grad_norm": 24.28908348083496, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8720003962516785, + "num_tokens": 203073744.0, + "step": 5320 + }, + { + "epoch": 0.6768858923801043, + "ewc_loss": 0.03192118555307388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5960593373165466e-05, + "grad_norm": 24.132362365722656, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8430727124214172, + "num_tokens": 203108350.0, + "step": 5321 + }, + { + "epoch": 0.6770131026586949, + "ewc_loss": 0.031923796981573105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5961899407557212e-05, + "grad_norm": 24.284561157226562, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8611341714859009, + "num_tokens": 203139149.0, + "step": 5322 + }, + { + "epoch": 0.6771403129372854, + "ewc_loss": 0.03202268108725548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6011341358534992e-05, + "grad_norm": 24.238794326782227, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8394200801849365, + "num_tokens": 203173953.0, + "step": 5323 + }, + { + "epoch": 
0.6772675232158758, + "ewc_loss": 0.031981900334358215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5990950487321243e-05, + "grad_norm": 24.293659210205078, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8681173324584961, + "num_tokens": 203208915.0, + "step": 5324 + }, + { + "epoch": 0.6773947334944663, + "ewc_loss": 0.031993210315704346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5996605725376867e-05, + "grad_norm": 24.226551055908203, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8452358245849609, + "num_tokens": 203251906.0, + "step": 5325 + }, + { + "epoch": 0.6775219437730569, + "ewc_loss": 0.03195236250758171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5976182112353854e-05, + "grad_norm": 24.265003204345703, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8582804203033447, + "num_tokens": 203289460.0, + "step": 5326 + }, + { + "epoch": 0.6776491540516474, + "ewc_loss": 0.03198906034231186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.599453025846742e-05, + "grad_norm": 24.219160079956055, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8657406568527222, + "num_tokens": 203325647.0, + "step": 5327 + }, + { + "epoch": 0.6777763643302379, + "ewc_loss": 0.031932469457387924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.596623405930586e-05, + "grad_norm": 24.208478927612305, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8536222577095032, + "num_tokens": 203364972.0, + "step": 5328 + }, + { + "epoch": 0.6779035746088284, + "ewc_loss": 0.03202227130532265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.601113581273239e-05, + "grad_norm": 24.305477142333984, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8581909537315369, + "num_tokens": 203407778.0, + "step": 5329 + }, + { + "epoch": 0.6780307848874189, + "ewc_loss": 0.03200384974479675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6001924450392835e-05, + "grad_norm": 24.30353546142578, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8582291603088379, + "num_tokens": 203443820.0, + "step": 5330 + }, + { + "epoch": 0.6781579951660094, + "ewc_loss": 0.03196999430656433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5984996935003437e-05, + "grad_norm": 24.114809036254883, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8635601997375488, + "num_tokens": 203475742.0, + "step": 5331 + }, + { + "epoch": 0.6782852054445999, + "ewc_loss": 0.031978681683540344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5989340681699105e-05, + "grad_norm": 24.370378494262695, + "learning_rate": 1e-06, + "loss": 0.5359, + "mean_token_accuracy": 0.8327783942222595, + "num_tokens": 203512617.0, + "step": 5332 + }, + { + "epoch": 0.6784124157231904, + "ewc_loss": 0.03203044831752777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6015224900911562e-05, + "grad_norm": 24.099647521972656, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8430556058883667, + "num_tokens": 203546778.0, + "step": 5333 + }, + { + "epoch": 0.678539626001781, + "ewc_loss": 0.03192855790257454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.596427864569705e-05, + "grad_norm": 24.270315170288086, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8588181734085083, + "num_tokens": 203585222.0, + "step": 5334 + }, + { + "epoch": 0.6786668362803715, + "ewc_loss": 0.03201896324753761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6009482351364568e-05, + "grad_norm": 24.188587188720703, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8517609238624573, + "num_tokens": 203627014.0, + "step": 5335 + }, + { + "epoch": 0.6787940465589619, + "ewc_loss": 0.03200171887874603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.600086034159176e-05, + "grad_norm": 24.267852783203125, + "learning_rate": 1e-06, + "loss": 0.4307, + 
"mean_token_accuracy": 0.864047110080719, + "num_tokens": 203659189.0, + "step": 5336 + }, + { + "epoch": 0.6789212568375524, + "ewc_loss": 0.03205755352973938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.602877637196798e-05, + "grad_norm": 24.133577346801758, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8686032295227051, + "num_tokens": 203695355.0, + "step": 5337 + }, + { + "epoch": 0.679048467116143, + "ewc_loss": 0.03199722245335579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5998612070688978e-05, + "grad_norm": 24.319801330566406, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8666563034057617, + "num_tokens": 203733446.0, + "step": 5338 + }, + { + "epoch": 0.6791756773947335, + "ewc_loss": 0.03208985552191734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6044927178882062e-05, + "grad_norm": 24.19169044494629, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8422917127609253, + "num_tokens": 203775387.0, + "step": 5339 + }, + { + "epoch": 0.679302887673324, + "ewc_loss": 0.032000187784433365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6000094547052868e-05, + "grad_norm": 24.317705154418945, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8529039621353149, + "num_tokens": 203810713.0, + "step": 5340 + }, + { + "epoch": 0.6794300979519146, + "ewc_loss": 0.03198390454053879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5991952750482596e-05, + "grad_norm": 24.10182762145996, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.870342493057251, + "num_tokens": 203847144.0, + "step": 5341 + }, + { + "epoch": 0.679557308230505, + "ewc_loss": 0.03200288116931915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.600144059921149e-05, + "grad_norm": 24.359804153442383, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8636243343353271, + "num_tokens": 203889716.0, + "step": 5342 + }, + { + "epoch": 
0.6796845185090955, + "ewc_loss": 0.03207909315824509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6039546608226374e-05, + "grad_norm": 24.18338966369629, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8445780277252197, + "num_tokens": 203927692.0, + "step": 5343 + }, + { + "epoch": 0.679811728787686, + "ewc_loss": 0.03196542710065842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5982714103301987e-05, + "grad_norm": 24.26805877685547, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8434358835220337, + "num_tokens": 203960180.0, + "step": 5344 + }, + { + "epoch": 0.6799389390662766, + "ewc_loss": 0.03208400681614876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6042004062910564e-05, + "grad_norm": 24.21923828125, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8691931962966919, + "num_tokens": 203997375.0, + "step": 5345 + }, + { + "epoch": 0.6800661493448671, + "ewc_loss": 0.03202173858880997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.601086842129007e-05, + "grad_norm": 24.27497673034668, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8542912602424622, + "num_tokens": 204037588.0, + "step": 5346 + }, + { + "epoch": 0.6801933596234576, + "ewc_loss": 0.03200765699148178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6003828932298347e-05, + "grad_norm": 24.190208435058594, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.8504905104637146, + "num_tokens": 204080993.0, + "step": 5347 + }, + { + "epoch": 0.680320569902048, + "ewc_loss": 0.03199627250432968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.599813549546525e-05, + "grad_norm": 24.269264221191406, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8602913618087769, + "num_tokens": 204116863.0, + "step": 5348 + }, + { + "epoch": 0.6804477801806386, + "ewc_loss": 0.032077692449092865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.603884629730601e-05, + 
"grad_norm": 24.246030807495117, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.851401686668396, + "num_tokens": 204156551.0, + "step": 5349 + }, + { + "epoch": 0.6805749904592291, + "ewc_loss": 0.03201989457011223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6009948012651876e-05, + "grad_norm": 24.192108154296875, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8518989086151123, + "num_tokens": 204196210.0, + "step": 5350 + }, + { + "epoch": 0.6807022007378196, + "ewc_loss": 0.032007038593292236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6003519704099745e-05, + "grad_norm": 24.293834686279297, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8561538457870483, + "num_tokens": 204234791.0, + "step": 5351 + }, + { + "epoch": 0.6808294110164101, + "ewc_loss": 0.03202122077345848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6010610124794766e-05, + "grad_norm": 24.17453384399414, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8605443835258484, + "num_tokens": 204271631.0, + "step": 5352 + }, + { + "epoch": 0.6809566212950007, + "ewc_loss": 0.03201829269528389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6009145838324912e-05, + "grad_norm": 24.239120483398438, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.868499219417572, + "num_tokens": 204311879.0, + "step": 5353 + }, + { + "epoch": 0.6810838315735911, + "ewc_loss": 0.03202524781227112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.601262374606449e-05, + "grad_norm": 24.161134719848633, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8608155250549316, + "num_tokens": 204357123.0, + "step": 5354 + }, + { + "epoch": 0.6812110418521816, + "ewc_loss": 0.03203491121530533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.601745498192031e-05, + "grad_norm": 24.29698371887207, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8557906150817871, 
+ "num_tokens": 204397196.0, + "step": 5355 + }, + { + "epoch": 0.6813382521307721, + "ewc_loss": 0.032057225704193115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.602861266292166e-05, + "grad_norm": 24.205673217773438, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8480550050735474, + "num_tokens": 204434654.0, + "step": 5356 + }, + { + "epoch": 0.6814654624093627, + "ewc_loss": 0.03200466185808182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6002330085029826e-05, + "grad_norm": 24.203125, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8807467222213745, + "num_tokens": 204466581.0, + "step": 5357 + }, + { + "epoch": 0.6815926726879532, + "ewc_loss": 0.032048359513282776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.602417978574522e-05, + "grad_norm": 24.358158111572266, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8703370094299316, + "num_tokens": 204503890.0, + "step": 5358 + }, + { + "epoch": 0.6817198829665437, + "ewc_loss": 0.03201510012149811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6007550584618002e-05, + "grad_norm": 24.204055786132812, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8533773422241211, + "num_tokens": 204537160.0, + "step": 5359 + }, + { + "epoch": 0.6818470932451343, + "ewc_loss": 0.03197583928704262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5987920050974935e-05, + "grad_norm": 24.28537940979004, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8678717613220215, + "num_tokens": 204570077.0, + "step": 5360 + }, + { + "epoch": 0.6819743035237247, + "ewc_loss": 0.032012928277254105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6006464647944085e-05, + "grad_norm": 24.334550857543945, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8728694915771484, + "num_tokens": 204603638.0, + "step": 5361 + }, + { + "epoch": 0.6821015138023152, + "ewc_loss": 0.03201477229595184, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6007386875571683e-05, + "grad_norm": 24.300031661987305, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.854171633720398, + "num_tokens": 204644584.0, + "step": 5362 + }, + { + "epoch": 0.6822287240809057, + "ewc_loss": 0.03202275559306145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6011377738323063e-05, + "grad_norm": 24.30912208557129, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8448201417922974, + "num_tokens": 204678459.0, + "step": 5363 + }, + { + "epoch": 0.6823559343594963, + "ewc_loss": 0.031895190477371216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.5947594874887727e-05, + "grad_norm": 24.114290237426758, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8557045459747314, + "num_tokens": 204712923.0, + "step": 5364 + }, + { + "epoch": 0.6824831446380868, + "ewc_loss": 0.032012272626161575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6006137229851447e-05, + "grad_norm": 24.250051498413086, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.8401859402656555, + "num_tokens": 204751303.0, + "step": 5365 + }, + { + "epoch": 0.6826103549166773, + "ewc_loss": 0.03205394372344017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.602697193447966e-05, + "grad_norm": 24.12991714477539, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8578263521194458, + "num_tokens": 204791685.0, + "step": 5366 + }, + { + "epoch": 0.6827375651952677, + "ewc_loss": 0.03205680847167969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6028403479140252e-05, + "grad_norm": 24.280853271484375, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8586517572402954, + "num_tokens": 204827748.0, + "step": 5367 + }, + { + "epoch": 0.6828647754738583, + "ewc_loss": 0.032117221504449844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6058611436164938e-05, + "grad_norm": 24.235538482666016, + 
"learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8733272552490234, + "num_tokens": 204866568.0, + "step": 5368 + }, + { + "epoch": 0.6829919857524488, + "ewc_loss": 0.03206374868750572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6031874110922217e-05, + "grad_norm": 24.36720085144043, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8574737310409546, + "num_tokens": 204908334.0, + "step": 5369 + }, + { + "epoch": 0.6831191960310393, + "ewc_loss": 0.03208106756210327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.60405343194725e-05, + "grad_norm": 24.183456420898438, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.861536979675293, + "num_tokens": 204949929.0, + "step": 5370 + }, + { + "epoch": 0.6832464063096299, + "ewc_loss": 0.03207907825708389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.603953933226876e-05, + "grad_norm": 24.392744064331055, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8485084772109985, + "num_tokens": 204991435.0, + "step": 5371 + }, + { + "epoch": 0.6833736165882204, + "ewc_loss": 0.032106295228004456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6053147191996686e-05, + "grad_norm": 24.267545700073242, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.864610493183136, + "num_tokens": 205028434.0, + "step": 5372 + }, + { + "epoch": 0.6835008268668108, + "ewc_loss": 0.03202715888619423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6013578715501353e-05, + "grad_norm": 24.249860763549805, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8519994020462036, + "num_tokens": 205064138.0, + "step": 5373 + }, + { + "epoch": 0.6836280371454013, + "ewc_loss": 0.032076645642519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6038322428357787e-05, + "grad_norm": 24.31817626953125, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8510006666183472, + "num_tokens": 205104058.0, + 
"step": 5374 + }, + { + "epoch": 0.6837552474239919, + "ewc_loss": 0.0320667065680027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.60333529493073e-05, + "grad_norm": 24.194477081298828, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8568034768104553, + "num_tokens": 205142609.0, + "step": 5375 + }, + { + "epoch": 0.6838824577025824, + "ewc_loss": 0.032029613852500916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6014806533348747e-05, + "grad_norm": 24.365610122680664, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8685812950134277, + "num_tokens": 205180407.0, + "step": 5376 + }, + { + "epoch": 0.6840096679811729, + "ewc_loss": 0.032090768218040466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6045383745222352e-05, + "grad_norm": 24.238101959228516, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.878993570804596, + "num_tokens": 205217340.0, + "step": 5377 + }, + { + "epoch": 0.6841368782597634, + "ewc_loss": 0.03203584998846054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6017924281186424e-05, + "grad_norm": 24.23089027404785, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.8459610342979431, + "num_tokens": 205263671.0, + "step": 5378 + }, + { + "epoch": 0.6842640885383539, + "ewc_loss": 0.032040201127529144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6020099792513065e-05, + "grad_norm": 24.211097717285156, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.854451596736908, + "num_tokens": 205302970.0, + "step": 5379 + }, + { + "epoch": 0.6843912988169444, + "ewc_loss": 0.0320480577647686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6024028809624724e-05, + "grad_norm": 24.315326690673828, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8611286878585815, + "num_tokens": 205342247.0, + "step": 5380 + }, + { + "epoch": 0.6845185090955349, + "ewc_loss": 0.03203507885336876, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6017538655432872e-05, + "grad_norm": 24.286977767944336, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8438193202018738, + "num_tokens": 205374897.0, + "step": 5381 + }, + { + "epoch": 0.6846457193741254, + "ewc_loss": 0.03203679248690605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6018395399441943e-05, + "grad_norm": 24.2581844329834, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8544530868530273, + "num_tokens": 205411645.0, + "step": 5382 + }, + { + "epoch": 0.684772929652716, + "ewc_loss": 0.03204072266817093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6020361726987176e-05, + "grad_norm": 24.269973754882812, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.857473611831665, + "num_tokens": 205450972.0, + "step": 5383 + }, + { + "epoch": 0.6849001399313065, + "ewc_loss": 0.03205173462629318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.602586780791171e-05, + "grad_norm": 24.251548767089844, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8595435619354248, + "num_tokens": 205494314.0, + "step": 5384 + }, + { + "epoch": 0.6850273502098969, + "ewc_loss": 0.03208095580339432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.604047793080099e-05, + "grad_norm": 24.312070846557617, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8448097705841064, + "num_tokens": 205534696.0, + "step": 5385 + }, + { + "epoch": 0.6851545604884874, + "ewc_loss": 0.032012756913900375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.600637915544212e-05, + "grad_norm": 24.29540252685547, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8701871633529663, + "num_tokens": 205575579.0, + "step": 5386 + }, + { + "epoch": 0.685281770767078, + "ewc_loss": 0.03204304352402687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6021522242226638e-05, + "grad_norm": 24.293590545654297, + "learning_rate": 1e-06, + "loss": 0.4389, 
+ "mean_token_accuracy": 0.860468864440918, + "num_tokens": 205609249.0, + "step": 5387 + }, + { + "epoch": 0.6854089810456685, + "ewc_loss": 0.03207017853856087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6035090084187686e-05, + "grad_norm": 24.28245735168457, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8548896312713623, + "num_tokens": 205644135.0, + "step": 5388 + }, + { + "epoch": 0.685536191324259, + "ewc_loss": 0.03208969160914421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6044845324358903e-05, + "grad_norm": 24.341142654418945, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8511389493942261, + "num_tokens": 205685312.0, + "step": 5389 + }, + { + "epoch": 0.6856634016028496, + "ewc_loss": 0.03208558261394501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.60427916853223e-05, + "grad_norm": 24.29182243347168, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8532017469406128, + "num_tokens": 205720525.0, + "step": 5390 + }, + { + "epoch": 0.68579061188144, + "ewc_loss": 0.0320560559630394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.602802876732312e-05, + "grad_norm": 24.343265533447266, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8386962413787842, + "num_tokens": 205763358.0, + "step": 5391 + }, + { + "epoch": 0.6859178221600305, + "ewc_loss": 0.03208582475781441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6042911738622934e-05, + "grad_norm": 24.262229919433594, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8619097471237183, + "num_tokens": 205801413.0, + "step": 5392 + }, + { + "epoch": 0.686045032438621, + "ewc_loss": 0.03204837441444397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6024187061702833e-05, + "grad_norm": 24.252559661865234, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8589084148406982, + "num_tokens": 205840442.0, + "step": 5393 + }, + { + "epoch": 
0.6861722427172116, + "ewc_loss": 0.03212006017565727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6060030247899704e-05, + "grad_norm": 24.224075317382812, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8708722591400146, + "num_tokens": 205877319.0, + "step": 5394 + }, + { + "epoch": 0.6862994529958021, + "ewc_loss": 0.03213633969426155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.606817022548057e-05, + "grad_norm": 24.338134765625, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8565164804458618, + "num_tokens": 205916186.0, + "step": 5395 + }, + { + "epoch": 0.6864266632743926, + "ewc_loss": 0.032144945114851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6072472135419957e-05, + "grad_norm": 24.274078369140625, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.846764862537384, + "num_tokens": 205956336.0, + "step": 5396 + }, + { + "epoch": 0.686553873552983, + "ewc_loss": 0.03211873397231102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.605936631676741e-05, + "grad_norm": 24.342077255249023, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8547781705856323, + "num_tokens": 205989570.0, + "step": 5397 + }, + { + "epoch": 0.6866810838315736, + "ewc_loss": 0.032155610620975494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6077805412351154e-05, + "grad_norm": 24.351974487304688, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8494242429733276, + "num_tokens": 206026139.0, + "step": 5398 + }, + { + "epoch": 0.6868082941101641, + "ewc_loss": 0.03207246959209442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6036234228522517e-05, + "grad_norm": 24.316892623901367, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8512638211250305, + "num_tokens": 206070059.0, + "step": 5399 + }, + { + "epoch": 0.6869355043887546, + "ewc_loss": 0.03210050240159035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.605025136086624e-05, + "grad_norm": 24.26811408996582, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8612839579582214, + "num_tokens": 206104645.0, + "step": 5400 + }, + { + "epoch": 0.6870627146673451, + "ewc_loss": 0.03205883502960205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6029416656238027e-05, + "grad_norm": 24.258285522460938, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8554620742797852, + "num_tokens": 206141220.0, + "step": 5401 + }, + { + "epoch": 0.6871899249459357, + "ewc_loss": 0.0321212075650692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.606060322956182e-05, + "grad_norm": 24.226787567138672, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.856162428855896, + "num_tokens": 206177917.0, + "step": 5402 + }, + { + "epoch": 0.6873171352245261, + "ewc_loss": 0.03209567815065384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.604783938091714e-05, + "grad_norm": 24.20174217224121, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8560476303100586, + "num_tokens": 206213419.0, + "step": 5403 + }, + { + "epoch": 0.6874443455031166, + "ewc_loss": 0.032181598246097565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6090798453660682e-05, + "grad_norm": 24.333833694458008, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8418619632720947, + "num_tokens": 206252421.0, + "step": 5404 + }, + { + "epoch": 0.6875715557817071, + "ewc_loss": 0.03216614946722984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6083075024653226e-05, + "grad_norm": 24.346435546875, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8628056049346924, + "num_tokens": 206287195.0, + "step": 5405 + }, + { + "epoch": 0.6876987660602977, + "ewc_loss": 0.03213764354586601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.606882142368704e-05, + "grad_norm": 24.25537872314453, + "learning_rate": 1e-06, + "loss": 0.5246, + 
"mean_token_accuracy": 0.8381646275520325, + "num_tokens": 206328339.0, + "step": 5406 + }, + { + "epoch": 0.6878259763388882, + "ewc_loss": 0.032136220484972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6068110198830254e-05, + "grad_norm": 24.294219970703125, + "learning_rate": 1e-06, + "loss": 0.5106, + "mean_token_accuracy": 0.8424698710441589, + "num_tokens": 206369523.0, + "step": 5407 + }, + { + "epoch": 0.6879531866174787, + "ewc_loss": 0.032175712287425995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6087855328805745e-05, + "grad_norm": 24.300113677978516, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8766797780990601, + "num_tokens": 206402864.0, + "step": 5408 + }, + { + "epoch": 0.6880803968960693, + "ewc_loss": 0.032141439616680145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6070720448624343e-05, + "grad_norm": 24.31905746459961, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8451679348945618, + "num_tokens": 206439004.0, + "step": 5409 + }, + { + "epoch": 0.6882076071746597, + "ewc_loss": 0.03214429318904877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6072146536316723e-05, + "grad_norm": 24.25397491455078, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8589462637901306, + "num_tokens": 206472320.0, + "step": 5410 + }, + { + "epoch": 0.6883348174532502, + "ewc_loss": 0.0321466289460659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.60733143275138e-05, + "grad_norm": 24.221223831176758, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8500381708145142, + "num_tokens": 206518649.0, + "step": 5411 + }, + { + "epoch": 0.6884620277318407, + "ewc_loss": 0.0321633443236351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6081672583823092e-05, + "grad_norm": 24.270517349243164, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8719423413276672, + "num_tokens": 206551318.0, + "step": 5412 + }, + { + "epoch": 
0.6885892380104313, + "ewc_loss": 0.0321509912610054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.607549529580865e-05, + "grad_norm": 24.289352416992188, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8449759483337402, + "num_tokens": 206588173.0, + "step": 5413 + }, + { + "epoch": 0.6887164482890218, + "ewc_loss": 0.03215941786766052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6079708075267263e-05, + "grad_norm": 24.272214889526367, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8562147617340088, + "num_tokens": 206624013.0, + "step": 5414 + }, + { + "epoch": 0.6888436585676123, + "ewc_loss": 0.03212948888540268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6064745068433695e-05, + "grad_norm": 24.297611236572266, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8648610711097717, + "num_tokens": 206668180.0, + "step": 5415 + }, + { + "epoch": 0.6889708688462027, + "ewc_loss": 0.03219247981905937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6096239050966688e-05, + "grad_norm": 24.42176055908203, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8529383540153503, + "num_tokens": 206710619.0, + "step": 5416 + }, + { + "epoch": 0.6890980791247933, + "ewc_loss": 0.03211604058742523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6058020264608786e-05, + "grad_norm": 24.33360481262207, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8614259958267212, + "num_tokens": 206750121.0, + "step": 5417 + }, + { + "epoch": 0.6892252894033838, + "ewc_loss": 0.0321207158267498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6060357665992342e-05, + "grad_norm": 24.255348205566406, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8690857887268066, + "num_tokens": 206789538.0, + "step": 5418 + }, + { + "epoch": 0.6893524996819743, + "ewc_loss": 0.03209735080599785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.604867611604277e-05, + "grad_norm": 24.25230598449707, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8573667407035828, + "num_tokens": 206827513.0, + "step": 5419 + }, + { + "epoch": 0.6894797099605648, + "ewc_loss": 0.032150015234947205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.60750078066485e-05, + "grad_norm": 24.339946746826172, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8633291125297546, + "num_tokens": 206866563.0, + "step": 5420 + }, + { + "epoch": 0.6896069202391554, + "ewc_loss": 0.03215034306049347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.607517151569482e-05, + "grad_norm": 24.278915405273438, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8675514459609985, + "num_tokens": 206906281.0, + "step": 5421 + }, + { + "epoch": 0.6897341305177458, + "ewc_loss": 0.03217737749218941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.608868842595257e-05, + "grad_norm": 24.437021255493164, + "learning_rate": 1e-06, + "loss": 0.5098, + "mean_token_accuracy": 0.8379840850830078, + "num_tokens": 206944055.0, + "step": 5422 + }, + { + "epoch": 0.6898613407963363, + "ewc_loss": 0.03207384794950485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.603692362550646e-05, + "grad_norm": 24.306283950805664, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8575186729431152, + "num_tokens": 206984421.0, + "step": 5423 + }, + { + "epoch": 0.6899885510749268, + "ewc_loss": 0.03211211413145065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.605605757504236e-05, + "grad_norm": 24.310312271118164, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8463987112045288, + "num_tokens": 207020319.0, + "step": 5424 + }, + { + "epoch": 0.6901157613535174, + "ewc_loss": 0.03210330754518509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6051653801696375e-05, + "grad_norm": 24.33856773376465, + "learning_rate": 1e-06, + "loss": 0.4452, + 
"mean_token_accuracy": 0.859470009803772, + "num_tokens": 207061487.0, + "step": 5425 + }, + { + "epoch": 0.6902429716321079, + "ewc_loss": 0.032111022621393204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6055511878221296e-05, + "grad_norm": 24.294904708862305, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8590817451477051, + "num_tokens": 207098343.0, + "step": 5426 + }, + { + "epoch": 0.6903701819106984, + "ewc_loss": 0.032114919275045395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6057460015872493e-05, + "grad_norm": 24.35944175720215, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8544765710830688, + "num_tokens": 207140381.0, + "step": 5427 + }, + { + "epoch": 0.6904973921892888, + "ewc_loss": 0.03212814778089523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6064073861343786e-05, + "grad_norm": 24.395217895507812, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8708363771438599, + "num_tokens": 207173923.0, + "step": 5428 + }, + { + "epoch": 0.6906246024678794, + "ewc_loss": 0.03212706372141838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.606353180250153e-05, + "grad_norm": 24.23251724243164, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8635296821594238, + "num_tokens": 207214840.0, + "step": 5429 + }, + { + "epoch": 0.6907518127464699, + "ewc_loss": 0.03212013468146324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6060066627687775e-05, + "grad_norm": 24.360450744628906, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8562575578689575, + "num_tokens": 207247850.0, + "step": 5430 + }, + { + "epoch": 0.6908790230250604, + "ewc_loss": 0.032189033925533295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.609451646800153e-05, + "grad_norm": 24.238718032836914, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8576856851577759, + "num_tokens": 207286698.0, + "step": 5431 + }, + { + "epoch": 
0.691006233303651, + "ewc_loss": 0.032092101871967316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6046051314333454e-05, + "grad_norm": 24.329849243164062, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.851759672164917, + "num_tokens": 207322470.0, + "step": 5432 + }, + { + "epoch": 0.6911334435822415, + "ewc_loss": 0.032182905822992325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.609145328984596e-05, + "grad_norm": 24.292131423950195, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8443795442581177, + "num_tokens": 207361517.0, + "step": 5433 + }, + { + "epoch": 0.6912606538608319, + "ewc_loss": 0.03208770230412483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6043850337155163e-05, + "grad_norm": 24.32402801513672, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8675652146339417, + "num_tokens": 207394504.0, + "step": 5434 + }, + { + "epoch": 0.6913878641394224, + "ewc_loss": 0.03217422589659691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6087113181129098e-05, + "grad_norm": 24.270477294921875, + "learning_rate": 1e-06, + "loss": 0.5138, + "mean_token_accuracy": 0.839117169380188, + "num_tokens": 207430148.0, + "step": 5435 + }, + { + "epoch": 0.691515074418013, + "ewc_loss": 0.03215176984667778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.607588455954101e-05, + "grad_norm": 24.395828247070312, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8546030521392822, + "num_tokens": 207461094.0, + "step": 5436 + }, + { + "epoch": 0.6916422846966035, + "ewc_loss": 0.032241541892290115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.612077176105231e-05, + "grad_norm": 24.233253479003906, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8644962310791016, + "num_tokens": 207500572.0, + "step": 5437 + }, + { + "epoch": 0.691769494975194, + "ewc_loss": 0.032165735960006714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6082867659861222e-05, + "grad_norm": 24.22013282775879, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8500075340270996, + "num_tokens": 207534977.0, + "step": 5438 + }, + { + "epoch": 0.6918967052537845, + "ewc_loss": 0.03225584700703621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6127924027387053e-05, + "grad_norm": 24.30827522277832, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8627939820289612, + "num_tokens": 207569809.0, + "step": 5439 + }, + { + "epoch": 0.692023915532375, + "ewc_loss": 0.03223888948559761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6119443898787722e-05, + "grad_norm": 24.308744430541992, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8587334156036377, + "num_tokens": 207606785.0, + "step": 5440 + }, + { + "epoch": 0.6921511258109655, + "ewc_loss": 0.03234485909342766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.617242924112361e-05, + "grad_norm": 24.34555435180664, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8562831878662109, + "num_tokens": 207648344.0, + "step": 5441 + }, + { + "epoch": 0.692278336089556, + "ewc_loss": 0.032272323966026306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6136162230395712e-05, + "grad_norm": 24.18514633178711, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.864795982837677, + "num_tokens": 207680962.0, + "step": 5442 + }, + { + "epoch": 0.6924055463681466, + "ewc_loss": 0.03227917104959488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6139585568453185e-05, + "grad_norm": 24.375850677490234, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8897817730903625, + "num_tokens": 207709162.0, + "step": 5443 + }, + { + "epoch": 0.6925327566467371, + "ewc_loss": 0.03236447274684906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.618223723198753e-05, + "grad_norm": 24.197179794311523, + "learning_rate": 1e-06, + "loss": 0.4777, + 
"mean_token_accuracy": 0.8541080951690674, + "num_tokens": 207746089.0, + "step": 5444 + }, + { + "epoch": 0.6926599669253276, + "ewc_loss": 0.0323617160320282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.618085843801964e-05, + "grad_norm": 24.378746032714844, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8687186241149902, + "num_tokens": 207791235.0, + "step": 5445 + }, + { + "epoch": 0.692787177203918, + "ewc_loss": 0.03236198052763939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6180989405256696e-05, + "grad_norm": 24.375635147094727, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8500075340270996, + "num_tokens": 207824527.0, + "step": 5446 + }, + { + "epoch": 0.6929143874825086, + "ewc_loss": 0.03229737654328346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.614868779142853e-05, + "grad_norm": 24.182498931884766, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8684819936752319, + "num_tokens": 207863535.0, + "step": 5447 + }, + { + "epoch": 0.6930415977610991, + "ewc_loss": 0.03228197619318962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.614098800928332e-05, + "grad_norm": 24.392011642456055, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.851590633392334, + "num_tokens": 207899694.0, + "step": 5448 + }, + { + "epoch": 0.6931688080396896, + "ewc_loss": 0.03234181925654411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.617091038497165e-05, + "grad_norm": 24.183929443359375, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8491904139518738, + "num_tokens": 207931440.0, + "step": 5449 + }, + { + "epoch": 0.6932960183182801, + "ewc_loss": 0.03229546919465065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.614773464098107e-05, + "grad_norm": 24.27543067932129, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8555086851119995, + "num_tokens": 207963758.0, + "step": 5450 + }, + { + "epoch": 
0.6934232285968707, + "ewc_loss": 0.032364651560783386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6182326362468302e-05, + "grad_norm": 24.18946647644043, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8635135889053345, + "num_tokens": 208002383.0, + "step": 5451 + }, + { + "epoch": 0.6935504388754611, + "ewc_loss": 0.032311052083969116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6155525372596458e-05, + "grad_norm": 24.292980194091797, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8617318868637085, + "num_tokens": 208038723.0, + "step": 5452 + }, + { + "epoch": 0.6936776491540516, + "ewc_loss": 0.032399099320173264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6199550373130478e-05, + "grad_norm": 24.216928482055664, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8628539443016052, + "num_tokens": 208071851.0, + "step": 5453 + }, + { + "epoch": 0.6938048594326421, + "ewc_loss": 0.03235307335853577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6176536519196816e-05, + "grad_norm": 24.371206283569336, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8405444622039795, + "num_tokens": 208106916.0, + "step": 5454 + }, + { + "epoch": 0.6939320697112327, + "ewc_loss": 0.03241853043437004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6209265595534816e-05, + "grad_norm": 24.247194290161133, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8670716285705566, + "num_tokens": 208146883.0, + "step": 5455 + }, + { + "epoch": 0.6940592799898232, + "ewc_loss": 0.03228925168514252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.614462598809041e-05, + "grad_norm": 24.342540740966797, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8606381416320801, + "num_tokens": 208187364.0, + "step": 5456 + }, + { + "epoch": 0.6941864902684137, + "ewc_loss": 0.032387085258960724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6193542251130566e-05, + "grad_norm": 24.32099723815918, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8663361072540283, + "num_tokens": 208226087.0, + "step": 5457 + }, + { + "epoch": 0.6943137005470043, + "ewc_loss": 0.032345764338970184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6172882169485092e-05, + "grad_norm": 24.26630973815918, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8546870946884155, + "num_tokens": 208265193.0, + "step": 5458 + }, + { + "epoch": 0.6944409108255947, + "ewc_loss": 0.03241506963968277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.620753391762264e-05, + "grad_norm": 24.3426456451416, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8717010617256165, + "num_tokens": 208305163.0, + "step": 5459 + }, + { + "epoch": 0.6945681211041852, + "ewc_loss": 0.03228268399834633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.614134271221701e-05, + "grad_norm": 24.20359992980957, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8465981483459473, + "num_tokens": 208342229.0, + "step": 5460 + }, + { + "epoch": 0.6946953313827757, + "ewc_loss": 0.03236851468682289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6184258129214868e-05, + "grad_norm": 24.40013313293457, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8479390740394592, + "num_tokens": 208384606.0, + "step": 5461 + }, + { + "epoch": 0.6948225416613663, + "ewc_loss": 0.03238249942660332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6191250324482098e-05, + "grad_norm": 24.294065475463867, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8770171999931335, + "num_tokens": 208416777.0, + "step": 5462 + }, + { + "epoch": 0.6949497519399568, + "ewc_loss": 0.03233708068728447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6168540241778828e-05, + "grad_norm": 24.358741760253906, + "learning_rate": 1e-06, + "loss": 0.492, + 
"mean_token_accuracy": 0.8439081907272339, + "num_tokens": 208456101.0, + "step": 5463 + }, + { + "epoch": 0.6950769622185473, + "ewc_loss": 0.03239739313721657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6198697267100215e-05, + "grad_norm": 24.53024673461914, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8681025505065918, + "num_tokens": 208489201.0, + "step": 5464 + }, + { + "epoch": 0.6952041724971377, + "ewc_loss": 0.03228098526597023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6140493244165555e-05, + "grad_norm": 24.43180274963379, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8726599812507629, + "num_tokens": 208525006.0, + "step": 5465 + }, + { + "epoch": 0.6953313827757283, + "ewc_loss": 0.0322461873292923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6123092791531235e-05, + "grad_norm": 24.439302444458008, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8556934595108032, + "num_tokens": 208568997.0, + "step": 5466 + }, + { + "epoch": 0.6954585930543188, + "ewc_loss": 0.03230267018079758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6151334421010688e-05, + "grad_norm": 24.53685188293457, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8506365418434143, + "num_tokens": 208605925.0, + "step": 5467 + }, + { + "epoch": 0.6955858033329093, + "ewc_loss": 0.03223418816924095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.611709376447834e-05, + "grad_norm": 24.33697509765625, + "learning_rate": 1e-06, + "loss": 0.5442, + "mean_token_accuracy": 0.8331762552261353, + "num_tokens": 208646290.0, + "step": 5468 + }, + { + "epoch": 0.6957130136114998, + "ewc_loss": 0.032199617475271225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6099807908176444e-05, + "grad_norm": 24.435556411743164, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.858975887298584, + "num_tokens": 208686424.0, + "step": 5469 + }, + { + "epoch": 
0.6958402238900904, + "ewc_loss": 0.03218785673379898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6093928934424184e-05, + "grad_norm": 24.39811134338379, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8603765964508057, + "num_tokens": 208719643.0, + "step": 5470 + }, + { + "epoch": 0.6959674341686808, + "ewc_loss": 0.032125480473041534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6062740542110987e-05, + "grad_norm": 24.327173233032227, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8416779637336731, + "num_tokens": 208760804.0, + "step": 5471 + }, + { + "epoch": 0.6960946444472713, + "ewc_loss": 0.03218633309006691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.60931667778641e-05, + "grad_norm": 24.38535499572754, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8519988656044006, + "num_tokens": 208803022.0, + "step": 5472 + }, + { + "epoch": 0.6962218547258618, + "ewc_loss": 0.032159242779016495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6079620763775893e-05, + "grad_norm": 24.283044815063477, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8599185943603516, + "num_tokens": 208844205.0, + "step": 5473 + }, + { + "epoch": 0.6963490650044524, + "ewc_loss": 0.03223124146461487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.611562038306147e-05, + "grad_norm": 24.39304542541504, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8687517046928406, + "num_tokens": 208886882.0, + "step": 5474 + }, + { + "epoch": 0.6964762752830429, + "ewc_loss": 0.0322134867310524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6106743714772165e-05, + "grad_norm": 24.366193771362305, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8556861877441406, + "num_tokens": 208926649.0, + "step": 5475 + }, + { + "epoch": 0.6966034855616334, + "ewc_loss": 0.0321698822081089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6084941307781264e-05, + "grad_norm": 24.30324363708496, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8519980907440186, + "num_tokens": 208963403.0, + "step": 5476 + }, + { + "epoch": 0.6967306958402238, + "ewc_loss": 0.03219663351774216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6098316336865537e-05, + "grad_norm": 24.40406036376953, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8503080010414124, + "num_tokens": 209002475.0, + "step": 5477 + }, + { + "epoch": 0.6968579061188144, + "ewc_loss": 0.0322425551712513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6121277440106496e-05, + "grad_norm": 24.383663177490234, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8588406443595886, + "num_tokens": 209044216.0, + "step": 5478 + }, + { + "epoch": 0.6969851163974049, + "ewc_loss": 0.03221949189901352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.610974686627742e-05, + "grad_norm": 24.4066162109375, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8466439843177795, + "num_tokens": 209080774.0, + "step": 5479 + }, + { + "epoch": 0.6971123266759954, + "ewc_loss": 0.03220263496041298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6101317669381388e-05, + "grad_norm": 24.416048049926758, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8638345003128052, + "num_tokens": 209117167.0, + "step": 5480 + }, + { + "epoch": 0.697239536954586, + "ewc_loss": 0.032206907868385315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.610345316294115e-05, + "grad_norm": 24.414260864257812, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8514606952667236, + "num_tokens": 209148872.0, + "step": 5481 + }, + { + "epoch": 0.6973667472331765, + "ewc_loss": 0.03222808614373207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6114043319248594e-05, + "grad_norm": 24.532068252563477, + "learning_rate": 1e-06, + "loss": 0.4507, + 
"mean_token_accuracy": 0.8604007363319397, + "num_tokens": 209187912.0, + "step": 5482 + }, + { + "epoch": 0.6974939575117669, + "ewc_loss": 0.03213248774409294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6066243915702216e-05, + "grad_norm": 24.29372787475586, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8647972941398621, + "num_tokens": 209227055.0, + "step": 5483 + }, + { + "epoch": 0.6976211677903574, + "ewc_loss": 0.03218890354037285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6094450984383002e-05, + "grad_norm": 24.466543197631836, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8714519739151001, + "num_tokens": 209255343.0, + "step": 5484 + }, + { + "epoch": 0.697748378068948, + "ewc_loss": 0.03219585865736008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.609792889212258e-05, + "grad_norm": 24.374788284301758, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8516367673873901, + "num_tokens": 209294621.0, + "step": 5485 + }, + { + "epoch": 0.6978755883475385, + "ewc_loss": 0.03212899714708328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6064499504864216e-05, + "grad_norm": 24.377058029174805, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8733833432197571, + "num_tokens": 209336519.0, + "step": 5486 + }, + { + "epoch": 0.698002798626129, + "ewc_loss": 0.03218572959303856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.609286482562311e-05, + "grad_norm": 24.254648208618164, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8552505970001221, + "num_tokens": 209372913.0, + "step": 5487 + }, + { + "epoch": 0.6981300089047195, + "ewc_loss": 0.03216859698295593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6084299204521812e-05, + "grad_norm": 24.322593688964844, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.848775327205658, + "num_tokens": 209417768.0, + "step": 5488 + }, + { + "epoch": 
0.69825721918331, + "ewc_loss": 0.03224749490618706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.612374762771651e-05, + "grad_norm": 24.327037811279297, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8595436811447144, + "num_tokens": 209451221.0, + "step": 5489 + }, + { + "epoch": 0.6983844294619005, + "ewc_loss": 0.032218240201473236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6109119314933196e-05, + "grad_norm": 24.27368927001953, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8566865921020508, + "num_tokens": 209488710.0, + "step": 5490 + }, + { + "epoch": 0.698511639740491, + "ewc_loss": 0.03221280500292778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6106401744764298e-05, + "grad_norm": 24.324996948242188, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8542655110359192, + "num_tokens": 209525166.0, + "step": 5491 + }, + { + "epoch": 0.6986388500190815, + "ewc_loss": 0.032285258173942566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6142628737725317e-05, + "grad_norm": 24.352294921875, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8602880239486694, + "num_tokens": 209564804.0, + "step": 5492 + }, + { + "epoch": 0.6987660602976721, + "ewc_loss": 0.03228792920708656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.614396387594752e-05, + "grad_norm": 24.377450942993164, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8617957234382629, + "num_tokens": 209607155.0, + "step": 5493 + }, + { + "epoch": 0.6988932705762626, + "ewc_loss": 0.0322331078350544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6116553524625488e-05, + "grad_norm": 24.365602493286133, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8472987413406372, + "num_tokens": 209641226.0, + "step": 5494 + }, + { + "epoch": 0.699020480854853, + "ewc_loss": 0.03233376890420914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61668849614216e-05, 
+ "grad_norm": 24.411102294921875, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8549416065216064, + "num_tokens": 209679180.0, + "step": 5495 + }, + { + "epoch": 0.6991476911334435, + "ewc_loss": 0.03223416209220886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6117081031552516e-05, + "grad_norm": 24.2293643951416, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8498782515525818, + "num_tokens": 209716412.0, + "step": 5496 + }, + { + "epoch": 0.6992749014120341, + "ewc_loss": 0.032300181686878204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6150090232258663e-05, + "grad_norm": 24.470983505249023, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8495984077453613, + "num_tokens": 209755697.0, + "step": 5497 + }, + { + "epoch": 0.6994021116906246, + "ewc_loss": 0.0323524996638298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6176249118871056e-05, + "grad_norm": 24.432443618774414, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8652198314666748, + "num_tokens": 209793654.0, + "step": 5498 + }, + { + "epoch": 0.6995293219692151, + "ewc_loss": 0.0322660468518734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6133022654685192e-05, + "grad_norm": 24.332368850708008, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8558452129364014, + "num_tokens": 209830703.0, + "step": 5499 + }, + { + "epoch": 0.6996565322478057, + "ewc_loss": 0.0323028489947319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6151425370480865e-05, + "grad_norm": 24.329635620117188, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8579610586166382, + "num_tokens": 209868994.0, + "step": 5500 + }, + { + "epoch": 0.6997837425263961, + "ewc_loss": 0.03233696520328522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6168482034117915e-05, + "grad_norm": 24.444168090820312, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 
0.8672008514404297, + "num_tokens": 209909605.0, + "step": 5501 + }, + { + "epoch": 0.6999109528049866, + "ewc_loss": 0.03228865563869476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.614432767382823e-05, + "grad_norm": 24.402374267578125, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8635077476501465, + "num_tokens": 209945196.0, + "step": 5502 + }, + { + "epoch": 0.7000381630835771, + "ewc_loss": 0.03232954069972038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6164769476745278e-05, + "grad_norm": 24.3457088470459, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8586819171905518, + "num_tokens": 209988585.0, + "step": 5503 + }, + { + "epoch": 0.7001653733621677, + "ewc_loss": 0.03225895017385483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6129475625348277e-05, + "grad_norm": 24.319759368896484, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8580230474472046, + "num_tokens": 210032084.0, + "step": 5504 + }, + { + "epoch": 0.7002925836407582, + "ewc_loss": 0.03231010213494301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6155050616362132e-05, + "grad_norm": 24.417104721069336, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8473346829414368, + "num_tokens": 210070639.0, + "step": 5505 + }, + { + "epoch": 0.7004197939193487, + "ewc_loss": 0.03230579197406769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.615289511391893e-05, + "grad_norm": 24.422279357910156, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8654215335845947, + "num_tokens": 210113793.0, + "step": 5506 + }, + { + "epoch": 0.7005470041979391, + "ewc_loss": 0.03230888769030571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6154443073901348e-05, + "grad_norm": 24.381345748901367, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8493344187736511, + "num_tokens": 210153517.0, + "step": 5507 + }, + { + "epoch": 0.7006742144765297, + "ewc_loss": 
0.032243017107248306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6121508451760747e-05, + "grad_norm": 24.35590362548828, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8530076742172241, + "num_tokens": 210186818.0, + "step": 5508 + }, + { + "epoch": 0.7008014247551202, + "ewc_loss": 0.032309357076883316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6154677723534405e-05, + "grad_norm": 24.401721954345703, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8504364490509033, + "num_tokens": 210224283.0, + "step": 5509 + }, + { + "epoch": 0.7009286350337107, + "ewc_loss": 0.032219450920820236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6109725038404576e-05, + "grad_norm": 24.30435562133789, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8521990776062012, + "num_tokens": 210263439.0, + "step": 5510 + }, + { + "epoch": 0.7010558453123013, + "ewc_loss": 0.03232887387275696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.616443660168443e-05, + "grad_norm": 24.459186553955078, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8607374429702759, + "num_tokens": 210305887.0, + "step": 5511 + }, + { + "epoch": 0.7011830555908918, + "ewc_loss": 0.032242950052022934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6121475709951483e-05, + "grad_norm": 24.349742889404297, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8563841581344604, + "num_tokens": 210340118.0, + "step": 5512 + }, + { + "epoch": 0.7013102658694823, + "ewc_loss": 0.03220849111676216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6104246242321096e-05, + "grad_norm": 24.327754974365234, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8596203923225403, + "num_tokens": 210376098.0, + "step": 5513 + }, + { + "epoch": 0.7014374761480727, + "ewc_loss": 0.03231993690133095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.615996916370932e-05, + "grad_norm": 
24.437036514282227, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.861487865447998, + "num_tokens": 210418801.0, + "step": 5514 + }, + { + "epoch": 0.7015646864266633, + "ewc_loss": 0.03227037936449051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6135189071064815e-05, + "grad_norm": 24.361413955688477, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.847364068031311, + "num_tokens": 210452392.0, + "step": 5515 + }, + { + "epoch": 0.7016918967052538, + "ewc_loss": 0.032279059290885925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6139529179781675e-05, + "grad_norm": 24.33993148803711, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.865054726600647, + "num_tokens": 210489697.0, + "step": 5516 + }, + { + "epoch": 0.7018191069838443, + "ewc_loss": 0.03227672353386879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61383613885846e-05, + "grad_norm": 24.352386474609375, + "learning_rate": 1e-06, + "loss": 0.5039, + "mean_token_accuracy": 0.8429383039474487, + "num_tokens": 210530212.0, + "step": 5517 + }, + { + "epoch": 0.7019463172624348, + "ewc_loss": 0.03231855109333992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.615927612874657e-05, + "grad_norm": 24.3459529876709, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8640505075454712, + "num_tokens": 210564974.0, + "step": 5518 + }, + { + "epoch": 0.7020735275410254, + "ewc_loss": 0.032278671860694885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61393363669049e-05, + "grad_norm": 24.29598045349121, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8575679063796997, + "num_tokens": 210601272.0, + "step": 5519 + }, + { + "epoch": 0.7022007378196158, + "ewc_loss": 0.032349567860364914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61747848324012e-05, + "grad_norm": 24.44720458984375, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8599317669868469, + "num_tokens": 
210636908.0, + "step": 5520 + }, + { + "epoch": 0.7023279480982063, + "ewc_loss": 0.03228297829627991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6141490050358698e-05, + "grad_norm": 24.32656478881836, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.854414701461792, + "num_tokens": 210672629.0, + "step": 5521 + }, + { + "epoch": 0.7024551583767968, + "ewc_loss": 0.03236695006489754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.618347414478194e-05, + "grad_norm": 24.435075759887695, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8719730973243713, + "num_tokens": 210703531.0, + "step": 5522 + }, + { + "epoch": 0.7025823686553874, + "ewc_loss": 0.03239791840314865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6198959201574326e-05, + "grad_norm": 24.361217498779297, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8715075254440308, + "num_tokens": 210742946.0, + "step": 5523 + }, + { + "epoch": 0.7027095789339779, + "ewc_loss": 0.032356858253479004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6178428268176503e-05, + "grad_norm": 24.426837921142578, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8662864565849304, + "num_tokens": 210778345.0, + "step": 5524 + }, + { + "epoch": 0.7028367892125684, + "ewc_loss": 0.0324292853474617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6214642528211698e-05, + "grad_norm": 24.45833969116211, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8645789623260498, + "num_tokens": 210821544.0, + "step": 5525 + }, + { + "epoch": 0.7029639994911588, + "ewc_loss": 0.03232628479599953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6163141481229104e-05, + "grad_norm": 24.383392333984375, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8613448739051819, + "num_tokens": 210858060.0, + "step": 5526 + }, + { + "epoch": 0.7030912097697494, + "ewc_loss": 0.032350342720746994, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6175172277144156e-05, + "grad_norm": 24.447816848754883, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8525222539901733, + "num_tokens": 210900365.0, + "step": 5527 + }, + { + "epoch": 0.7032184200483399, + "ewc_loss": 0.032390985637903214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6195492207771167e-05, + "grad_norm": 24.488971710205078, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8527333736419678, + "num_tokens": 210941118.0, + "step": 5528 + }, + { + "epoch": 0.7033456303269304, + "ewc_loss": 0.0323931947350502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6196598153328523e-05, + "grad_norm": 24.487699508666992, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8646400570869446, + "num_tokens": 210979865.0, + "step": 5529 + }, + { + "epoch": 0.703472840605521, + "ewc_loss": 0.032298579812049866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6149289876921102e-05, + "grad_norm": 24.521451950073242, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8581621646881104, + "num_tokens": 211016385.0, + "step": 5530 + }, + { + "epoch": 0.7036000508841115, + "ewc_loss": 0.03231446444988251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6157231584656984e-05, + "grad_norm": 24.44318199157715, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.845931887626648, + "num_tokens": 211059071.0, + "step": 5531 + }, + { + "epoch": 0.7037272611627019, + "ewc_loss": 0.03226889669895172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6134448742377572e-05, + "grad_norm": 24.573047637939453, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8607185482978821, + "num_tokens": 211094189.0, + "step": 5532 + }, + { + "epoch": 0.7038544714412924, + "ewc_loss": 0.0323149710893631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.615748624317348e-05, + "grad_norm": 24.44428825378418, + 
"learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8720728754997253, + "num_tokens": 211130524.0, + "step": 5533 + }, + { + "epoch": 0.703981681719883, + "ewc_loss": 0.03221415355801582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6107076589833014e-05, + "grad_norm": 24.373153686523438, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8509539365768433, + "num_tokens": 211172335.0, + "step": 5534 + }, + { + "epoch": 0.7041088919984735, + "ewc_loss": 0.03230790048837662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6153950127772987e-05, + "grad_norm": 24.50727081298828, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8594990372657776, + "num_tokens": 211205406.0, + "step": 5535 + }, + { + "epoch": 0.704236102277064, + "ewc_loss": 0.03227858245372772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.613929089216981e-05, + "grad_norm": 24.43166160583496, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8545029163360596, + "num_tokens": 211246443.0, + "step": 5536 + }, + { + "epoch": 0.7043633125556545, + "ewc_loss": 0.032270342111587524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.613517088117078e-05, + "grad_norm": 24.45452308654785, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8721446394920349, + "num_tokens": 211286707.0, + "step": 5537 + }, + { + "epoch": 0.704490522834245, + "ewc_loss": 0.03229580074548721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6147900169016793e-05, + "grad_norm": 24.423185348510742, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8684975504875183, + "num_tokens": 211319448.0, + "step": 5538 + }, + { + "epoch": 0.7046177331128355, + "ewc_loss": 0.032298315316438675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6149157090694644e-05, + "grad_norm": 24.475107192993164, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8524680137634277, + "num_tokens": 211363747.0, + 
"step": 5539 + }, + { + "epoch": 0.704744943391426, + "ewc_loss": 0.03227304667234421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6136524209287018e-05, + "grad_norm": 24.38105583190918, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.86452716588974, + "num_tokens": 211408080.0, + "step": 5540 + }, + { + "epoch": 0.7048721536700165, + "ewc_loss": 0.032236918807029724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6118459825520404e-05, + "grad_norm": 24.441070556640625, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8489748239517212, + "num_tokens": 211445656.0, + "step": 5541 + }, + { + "epoch": 0.7049993639486071, + "ewc_loss": 0.03229647874832153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6148238501045853e-05, + "grad_norm": 24.394601821899414, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8671143054962158, + "num_tokens": 211484342.0, + "step": 5542 + }, + { + "epoch": 0.7051265742271976, + "ewc_loss": 0.03225947543978691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6129737559822388e-05, + "grad_norm": 24.577686309814453, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8528581857681274, + "num_tokens": 211518535.0, + "step": 5543 + }, + { + "epoch": 0.705253784505788, + "ewc_loss": 0.032306257635354996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6153127944562584e-05, + "grad_norm": 24.376487731933594, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8519644737243652, + "num_tokens": 211557236.0, + "step": 5544 + }, + { + "epoch": 0.7053809947843785, + "ewc_loss": 0.03220279514789581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6101397704915144e-05, + "grad_norm": 24.44049072265625, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8691620826721191, + "num_tokens": 211599216.0, + "step": 5545 + }, + { + "epoch": 0.7055082050629691, + "ewc_loss": 0.03225858137011528, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6129290088429116e-05, + "grad_norm": 24.32526206970215, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8399388790130615, + "num_tokens": 211638930.0, + "step": 5546 + }, + { + "epoch": 0.7056354153415596, + "ewc_loss": 0.03221621364355087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.610810613783542e-05, + "grad_norm": 24.41028594970703, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8506139516830444, + "num_tokens": 211680765.0, + "step": 5547 + }, + { + "epoch": 0.7057626256201501, + "ewc_loss": 0.03225007280707359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6125037291203626e-05, + "grad_norm": 24.343624114990234, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8725060224533081, + "num_tokens": 211719037.0, + "step": 5548 + }, + { + "epoch": 0.7058898358987407, + "ewc_loss": 0.03229134902358055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6145673725986853e-05, + "grad_norm": 24.4613037109375, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8531396985054016, + "num_tokens": 211759655.0, + "step": 5549 + }, + { + "epoch": 0.7060170461773311, + "ewc_loss": 0.03225753828883171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61287698574597e-05, + "grad_norm": 24.45592498779297, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8412564992904663, + "num_tokens": 211800405.0, + "step": 5550 + }, + { + "epoch": 0.7061442564559216, + "ewc_loss": 0.03232916072010994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.616458030184731e-05, + "grad_norm": 24.463838577270508, + "learning_rate": 1e-06, + "loss": 0.5121, + "mean_token_accuracy": 0.8401243090629578, + "num_tokens": 211846057.0, + "step": 5551 + }, + { + "epoch": 0.7062714667345121, + "ewc_loss": 0.03225312381982803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6126561604323797e-05, + "grad_norm": 24.459444046020508, + "learning_rate": 1e-06, + "loss": 0.4577, 
+ "mean_token_accuracy": 0.8562643527984619, + "num_tokens": 211885944.0, + "step": 5552 + }, + { + "epoch": 0.7063986770131027, + "ewc_loss": 0.03232663869857788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.616331974219065e-05, + "grad_norm": 24.51486587524414, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8619805574417114, + "num_tokens": 211928727.0, + "step": 5553 + }, + { + "epoch": 0.7065258872916932, + "ewc_loss": 0.03226412832736969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6132064047269523e-05, + "grad_norm": 24.477357864379883, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8476113080978394, + "num_tokens": 211967593.0, + "step": 5554 + }, + { + "epoch": 0.7066530975702837, + "ewc_loss": 0.03225201368331909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6126006812555715e-05, + "grad_norm": 24.3857479095459, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8517158031463623, + "num_tokens": 212001023.0, + "step": 5555 + }, + { + "epoch": 0.7067803078488741, + "ewc_loss": 0.03226958215236664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.613479071238544e-05, + "grad_norm": 24.441720962524414, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8502866625785828, + "num_tokens": 212045611.0, + "step": 5556 + }, + { + "epoch": 0.7069075181274647, + "ewc_loss": 0.03234130144119263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.617065026948694e-05, + "grad_norm": 24.410737991333008, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8536155223846436, + "num_tokens": 212087167.0, + "step": 5557 + }, + { + "epoch": 0.7070347284060552, + "ewc_loss": 0.03227157145738602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6135785699589178e-05, + "grad_norm": 24.434513092041016, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8587023019790649, + "num_tokens": 212124254.0, + "step": 5558 + }, + { + "epoch": 
0.7071619386846457, + "ewc_loss": 0.0323459729552269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6172985851881094e-05, + "grad_norm": 24.488033294677734, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8519049286842346, + "num_tokens": 212162706.0, + "step": 5559 + }, + { + "epoch": 0.7072891489632362, + "ewc_loss": 0.03226093202829361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6130465155583806e-05, + "grad_norm": 24.34722137451172, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8565075397491455, + "num_tokens": 212197359.0, + "step": 5560 + }, + { + "epoch": 0.7074163592418268, + "ewc_loss": 0.03228313475847244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6141566447913647e-05, + "grad_norm": 24.417102813720703, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8649294376373291, + "num_tokens": 212237300.0, + "step": 5561 + }, + { + "epoch": 0.7075435695204173, + "ewc_loss": 0.03237873315811157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6189365851460025e-05, + "grad_norm": 24.441431045532227, + "learning_rate": 1e-06, + "loss": 0.5144, + "mean_token_accuracy": 0.8382964730262756, + "num_tokens": 212283472.0, + "step": 5562 + }, + { + "epoch": 0.7076707797990077, + "ewc_loss": 0.03227050602436066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.613525273569394e-05, + "grad_norm": 24.448017120361328, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8524960279464722, + "num_tokens": 212321433.0, + "step": 5563 + }, + { + "epoch": 0.7077979900775982, + "ewc_loss": 0.032344039529561996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.617201996850781e-05, + "grad_norm": 24.449785232543945, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8553891181945801, + "num_tokens": 212362234.0, + "step": 5564 + }, + { + "epoch": 0.7079252003561888, + "ewc_loss": 0.032267920672893524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.613396125321742e-05, + "grad_norm": 24.4180965423584, + "learning_rate": 1e-06, + "loss": 0.4951, + "mean_token_accuracy": 0.8477474451065063, + "num_tokens": 212399259.0, + "step": 5565 + }, + { + "epoch": 0.7080524106347793, + "ewc_loss": 0.03234502300620079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.617251109564677e-05, + "grad_norm": 24.456642150878906, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8575771450996399, + "num_tokens": 212438489.0, + "step": 5566 + }, + { + "epoch": 0.7081796209133698, + "ewc_loss": 0.03238215297460556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6191075701499358e-05, + "grad_norm": 24.375516891479492, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8487144708633423, + "num_tokens": 212482283.0, + "step": 5567 + }, + { + "epoch": 0.7083068311919604, + "ewc_loss": 0.032299935817718506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6149968359968625e-05, + "grad_norm": 24.315582275390625, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8527383804321289, + "num_tokens": 212522749.0, + "step": 5568 + }, + { + "epoch": 0.7084340414705508, + "ewc_loss": 0.032367561012506485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6183779735001735e-05, + "grad_norm": 24.4523983001709, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8485308885574341, + "num_tokens": 212559937.0, + "step": 5569 + }, + { + "epoch": 0.7085612517491413, + "ewc_loss": 0.03240591660141945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6202959159272723e-05, + "grad_norm": 24.430007934570312, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8627687096595764, + "num_tokens": 212595578.0, + "step": 5570 + }, + { + "epoch": 0.7086884620277318, + "ewc_loss": 0.03239092230796814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6195461284951307e-05, + "grad_norm": 24.44468879699707, + "learning_rate": 1e-06, + "loss": 0.4202, + 
"mean_token_accuracy": 0.86870276927948, + "num_tokens": 212634085.0, + "step": 5571 + }, + { + "epoch": 0.7088156723063224, + "ewc_loss": 0.03244445100426674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.622222589503508e-05, + "grad_norm": 24.341129302978516, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8460257649421692, + "num_tokens": 212673557.0, + "step": 5572 + }, + { + "epoch": 0.7089428825849129, + "ewc_loss": 0.0323544442653656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6177222278201953e-05, + "grad_norm": 24.40582275390625, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8668920993804932, + "num_tokens": 212711142.0, + "step": 5573 + }, + { + "epoch": 0.7090700928635034, + "ewc_loss": 0.03247186169028282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6235930161201395e-05, + "grad_norm": 24.512475967407227, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8566244840621948, + "num_tokens": 212745570.0, + "step": 5574 + }, + { + "epoch": 0.7091973031420938, + "ewc_loss": 0.03236362338066101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.61818115884671e-05, + "grad_norm": 24.322050094604492, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8607980012893677, + "num_tokens": 212783048.0, + "step": 5575 + }, + { + "epoch": 0.7093245134206844, + "ewc_loss": 0.032429039478302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6214518836932257e-05, + "grad_norm": 24.52815055847168, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8748061656951904, + "num_tokens": 212821231.0, + "step": 5576 + }, + { + "epoch": 0.7094517236992749, + "ewc_loss": 0.03244801238179207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6224006685661152e-05, + "grad_norm": 24.405786514282227, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8547372221946716, + "num_tokens": 212861715.0, + "step": 5577 + }, + { + "epoch": 0.7095789339778654, 
+ "ewc_loss": 0.03241446614265442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6207233784371056e-05, + "grad_norm": 24.554502487182617, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8475039601325989, + "num_tokens": 212900655.0, + "step": 5578 + }, + { + "epoch": 0.709706144256456, + "ewc_loss": 0.032369788736104965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.618489477550611e-05, + "grad_norm": 24.27273178100586, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8490971326828003, + "num_tokens": 212939308.0, + "step": 5579 + }, + { + "epoch": 0.7098333545350465, + "ewc_loss": 0.03240558132529259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6202789993258193e-05, + "grad_norm": 24.519874572753906, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8619830012321472, + "num_tokens": 212981174.0, + "step": 5580 + }, + { + "epoch": 0.7099605648136369, + "ewc_loss": 0.0324365496635437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.621827505005058e-05, + "grad_norm": 24.30489730834961, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.867050051689148, + "num_tokens": 213016401.0, + "step": 5581 + }, + { + "epoch": 0.7100877750922274, + "ewc_loss": 0.03244351968169212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6221760233747773e-05, + "grad_norm": 24.50526237487793, + "learning_rate": 1e-06, + "loss": 0.5015, + "mean_token_accuracy": 0.8448969721794128, + "num_tokens": 213058480.0, + "step": 5582 + }, + { + "epoch": 0.710214985370818, + "ewc_loss": 0.03251083940267563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6255418813670985e-05, + "grad_norm": 24.239116668701172, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8699526190757751, + "num_tokens": 213093527.0, + "step": 5583 + }, + { + "epoch": 0.7103421956494085, + "ewc_loss": 0.03242719545960426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6213598428294063e-05, + "grad_norm": 
24.611160278320312, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8549326658248901, + "num_tokens": 213126778.0, + "step": 5584 + }, + { + "epoch": 0.710469405927999, + "ewc_loss": 0.03253552317619324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6267762475763448e-05, + "grad_norm": 24.36996841430664, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8604851961135864, + "num_tokens": 213170863.0, + "step": 5585 + }, + { + "epoch": 0.7105966162065895, + "ewc_loss": 0.03234706073999405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6173529729712754e-05, + "grad_norm": 24.491134643554688, + "learning_rate": 1e-06, + "loss": 0.5243, + "mean_token_accuracy": 0.8332719206809998, + "num_tokens": 213211336.0, + "step": 5586 + }, + { + "epoch": 0.71072382648518, + "ewc_loss": 0.03252599015831947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6262994904536754e-05, + "grad_norm": 24.483797073364258, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8639484643936157, + "num_tokens": 213245262.0, + "step": 5587 + }, + { + "epoch": 0.7108510367637705, + "ewc_loss": 0.03243609145283699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6218045857385732e-05, + "grad_norm": 24.58617401123047, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8489094972610474, + "num_tokens": 213282150.0, + "step": 5588 + }, + { + "epoch": 0.710978247042361, + "ewc_loss": 0.03249111399054527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.624555625312496e-05, + "grad_norm": 24.513917922973633, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.862655520439148, + "num_tokens": 213325573.0, + "step": 5589 + }, + { + "epoch": 0.7111054573209515, + "ewc_loss": 0.03237362205982208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6186810171348043e-05, + "grad_norm": 24.3592586517334, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.866365909576416, + "num_tokens": 
213363738.0, + "step": 5590 + }, + { + "epoch": 0.7112326675995421, + "ewc_loss": 0.03250402584671974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6252013665507548e-05, + "grad_norm": 24.620927810668945, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8622832298278809, + "num_tokens": 213404796.0, + "step": 5591 + }, + { + "epoch": 0.7113598778781326, + "ewc_loss": 0.03243077173829079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6215386494877748e-05, + "grad_norm": 24.497852325439453, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8500635623931885, + "num_tokens": 213436711.0, + "step": 5592 + }, + { + "epoch": 0.711487088156723, + "ewc_loss": 0.03239583596587181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6197918739635497e-05, + "grad_norm": 24.543956756591797, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8594584465026855, + "num_tokens": 213473458.0, + "step": 5593 + }, + { + "epoch": 0.7116142984353135, + "ewc_loss": 0.032405100762844086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6202549886656925e-05, + "grad_norm": 24.510107040405273, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.867871880531311, + "num_tokens": 213507355.0, + "step": 5594 + }, + { + "epoch": 0.7117415087139041, + "ewc_loss": 0.032414402812719345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6207201042561792e-05, + "grad_norm": 24.473779678344727, + "learning_rate": 1e-06, + "loss": 0.511, + "mean_token_accuracy": 0.8406398892402649, + "num_tokens": 213543771.0, + "step": 5595 + }, + { + "epoch": 0.7118687189924946, + "ewc_loss": 0.03241446241736412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6207231965381652e-05, + "grad_norm": 24.529821395874023, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8553791642189026, + "num_tokens": 213578317.0, + "step": 5596 + }, + { + "epoch": 0.7119959292710851, + "ewc_loss": 0.032464586198329926, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6232292182394303e-05, + "grad_norm": 24.511749267578125, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8529749512672424, + "num_tokens": 213615795.0, + "step": 5597 + }, + { + "epoch": 0.7121231395496757, + "ewc_loss": 0.03237403184175491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6187015717150643e-05, + "grad_norm": 24.323143005371094, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8669371008872986, + "num_tokens": 213655452.0, + "step": 5598 + }, + { + "epoch": 0.7122503498282661, + "ewc_loss": 0.032496221363544464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6248110114247538e-05, + "grad_norm": 24.429868698120117, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8494209051132202, + "num_tokens": 213688079.0, + "step": 5599 + }, + { + "epoch": 0.7123775601068566, + "ewc_loss": 0.03250769153237343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6253845387836918e-05, + "grad_norm": 24.358932495117188, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.874746561050415, + "num_tokens": 213725310.0, + "step": 5600 + }, + { + "epoch": 0.7125047703854471, + "ewc_loss": 0.03251596912741661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.625798540771939e-05, + "grad_norm": 24.507688522338867, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.851582407951355, + "num_tokens": 213757950.0, + "step": 5601 + }, + { + "epoch": 0.7126319806640377, + "ewc_loss": 0.03256727382540703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6283636796288192e-05, + "grad_norm": 24.457212448120117, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8442103862762451, + "num_tokens": 213794146.0, + "step": 5602 + }, + { + "epoch": 0.7127591909426282, + "ewc_loss": 0.032547950744628906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.627397614356596e-05, + "grad_norm": 24.455474853515625, + 
"learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8610416650772095, + "num_tokens": 213834922.0, + "step": 5603 + }, + { + "epoch": 0.7128864012212187, + "ewc_loss": 0.0326075442135334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6303773008985445e-05, + "grad_norm": 24.505977630615234, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8755967020988464, + "num_tokens": 213879651.0, + "step": 5604 + }, + { + "epoch": 0.7130136114998091, + "ewc_loss": 0.03251538798213005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.625769436941482e-05, + "grad_norm": 24.514760971069336, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8568398952484131, + "num_tokens": 213921833.0, + "step": 5605 + }, + { + "epoch": 0.7131408217783997, + "ewc_loss": 0.03249664977192879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6248324754997157e-05, + "grad_norm": 24.332265853881836, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8623056411743164, + "num_tokens": 213962909.0, + "step": 5606 + }, + { + "epoch": 0.7132680320569902, + "ewc_loss": 0.032500606030225754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6250303815468214e-05, + "grad_norm": 24.556594848632812, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8592063188552856, + "num_tokens": 214002030.0, + "step": 5607 + }, + { + "epoch": 0.7133952423355807, + "ewc_loss": 0.03252764418721199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6263822544715367e-05, + "grad_norm": 24.50938606262207, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8500334024429321, + "num_tokens": 214037094.0, + "step": 5608 + }, + { + "epoch": 0.7135224526141712, + "ewc_loss": 0.03245573490858078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6227866581175476e-05, + "grad_norm": 24.50218391418457, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8611938953399658, + "num_tokens": 214076649.0, + 
"step": 5609 + }, + { + "epoch": 0.7136496628927618, + "ewc_loss": 0.032481759786605835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6240879631368443e-05, + "grad_norm": 24.41062355041504, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8537680506706238, + "num_tokens": 214114194.0, + "step": 5610 + }, + { + "epoch": 0.7137768731713523, + "ewc_loss": 0.032476529479026794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6238263924606144e-05, + "grad_norm": 24.512434005737305, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8594598770141602, + "num_tokens": 214149652.0, + "step": 5611 + }, + { + "epoch": 0.7139040834499427, + "ewc_loss": 0.03243673965334892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6218369637499563e-05, + "grad_norm": 24.357135772705078, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8675410747528076, + "num_tokens": 214189068.0, + "step": 5612 + }, + { + "epoch": 0.7140312937285332, + "ewc_loss": 0.03242664039134979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.621332012291532e-05, + "grad_norm": 24.429216384887695, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8453687429428101, + "num_tokens": 214220251.0, + "step": 5613 + }, + { + "epoch": 0.7141585040071238, + "ewc_loss": 0.032502319663763046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6251160559477285e-05, + "grad_norm": 24.46474266052246, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8549575209617615, + "num_tokens": 214256705.0, + "step": 5614 + }, + { + "epoch": 0.7142857142857143, + "ewc_loss": 0.03247885778546333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6239428077824414e-05, + "grad_norm": 24.449478149414062, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.871140718460083, + "num_tokens": 214292563.0, + "step": 5615 + }, + { + "epoch": 0.7144129245643048, + "ewc_loss": 0.03246035799384117, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6230178516707383e-05, + "grad_norm": 24.47319793701172, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8509715795516968, + "num_tokens": 214333843.0, + "step": 5616 + }, + { + "epoch": 0.7145401348428954, + "ewc_loss": 0.032482605427503586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.624130345589947e-05, + "grad_norm": 24.42524528503418, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8697046041488647, + "num_tokens": 214372272.0, + "step": 5617 + }, + { + "epoch": 0.7146673451214858, + "ewc_loss": 0.032463837414979935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6231919289566576e-05, + "grad_norm": 24.38370704650879, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8637382984161377, + "num_tokens": 214416364.0, + "step": 5618 + }, + { + "epoch": 0.7147945554000763, + "ewc_loss": 0.03250904753804207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.625452387088444e-05, + "grad_norm": 24.49252700805664, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8550981283187866, + "num_tokens": 214453102.0, + "step": 5619 + }, + { + "epoch": 0.7149217656786668, + "ewc_loss": 0.032508235424757004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.625411823624745e-05, + "grad_norm": 24.347454071044922, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8616151809692383, + "num_tokens": 214492374.0, + "step": 5620 + }, + { + "epoch": 0.7150489759572574, + "ewc_loss": 0.03252718225121498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6263591533061117e-05, + "grad_norm": 24.474843978881836, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8570091724395752, + "num_tokens": 214530918.0, + "step": 5621 + }, + { + "epoch": 0.7151761862358479, + "ewc_loss": 0.03251715004444122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6258574760286137e-05, + "grad_norm": 24.3826847076416, + "learning_rate": 1e-06, + "loss": 0.3915, + 
"mean_token_accuracy": 0.8737505674362183, + "num_tokens": 214570227.0, + "step": 5622 + }, + { + "epoch": 0.7153033965144384, + "ewc_loss": 0.032480500638484955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6240250261034817e-05, + "grad_norm": 24.488554000854492, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8605599403381348, + "num_tokens": 214609519.0, + "step": 5623 + }, + { + "epoch": 0.7154306067930288, + "ewc_loss": 0.032544855028390884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6272428183583543e-05, + "grad_norm": 24.40359878540039, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8553555011749268, + "num_tokens": 214643564.0, + "step": 5624 + }, + { + "epoch": 0.7155578170716194, + "ewc_loss": 0.032503336668014526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6251668057520874e-05, + "grad_norm": 24.583209991455078, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8658767938613892, + "num_tokens": 214677632.0, + "step": 5625 + }, + { + "epoch": 0.7156850273502099, + "ewc_loss": 0.03254012018442154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6270059859380126e-05, + "grad_norm": 24.46568489074707, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8596468567848206, + "num_tokens": 214710363.0, + "step": 5626 + }, + { + "epoch": 0.7158122376288004, + "ewc_loss": 0.032463911920785904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6231955669354647e-05, + "grad_norm": 24.523042678833008, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8661019206047058, + "num_tokens": 214746982.0, + "step": 5627 + }, + { + "epoch": 0.715939447907391, + "ewc_loss": 0.032522767782211304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6261383279925212e-05, + "grad_norm": 24.460182189941406, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8489881753921509, + "num_tokens": 214786447.0, + "step": 5628 + }, + { + "epoch": 
0.7160666581859815, + "ewc_loss": 0.032458651810884476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.622932541067712e-05, + "grad_norm": 24.522085189819336, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8655863404273987, + "num_tokens": 214824627.0, + "step": 5629 + }, + { + "epoch": 0.7161938684645719, + "ewc_loss": 0.03248553350567818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.624276592337992e-05, + "grad_norm": 24.366342544555664, + "learning_rate": 1e-06, + "loss": 0.521, + "mean_token_accuracy": 0.8388108015060425, + "num_tokens": 214863051.0, + "step": 5630 + }, + { + "epoch": 0.7163210787431624, + "ewc_loss": 0.03247058764100075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6235293514910154e-05, + "grad_norm": 24.528837203979492, + "learning_rate": 1e-06, + "loss": 0.5173, + "mean_token_accuracy": 0.8445436358451843, + "num_tokens": 214900524.0, + "step": 5631 + }, + { + "epoch": 0.716448289021753, + "ewc_loss": 0.03257463499903679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6287316611851566e-05, + "grad_norm": 24.41035270690918, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8636173009872437, + "num_tokens": 214943949.0, + "step": 5632 + }, + { + "epoch": 0.7165754993003435, + "ewc_loss": 0.032521747052669525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.626087396289222e-05, + "grad_norm": 24.531333923339844, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8626335859298706, + "num_tokens": 214979126.0, + "step": 5633 + }, + { + "epoch": 0.716702709578934, + "ewc_loss": 0.03254065662622452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.627032906981185e-05, + "grad_norm": 24.33909797668457, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.858106255531311, + "num_tokens": 215017740.0, + "step": 5634 + }, + { + "epoch": 0.7168299198575245, + "ewc_loss": 0.03256475180387497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6282376236631535e-05, + "grad_norm": 24.588716506958008, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8653484582901001, + "num_tokens": 215054472.0, + "step": 5635 + }, + { + "epoch": 0.716957130136115, + "ewc_loss": 0.03254980966448784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6274905647151172e-05, + "grad_norm": 24.410411834716797, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.860080361366272, + "num_tokens": 215086894.0, + "step": 5636 + }, + { + "epoch": 0.7170843404147055, + "ewc_loss": 0.03248556703329086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6242784113273956e-05, + "grad_norm": 24.412282943725586, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8599257469177246, + "num_tokens": 215128157.0, + "step": 5637 + }, + { + "epoch": 0.717211550693296, + "ewc_loss": 0.03257439285516739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.628719655855093e-05, + "grad_norm": 24.47467803955078, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.865253210067749, + "num_tokens": 215161008.0, + "step": 5638 + }, + { + "epoch": 0.7173387609718865, + "ewc_loss": 0.03255624696612358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6278123439406045e-05, + "grad_norm": 24.42559242248535, + "learning_rate": 1e-06, + "loss": 0.5303, + "mean_token_accuracy": 0.8419238328933716, + "num_tokens": 215197157.0, + "step": 5639 + }, + { + "epoch": 0.7174659712504771, + "ewc_loss": 0.03260808438062668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.630404221941717e-05, + "grad_norm": 24.39163589477539, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8654724955558777, + "num_tokens": 215240270.0, + "step": 5640 + }, + { + "epoch": 0.7175931815290676, + "ewc_loss": 0.03257335349917412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6286676327581517e-05, + "grad_norm": 24.557025909423828, + "learning_rate": 1e-06, + "loss": 0.4291, + 
"mean_token_accuracy": 0.8653607368469238, + "num_tokens": 215279828.0, + "step": 5641 + }, + { + "epoch": 0.717720391807658, + "ewc_loss": 0.032546672970056534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6273335859295912e-05, + "grad_norm": 24.286869049072266, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8515379428863525, + "num_tokens": 215315271.0, + "step": 5642 + }, + { + "epoch": 0.7178476020862485, + "ewc_loss": 0.03258403390645981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6292016880470328e-05, + "grad_norm": 24.478137969970703, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8599379062652588, + "num_tokens": 215352621.0, + "step": 5643 + }, + { + "epoch": 0.7179748123648391, + "ewc_loss": 0.032645370811223984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.632268504181411e-05, + "grad_norm": 24.361961364746094, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8598645925521851, + "num_tokens": 215390231.0, + "step": 5644 + }, + { + "epoch": 0.7181020226434296, + "ewc_loss": 0.032628849148750305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.631442501093261e-05, + "grad_norm": 24.45465087890625, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8732969760894775, + "num_tokens": 215426812.0, + "step": 5645 + }, + { + "epoch": 0.7182292329220201, + "ewc_loss": 0.03265732526779175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6328662240994163e-05, + "grad_norm": 24.428064346313477, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8471939563751221, + "num_tokens": 215467551.0, + "step": 5646 + }, + { + "epoch": 0.7183564432006107, + "ewc_loss": 0.03265523537993431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6327618141076528e-05, + "grad_norm": 24.484121322631836, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.867313027381897, + "num_tokens": 215508559.0, + "step": 5647 + }, + { + "epoch": 
0.7184836534792011, + "ewc_loss": 0.03262712061405182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6313560990965925e-05, + "grad_norm": 24.330060958862305, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8511813282966614, + "num_tokens": 215548709.0, + "step": 5648 + }, + { + "epoch": 0.7186108637577916, + "ewc_loss": 0.03262852877378464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6314264939865097e-05, + "grad_norm": 24.50095558166504, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8584364652633667, + "num_tokens": 215593844.0, + "step": 5649 + }, + { + "epoch": 0.7187380740363821, + "ewc_loss": 0.03271852433681488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6359263099730015e-05, + "grad_norm": 24.459463119506836, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.864172637462616, + "num_tokens": 215633853.0, + "step": 5650 + }, + { + "epoch": 0.7188652843149727, + "ewc_loss": 0.03258048743009567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.629024336580187e-05, + "grad_norm": 24.451505661010742, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8577338457107544, + "num_tokens": 215667272.0, + "step": 5651 + }, + { + "epoch": 0.7189924945935632, + "ewc_loss": 0.03265952691435814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6329762729583308e-05, + "grad_norm": 24.373937606811523, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8413006663322449, + "num_tokens": 215705889.0, + "step": 5652 + }, + { + "epoch": 0.7191197048721537, + "ewc_loss": 0.03267538547515869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6337693523382768e-05, + "grad_norm": 24.567428588867188, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8673382997512817, + "num_tokens": 215749143.0, + "step": 5653 + }, + { + "epoch": 0.7192469151507441, + "ewc_loss": 0.03269989416003227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6349946236005053e-05, + "grad_norm": 24.464298248291016, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8526453375816345, + "num_tokens": 215787583.0, + "step": 5654 + }, + { + "epoch": 0.7193741254293347, + "ewc_loss": 0.0326051227748394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6302561562042683e-05, + "grad_norm": 24.442903518676758, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8587070107460022, + "num_tokens": 215825552.0, + "step": 5655 + }, + { + "epoch": 0.7195013357079252, + "ewc_loss": 0.0326959490776062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.634797445149161e-05, + "grad_norm": 24.483699798583984, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8682631254196167, + "num_tokens": 215858310.0, + "step": 5656 + }, + { + "epoch": 0.7196285459865157, + "ewc_loss": 0.032643161714076996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.632158091524616e-05, + "grad_norm": 24.400434494018555, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8473830223083496, + "num_tokens": 215898403.0, + "step": 5657 + }, + { + "epoch": 0.7197557562651062, + "ewc_loss": 0.03262688219547272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.631344093766529e-05, + "grad_norm": 24.43747329711914, + "learning_rate": 1e-06, + "loss": 0.5479, + "mean_token_accuracy": 0.835817277431488, + "num_tokens": 215935930.0, + "step": 5658 + }, + { + "epoch": 0.7198829665436968, + "ewc_loss": 0.032648053020238876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6324025637004524e-05, + "grad_norm": 24.438478469848633, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8554455041885376, + "num_tokens": 215971509.0, + "step": 5659 + }, + { + "epoch": 0.7200101768222873, + "ewc_loss": 0.032693445682525635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.634672298678197e-05, + "grad_norm": 24.396453857421875, + "learning_rate": 1e-06, + "loss": 0.4305, + 
"mean_token_accuracy": 0.8639003038406372, + "num_tokens": 216008696.0, + "step": 5660 + }, + { + "epoch": 0.7201373871008777, + "ewc_loss": 0.03262973949313164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6314870663336478e-05, + "grad_norm": 24.433656692504883, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.858651876449585, + "num_tokens": 216045861.0, + "step": 5661 + }, + { + "epoch": 0.7202645973794682, + "ewc_loss": 0.032668329775333405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.633416468393989e-05, + "grad_norm": 24.37674331665039, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8681778311729431, + "num_tokens": 216081307.0, + "step": 5662 + }, + { + "epoch": 0.7203918076580588, + "ewc_loss": 0.03267998620867729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.633999272598885e-05, + "grad_norm": 24.42339324951172, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.86772221326828, + "num_tokens": 216118506.0, + "step": 5663 + }, + { + "epoch": 0.7205190179366493, + "ewc_loss": 0.03272148221731186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6360741938115098e-05, + "grad_norm": 24.425662994384766, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8512110114097595, + "num_tokens": 216159000.0, + "step": 5664 + }, + { + "epoch": 0.7206462282152398, + "ewc_loss": 0.03267935290932655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6339676221832633e-05, + "grad_norm": 24.408132553100586, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8517551422119141, + "num_tokens": 216204712.0, + "step": 5665 + }, + { + "epoch": 0.7207734384938304, + "ewc_loss": 0.03267974033951759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6339870853698812e-05, + "grad_norm": 24.412626266479492, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8510943651199341, + "num_tokens": 216235001.0, + "step": 5666 + }, + { + "epoch": 
0.7209006487724208, + "ewc_loss": 0.03271933272480965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6359666915377602e-05, + "grad_norm": 24.437589645385742, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8522695302963257, + "num_tokens": 216271383.0, + "step": 5667 + }, + { + "epoch": 0.7210278590510113, + "ewc_loss": 0.032724879682064056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6362439055228606e-05, + "grad_norm": 24.4532527923584, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8587459325790405, + "num_tokens": 216307923.0, + "step": 5668 + }, + { + "epoch": 0.7211550693296018, + "ewc_loss": 0.03271518647670746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.635759326745756e-05, + "grad_norm": 24.419004440307617, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.876901388168335, + "num_tokens": 216345147.0, + "step": 5669 + }, + { + "epoch": 0.7212822796081924, + "ewc_loss": 0.03273335471749306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6366677300538868e-05, + "grad_norm": 24.411531448364258, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8512195348739624, + "num_tokens": 216386991.0, + "step": 5670 + }, + { + "epoch": 0.7214094898867829, + "ewc_loss": 0.03272808715701103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6364043403882533e-05, + "grad_norm": 24.420133590698242, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8609402179718018, + "num_tokens": 216425534.0, + "step": 5671 + }, + { + "epoch": 0.7215367001653734, + "ewc_loss": 0.03269391879439354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.634695945540443e-05, + "grad_norm": 24.42072296142578, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.859695315361023, + "num_tokens": 216466686.0, + "step": 5672 + }, + { + "epoch": 0.7216639104439638, + "ewc_loss": 0.032736506313085556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.636825254536234e-05, + "grad_norm": 24.403318405151367, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8710340857505798, + "num_tokens": 216505245.0, + "step": 5673 + }, + { + "epoch": 0.7217911207225544, + "ewc_loss": 0.032750144600868225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6375071936636232e-05, + "grad_norm": 24.489595413208008, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8556338548660278, + "num_tokens": 216538024.0, + "step": 5674 + }, + { + "epoch": 0.7219183310011449, + "ewc_loss": 0.03273604437708855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6368021533708088e-05, + "grad_norm": 24.500843048095703, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8585957288742065, + "num_tokens": 216573722.0, + "step": 5675 + }, + { + "epoch": 0.7220455412797354, + "ewc_loss": 0.03279725834727287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6398629668401554e-05, + "grad_norm": 24.77667236328125, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.864580512046814, + "num_tokens": 216610362.0, + "step": 5676 + }, + { + "epoch": 0.7221727515583259, + "ewc_loss": 0.032704807817935944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6352403690689243e-05, + "grad_norm": 24.427162170410156, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8493304252624512, + "num_tokens": 216648732.0, + "step": 5677 + }, + { + "epoch": 0.7222999618369165, + "ewc_loss": 0.032738346606492996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6369172954000533e-05, + "grad_norm": 24.85063934326172, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8662766814231873, + "num_tokens": 216680324.0, + "step": 5678 + }, + { + "epoch": 0.7224271721155069, + "ewc_loss": 0.032687120139598846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6343559764209203e-05, + "grad_norm": 24.32843017578125, + "learning_rate": 1e-06, + "loss": 0.4353, + 
"mean_token_accuracy": 0.8624076247215271, + "num_tokens": 216712773.0, + "step": 5679 + }, + { + "epoch": 0.7225543823940974, + "ewc_loss": 0.032601457089185715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.630072802072391e-05, + "grad_norm": 24.58815574645996, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8506848216056824, + "num_tokens": 216752538.0, + "step": 5680 + }, + { + "epoch": 0.722681592672688, + "ewc_loss": 0.03277919068932533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6389594748034142e-05, + "grad_norm": 24.63869285583496, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8608551621437073, + "num_tokens": 216791795.0, + "step": 5681 + }, + { + "epoch": 0.7228088029512785, + "ewc_loss": 0.03267287835478783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.633643842069432e-05, + "grad_norm": 24.56300926208496, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8551246523857117, + "num_tokens": 216831729.0, + "step": 5682 + }, + { + "epoch": 0.722936013229869, + "ewc_loss": 0.03268727287650108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6343636161764152e-05, + "grad_norm": 24.575910568237305, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8590657711029053, + "num_tokens": 216869494.0, + "step": 5683 + }, + { + "epoch": 0.7230632235084595, + "ewc_loss": 0.032668981701135635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6334490283043124e-05, + "grad_norm": 24.55390167236328, + "learning_rate": 1e-06, + "loss": 0.5038, + "mean_token_accuracy": 0.8404109477996826, + "num_tokens": 216906672.0, + "step": 5684 + }, + { + "epoch": 0.72319043378705, + "ewc_loss": 0.032625578343868256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6312789739458822e-05, + "grad_norm": 24.471071243286133, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8660579919815063, + "num_tokens": 216941587.0, + "step": 5685 + }, + { + "epoch": 
0.7233176440656405, + "ewc_loss": 0.03269021213054657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6345105905202217e-05, + "grad_norm": 24.47504234313965, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.85115647315979, + "num_tokens": 216981812.0, + "step": 5686 + }, + { + "epoch": 0.723444854344231, + "ewc_loss": 0.03270236775279045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6351183148799464e-05, + "grad_norm": 24.567537307739258, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.868040919303894, + "num_tokens": 217024137.0, + "step": 5687 + }, + { + "epoch": 0.7235720646228215, + "ewc_loss": 0.032663583755493164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6331792721757665e-05, + "grad_norm": 24.433170318603516, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8709003329277039, + "num_tokens": 217063753.0, + "step": 5688 + }, + { + "epoch": 0.7236992749014121, + "ewc_loss": 0.032693054527044296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6346526535926387e-05, + "grad_norm": 24.49197769165039, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8574315309524536, + "num_tokens": 217105611.0, + "step": 5689 + }, + { + "epoch": 0.7238264851800026, + "ewc_loss": 0.03271501511335373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6357507774955593e-05, + "grad_norm": 24.473905563354492, + "learning_rate": 1e-06, + "loss": 0.5093, + "mean_token_accuracy": 0.8386249542236328, + "num_tokens": 217145989.0, + "step": 5690 + }, + { + "epoch": 0.723953695458593, + "ewc_loss": 0.03271547332406044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.635773696762044e-05, + "grad_norm": 24.4774112701416, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8774480819702148, + "num_tokens": 217183726.0, + "step": 5691 + }, + { + "epoch": 0.7240809057371835, + "ewc_loss": 0.03270402178168297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6352010788978077e-05, + "grad_norm": 24.488792419433594, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.856658935546875, + "num_tokens": 217226644.0, + "step": 5692 + }, + { + "epoch": 0.7242081160157741, + "ewc_loss": 0.03264904022216797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.632452040212229e-05, + "grad_norm": 24.487394332885742, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8442326188087463, + "num_tokens": 217263327.0, + "step": 5693 + }, + { + "epoch": 0.7243353262943646, + "ewc_loss": 0.03272213786840439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6361069356207736e-05, + "grad_norm": 24.510921478271484, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8634119629859924, + "num_tokens": 217300595.0, + "step": 5694 + }, + { + "epoch": 0.7244625365729551, + "ewc_loss": 0.032714199274778366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.63571003213292e-05, + "grad_norm": 24.540987014770508, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8446871638298035, + "num_tokens": 217336385.0, + "step": 5695 + }, + { + "epoch": 0.7245897468515456, + "ewc_loss": 0.03268005698919296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.634002910577692e-05, + "grad_norm": 24.498708724975586, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8652530908584595, + "num_tokens": 217374309.0, + "step": 5696 + }, + { + "epoch": 0.7247169571301361, + "ewc_loss": 0.03261289373040199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6306446923408657e-05, + "grad_norm": 24.505306243896484, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8481336832046509, + "num_tokens": 217408386.0, + "step": 5697 + }, + { + "epoch": 0.7248441674087266, + "ewc_loss": 0.03272644430398941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.636322122067213e-05, + "grad_norm": 24.521991729736328, + "learning_rate": 1e-06, + "loss": 0.5087, + 
"mean_token_accuracy": 0.8449082374572754, + "num_tokens": 217442855.0, + "step": 5698 + }, + { + "epoch": 0.7249713776873171, + "ewc_loss": 0.032614849507808685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6307425539707765e-05, + "grad_norm": 24.40103530883789, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8626368641853333, + "num_tokens": 217476892.0, + "step": 5699 + }, + { + "epoch": 0.7250985879659076, + "ewc_loss": 0.03274522349238396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6372612662962638e-05, + "grad_norm": 24.555021286010742, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8568820953369141, + "num_tokens": 217513569.0, + "step": 5700 + }, + { + "epoch": 0.7252257982444982, + "ewc_loss": 0.03273148834705353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.636574415897485e-05, + "grad_norm": 24.430633544921875, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8522390127182007, + "num_tokens": 217545585.0, + "step": 5701 + }, + { + "epoch": 0.7253530085230887, + "ewc_loss": 0.03274299576878548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6371497622458264e-05, + "grad_norm": 24.63595199584961, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8602678179740906, + "num_tokens": 217583156.0, + "step": 5702 + }, + { + "epoch": 0.7254802188016791, + "ewc_loss": 0.03271622955799103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6358115317416377e-05, + "grad_norm": 24.291526794433594, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8589372634887695, + "num_tokens": 217619630.0, + "step": 5703 + }, + { + "epoch": 0.7256074290802697, + "ewc_loss": 0.03273985907435417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6369929653592408e-05, + "grad_norm": 24.654541015625, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.869182288646698, + "num_tokens": 217653616.0, + "step": 5704 + }, + { + "epoch": 
0.7257346393588602, + "ewc_loss": 0.032816167920827866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6408084775321186e-05, + "grad_norm": 24.619979858398438, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8542098999023438, + "num_tokens": 217692758.0, + "step": 5705 + }, + { + "epoch": 0.7258618496374507, + "ewc_loss": 0.03265699744224548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6328498531947844e-05, + "grad_norm": 24.394023895263672, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8437185287475586, + "num_tokens": 217734175.0, + "step": 5706 + }, + { + "epoch": 0.7259890599160412, + "ewc_loss": 0.032855451107025146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6427726222900674e-05, + "grad_norm": 24.90450668334961, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8363800048828125, + "num_tokens": 217773372.0, + "step": 5707 + }, + { + "epoch": 0.7261162701946318, + "ewc_loss": 0.032794203609228134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6397101717302576e-05, + "grad_norm": 24.563304901123047, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8679103851318359, + "num_tokens": 217809174.0, + "step": 5708 + }, + { + "epoch": 0.7262434804732223, + "ewc_loss": 0.03262418881058693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6312094885506667e-05, + "grad_norm": 24.41475486755371, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8511589169502258, + "num_tokens": 217845882.0, + "step": 5709 + }, + { + "epoch": 0.7263706907518127, + "ewc_loss": 0.03283514082431793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6417570805060677e-05, + "grad_norm": 24.85651397705078, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8548259139060974, + "num_tokens": 217885798.0, + "step": 5710 + }, + { + "epoch": 0.7264979010304032, + "ewc_loss": 0.03276897221803665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.638448520679958e-05, + "grad_norm": 24.464412689208984, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.85484778881073, + "num_tokens": 217924967.0, + "step": 5711 + }, + { + "epoch": 0.7266251113089938, + "ewc_loss": 0.03267141059041023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6335705367964692e-05, + "grad_norm": 24.579240798950195, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8697695732116699, + "num_tokens": 217967381.0, + "step": 5712 + }, + { + "epoch": 0.7267523215875843, + "ewc_loss": 0.03274733945727348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6373669495806098e-05, + "grad_norm": 24.6254940032959, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.863247811794281, + "num_tokens": 218001097.0, + "step": 5713 + }, + { + "epoch": 0.7268795318661748, + "ewc_loss": 0.032681889832019806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6340944057446904e-05, + "grad_norm": 24.345674514770508, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8601375818252563, + "num_tokens": 218040705.0, + "step": 5714 + }, + { + "epoch": 0.7270067421447653, + "ewc_loss": 0.032737839967012405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.636892011447344e-05, + "grad_norm": 24.62071990966797, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8573105335235596, + "num_tokens": 218077549.0, + "step": 5715 + }, + { + "epoch": 0.7271339524233558, + "ewc_loss": 0.03279354050755501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.639677066123113e-05, + "grad_norm": 24.432077407836914, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8576577305793762, + "num_tokens": 218116251.0, + "step": 5716 + }, + { + "epoch": 0.7272611627019463, + "ewc_loss": 0.03277619928121567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6388099538744427e-05, + "grad_norm": 24.58844566345215, + "learning_rate": 1e-06, + "loss": 0.5339, + 
"mean_token_accuracy": 0.831637442111969, + "num_tokens": 218155798.0, + "step": 5717 + }, + { + "epoch": 0.7273883729805368, + "ewc_loss": 0.03278357535600662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6391788449254818e-05, + "grad_norm": 24.48773765563965, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8633842468261719, + "num_tokens": 218191277.0, + "step": 5718 + }, + { + "epoch": 0.7275155832591274, + "ewc_loss": 0.03274201974272728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6371010133298114e-05, + "grad_norm": 24.578250885009766, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8645128607749939, + "num_tokens": 218233165.0, + "step": 5719 + }, + { + "epoch": 0.7276427935377179, + "ewc_loss": 0.032782234251499176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.639111724216491e-05, + "grad_norm": 24.552396774291992, + "learning_rate": 1e-06, + "loss": 0.5193, + "mean_token_accuracy": 0.8381389379501343, + "num_tokens": 218271263.0, + "step": 5720 + }, + { + "epoch": 0.7277700038163084, + "ewc_loss": 0.03271407261490822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6357036656700075e-05, + "grad_norm": 24.570568084716797, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8592715263366699, + "num_tokens": 218309081.0, + "step": 5721 + }, + { + "epoch": 0.7278972140948988, + "ewc_loss": 0.03277956694364548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.638978392293211e-05, + "grad_norm": 24.503820419311523, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8538395762443542, + "num_tokens": 218348314.0, + "step": 5722 + }, + { + "epoch": 0.7280244243734894, + "ewc_loss": 0.03273459151387215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6367295756936073e-05, + "grad_norm": 24.66698455810547, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8619242906570435, + "num_tokens": 218387556.0, + "step": 5723 + }, + { + "epoch": 
0.7281516346520799, + "ewc_loss": 0.03281034901738167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6405174392275512e-05, + "grad_norm": 24.551668167114258, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8500429391860962, + "num_tokens": 218423130.0, + "step": 5724 + }, + { + "epoch": 0.7282788449306704, + "ewc_loss": 0.03271704912185669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6358524590032175e-05, + "grad_norm": 24.65372657775879, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8402900099754333, + "num_tokens": 218464079.0, + "step": 5725 + }, + { + "epoch": 0.7284060552092609, + "ewc_loss": 0.032733254134655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.636662636883557e-05, + "grad_norm": 24.49770164489746, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8657890558242798, + "num_tokens": 218501307.0, + "step": 5726 + }, + { + "epoch": 0.7285332654878515, + "ewc_loss": 0.03268396854400635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6341984519385733e-05, + "grad_norm": 24.53364372253418, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8664593696594238, + "num_tokens": 218537710.0, + "step": 5727 + }, + { + "epoch": 0.7286604757664419, + "ewc_loss": 0.032737620174884796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6368810975109227e-05, + "grad_norm": 24.58309555053711, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8537147641181946, + "num_tokens": 218576026.0, + "step": 5728 + }, + { + "epoch": 0.7287876860450324, + "ewc_loss": 0.032687075436115265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.634353793633636e-05, + "grad_norm": 24.472412109375, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.855238139629364, + "num_tokens": 218612491.0, + "step": 5729 + }, + { + "epoch": 0.7289148963236229, + "ewc_loss": 0.03269676864147186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6348383724107407e-05, 
+ "grad_norm": 24.579193115234375, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.865787923336029, + "num_tokens": 218642541.0, + "step": 5730 + }, + { + "epoch": 0.7290421066022135, + "ewc_loss": 0.032776348292827606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6388174117309973e-05, + "grad_norm": 24.624935150146484, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.844001829624176, + "num_tokens": 218678514.0, + "step": 5731 + }, + { + "epoch": 0.729169316880804, + "ewc_loss": 0.03269530087709427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6347650671377778e-05, + "grad_norm": 24.536067962646484, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8628841638565063, + "num_tokens": 218715415.0, + "step": 5732 + }, + { + "epoch": 0.7292965271593945, + "ewc_loss": 0.03268808871507645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6344043615390547e-05, + "grad_norm": 24.488544464111328, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8667637705802917, + "num_tokens": 218754422.0, + "step": 5733 + }, + { + "epoch": 0.7294237374379849, + "ewc_loss": 0.03277238458395004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6386191418860108e-05, + "grad_norm": 24.69382667541504, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8492189049720764, + "num_tokens": 218787141.0, + "step": 5734 + }, + { + "epoch": 0.7295509477165755, + "ewc_loss": 0.032755423337221146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6377711290260777e-05, + "grad_norm": 24.459943771362305, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.872897744178772, + "num_tokens": 218824764.0, + "step": 5735 + }, + { + "epoch": 0.729678157995166, + "ewc_loss": 0.032702989876270294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.635149419598747e-05, + "grad_norm": 24.5361328125, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8560162782669067, 
+ "num_tokens": 218866142.0, + "step": 5736 + }, + { + "epoch": 0.7298053682737565, + "ewc_loss": 0.032767027616500854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6383513866458088e-05, + "grad_norm": 24.53000259399414, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.853526771068573, + "num_tokens": 218909208.0, + "step": 5737 + }, + { + "epoch": 0.7299325785523471, + "ewc_loss": 0.03272724524140358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6363623217330314e-05, + "grad_norm": 24.50912094116211, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8653637170791626, + "num_tokens": 218942469.0, + "step": 5738 + }, + { + "epoch": 0.7300597888309376, + "ewc_loss": 0.03277011588215828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.63850581884617e-05, + "grad_norm": 24.451292037963867, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8513100147247314, + "num_tokens": 218982934.0, + "step": 5739 + }, + { + "epoch": 0.730186999109528, + "ewc_loss": 0.032752279192209244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6376139683416113e-05, + "grad_norm": 24.62238883972168, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8555240035057068, + "num_tokens": 219027055.0, + "step": 5740 + }, + { + "epoch": 0.7303142093881185, + "ewc_loss": 0.032768696546554565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6384348782594316e-05, + "grad_norm": 24.4794864654541, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8587722778320312, + "num_tokens": 219069639.0, + "step": 5741 + }, + { + "epoch": 0.7304414196667091, + "ewc_loss": 0.03267977014183998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.633988540561404e-05, + "grad_norm": 24.525190353393555, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8516942262649536, + "num_tokens": 219111774.0, + "step": 5742 + }, + { + "epoch": 0.7305686299452996, + "ewc_loss": 0.032784461975097656, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.639223046367988e-05, + "grad_norm": 24.53778648376465, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8450199961662292, + "num_tokens": 219156962.0, + "step": 5743 + }, + { + "epoch": 0.7306958402238901, + "ewc_loss": 0.03275877982378006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.637939021748025e-05, + "grad_norm": 24.454862594604492, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8676047325134277, + "num_tokens": 219193352.0, + "step": 5744 + }, + { + "epoch": 0.7308230505024806, + "ewc_loss": 0.03276410326361656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6382051398977637e-05, + "grad_norm": 24.564990997314453, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.844398558139801, + "num_tokens": 219230041.0, + "step": 5745 + }, + { + "epoch": 0.7309502607810711, + "ewc_loss": 0.032753847539424896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.637692366784904e-05, + "grad_norm": 24.44519805908203, + "learning_rate": 1e-06, + "loss": 0.522, + "mean_token_accuracy": 0.8360799551010132, + "num_tokens": 219262748.0, + "step": 5746 + }, + { + "epoch": 0.7310774710596616, + "ewc_loss": 0.032767489552497864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.638374487811234e-05, + "grad_norm": 24.561485290527344, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8695075511932373, + "num_tokens": 219296556.0, + "step": 5747 + }, + { + "epoch": 0.7312046813382521, + "ewc_loss": 0.03282882645726204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.641441303945612e-05, + "grad_norm": 24.501527786254883, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.845782995223999, + "num_tokens": 219330801.0, + "step": 5748 + }, + { + "epoch": 0.7313318916168426, + "ewc_loss": 0.03270766884088516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6353835235349834e-05, + "grad_norm": 24.50925064086914, + "learning_rate": 
1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8586549758911133, + "num_tokens": 219367470.0, + "step": 5749 + }, + { + "epoch": 0.7314591018954332, + "ewc_loss": 0.03283824026584625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6419120584032498e-05, + "grad_norm": 24.64725112915039, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8446693420410156, + "num_tokens": 219405291.0, + "step": 5750 + }, + { + "epoch": 0.7315863121740237, + "ewc_loss": 0.03283779323101044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6418896848335862e-05, + "grad_norm": 24.558927536010742, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8594080805778503, + "num_tokens": 219443365.0, + "step": 5751 + }, + { + "epoch": 0.7317135224526141, + "ewc_loss": 0.03280215710401535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6401078028138727e-05, + "grad_norm": 24.56639289855957, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8601665496826172, + "num_tokens": 219477524.0, + "step": 5752 + }, + { + "epoch": 0.7318407327312046, + "ewc_loss": 0.03283483907580376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6417419828940183e-05, + "grad_norm": 24.488325119018555, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8495821952819824, + "num_tokens": 219514576.0, + "step": 5753 + }, + { + "epoch": 0.7319679430097952, + "ewc_loss": 0.03285868838429451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6429343304480426e-05, + "grad_norm": 24.644210815429688, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8582776784896851, + "num_tokens": 219558253.0, + "step": 5754 + }, + { + "epoch": 0.7320951532883857, + "ewc_loss": 0.03287452459335327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.643726136535406e-05, + "grad_norm": 24.508121490478516, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.853495717048645, + "num_tokens": 219601156.0, + "step": 5755 + }, + 
{ + "epoch": 0.7322223635669762, + "ewc_loss": 0.03283383697271347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6416917787864804e-05, + "grad_norm": 24.647937774658203, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8474463224411011, + "num_tokens": 219638241.0, + "step": 5756 + }, + { + "epoch": 0.7323495738455668, + "ewc_loss": 0.03288192301988602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6440961189800873e-05, + "grad_norm": 24.551315307617188, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.858582615852356, + "num_tokens": 219678168.0, + "step": 5757 + }, + { + "epoch": 0.7324767841241572, + "ewc_loss": 0.03280361369252205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.640180744288955e-05, + "grad_norm": 24.559736251831055, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8556766510009766, + "num_tokens": 219718115.0, + "step": 5758 + }, + { + "epoch": 0.7326039944027477, + "ewc_loss": 0.032881394028663635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.644069743633736e-05, + "grad_norm": 24.559389114379883, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8579264879226685, + "num_tokens": 219754221.0, + "step": 5759 + }, + { + "epoch": 0.7327312046813382, + "ewc_loss": 0.03282710909843445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6413554476457648e-05, + "grad_norm": 24.498565673828125, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8633914589881897, + "num_tokens": 219786880.0, + "step": 5760 + }, + { + "epoch": 0.7328584149599288, + "ewc_loss": 0.032870590686798096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6435295037808828e-05, + "grad_norm": 24.517723083496094, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8621544241905212, + "num_tokens": 219830585.0, + "step": 5761 + }, + { + "epoch": 0.7329856252385193, + "ewc_loss": 0.032858897000551224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.642944880586583e-05, + "grad_norm": 24.478130340576172, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8612203001976013, + "num_tokens": 219867249.0, + "step": 5762 + }, + { + "epoch": 0.7331128355171098, + "ewc_loss": 0.03279262036085129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6396310456912033e-05, + "grad_norm": 24.52980613708496, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.849234938621521, + "num_tokens": 219906046.0, + "step": 5763 + }, + { + "epoch": 0.7332400457957003, + "ewc_loss": 0.03286098316311836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6430491086794063e-05, + "grad_norm": 24.58139991760254, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8490660190582275, + "num_tokens": 219947466.0, + "step": 5764 + }, + { + "epoch": 0.7333672560742908, + "ewc_loss": 0.03285358101129532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.642679126234725e-05, + "grad_norm": 24.454408645629883, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8587733507156372, + "num_tokens": 219985698.0, + "step": 5765 + }, + { + "epoch": 0.7334944663528813, + "ewc_loss": 0.03278781473636627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.639390757190995e-05, + "grad_norm": 24.538000106811523, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8583614826202393, + "num_tokens": 220026158.0, + "step": 5766 + }, + { + "epoch": 0.7336216766314718, + "ewc_loss": 0.03289686515927315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6448431779281236e-05, + "grad_norm": 24.556596755981445, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8567055463790894, + "num_tokens": 220071849.0, + "step": 5767 + }, + { + "epoch": 0.7337488869100623, + "ewc_loss": 0.03281255438923836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.640627669985406e-05, + "grad_norm": 24.603618621826172, + "learning_rate": 1e-06, + "loss": 0.4427, + 
"mean_token_accuracy": 0.8645867109298706, + "num_tokens": 220110255.0, + "step": 5768 + }, + { + "epoch": 0.7338760971886529, + "ewc_loss": 0.03281411528587341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6407057046308182e-05, + "grad_norm": 24.52117347717285, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8433687686920166, + "num_tokens": 220144962.0, + "step": 5769 + }, + { + "epoch": 0.7340033074672434, + "ewc_loss": 0.03284236788749695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.642118331801612e-05, + "grad_norm": 24.63665199279785, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8480439186096191, + "num_tokens": 220182118.0, + "step": 5770 + }, + { + "epoch": 0.7341305177458338, + "ewc_loss": 0.032800111919641495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6400055756093934e-05, + "grad_norm": 24.548477172851562, + "learning_rate": 1e-06, + "loss": 0.5061, + "mean_token_accuracy": 0.8484631180763245, + "num_tokens": 220218954.0, + "step": 5771 + }, + { + "epoch": 0.7342577280244243, + "ewc_loss": 0.032799553126096725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.639977745071519e-05, + "grad_norm": 24.662033081054688, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8569298982620239, + "num_tokens": 220259116.0, + "step": 5772 + }, + { + "epoch": 0.7343849383030149, + "ewc_loss": 0.03278108313679695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.639054244151339e-05, + "grad_norm": 24.496763229370117, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8535284996032715, + "num_tokens": 220301742.0, + "step": 5773 + }, + { + "epoch": 0.7345121485816054, + "ewc_loss": 0.03275343403220177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6376716303057037e-05, + "grad_norm": 24.556472778320312, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8619438409805298, + "num_tokens": 220342572.0, + "step": 5774 + }, + { + "epoch": 
0.7346393588601959, + "ewc_loss": 0.03279520571231842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6397603758377954e-05, + "grad_norm": 24.49237823486328, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8629363775253296, + "num_tokens": 220376373.0, + "step": 5775 + }, + { + "epoch": 0.7347665691387865, + "ewc_loss": 0.03282557427883148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.641278686292935e-05, + "grad_norm": 24.62218475341797, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8733105063438416, + "num_tokens": 220413290.0, + "step": 5776 + }, + { + "epoch": 0.7348937794173769, + "ewc_loss": 0.03281410038471222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6407049770350568e-05, + "grad_norm": 24.456825256347656, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.853869616985321, + "num_tokens": 220453219.0, + "step": 5777 + }, + { + "epoch": 0.7350209896959674, + "ewc_loss": 0.032759372144937515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6379686712753028e-05, + "grad_norm": 24.58316993713379, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8586019277572632, + "num_tokens": 220488369.0, + "step": 5778 + }, + { + "epoch": 0.7351481999745579, + "ewc_loss": 0.03281514346599579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6407571820309386e-05, + "grad_norm": 24.471208572387695, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8613495230674744, + "num_tokens": 220527798.0, + "step": 5779 + }, + { + "epoch": 0.7352754102531485, + "ewc_loss": 0.03273520991206169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6367604985134676e-05, + "grad_norm": 24.654083251953125, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8755451440811157, + "num_tokens": 220565398.0, + "step": 5780 + }, + { + "epoch": 0.735402620531739, + "ewc_loss": 0.03284236043691635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6421179680037312e-05, + "grad_norm": 24.460020065307617, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8680989742279053, + "num_tokens": 220603903.0, + "step": 5781 + }, + { + "epoch": 0.7355298308103295, + "ewc_loss": 0.03271711245179176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6358555512852035e-05, + "grad_norm": 24.4642391204834, + "learning_rate": 1e-06, + "loss": 0.5082, + "mean_token_accuracy": 0.8405359983444214, + "num_tokens": 220644168.0, + "step": 5782 + }, + { + "epoch": 0.7356570410889199, + "ewc_loss": 0.032848771661520004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6424386558355764e-05, + "grad_norm": 24.697660446166992, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8700354099273682, + "num_tokens": 220679709.0, + "step": 5783 + }, + { + "epoch": 0.7357842513675105, + "ewc_loss": 0.03284741938114166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6423709894297644e-05, + "grad_norm": 24.406410217285156, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8519108295440674, + "num_tokens": 220721184.0, + "step": 5784 + }, + { + "epoch": 0.735911461646101, + "ewc_loss": 0.03277318924665451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6386595234507695e-05, + "grad_norm": 24.62310218811035, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8618136644363403, + "num_tokens": 220755027.0, + "step": 5785 + }, + { + "epoch": 0.7360386719246915, + "ewc_loss": 0.032898351550102234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6449175745947286e-05, + "grad_norm": 24.54055404663086, + "learning_rate": 1e-06, + "loss": 0.528, + "mean_token_accuracy": 0.8338746428489685, + "num_tokens": 220793597.0, + "step": 5786 + }, + { + "epoch": 0.736165882203282, + "ewc_loss": 0.03282877057790756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.641438575461507e-05, + "grad_norm": 24.54824447631836, + "learning_rate": 1e-06, + "loss": 0.4572, + 
"mean_token_accuracy": 0.8566452860832214, + "num_tokens": 220833549.0, + "step": 5787 + }, + { + "epoch": 0.7362930924818726, + "ewc_loss": 0.032837484031915665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.641874223423656e-05, + "grad_norm": 24.524276733398438, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8639140725135803, + "num_tokens": 220865551.0, + "step": 5788 + }, + { + "epoch": 0.736420302760463, + "ewc_loss": 0.03285869210958481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6429346942459233e-05, + "grad_norm": 24.62413215637207, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8706575632095337, + "num_tokens": 220911626.0, + "step": 5789 + }, + { + "epoch": 0.7365475130390535, + "ewc_loss": 0.03285035118460655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6425175999756902e-05, + "grad_norm": 24.491024017333984, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8596614599227905, + "num_tokens": 220949839.0, + "step": 5790 + }, + { + "epoch": 0.736674723317644, + "ewc_loss": 0.032803770154714584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.64018856594339e-05, + "grad_norm": 24.561119079589844, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8407295942306519, + "num_tokens": 220984069.0, + "step": 5791 + }, + { + "epoch": 0.7368019335962346, + "ewc_loss": 0.03285462036728859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6427309674327262e-05, + "grad_norm": 24.533357620239258, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8776525855064392, + "num_tokens": 221020999.0, + "step": 5792 + }, + { + "epoch": 0.7369291438748251, + "ewc_loss": 0.032896775752305984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.644838812353555e-05, + "grad_norm": 24.640573501586914, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8484551906585693, + "num_tokens": 221051082.0, + "step": 5793 + }, + { + "epoch": 
0.7370563541534156, + "ewc_loss": 0.03289364278316498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6446821973659098e-05, + "grad_norm": 24.486791610717773, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8403352499008179, + "num_tokens": 221095444.0, + "step": 5794 + }, + { + "epoch": 0.737183564432006, + "ewc_loss": 0.032908327877521515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.645416341489181e-05, + "grad_norm": 24.66301727294922, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.859772264957428, + "num_tokens": 221136433.0, + "step": 5795 + }, + { + "epoch": 0.7373107747105966, + "ewc_loss": 0.03295856714248657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6479283658554778e-05, + "grad_norm": 24.492403030395508, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.850070595741272, + "num_tokens": 221171278.0, + "step": 5796 + }, + { + "epoch": 0.7374379849891871, + "ewc_loss": 0.03286518529057503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.643259201955516e-05, + "grad_norm": 24.628093719482422, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8558269143104553, + "num_tokens": 221209380.0, + "step": 5797 + }, + { + "epoch": 0.7375651952677776, + "ewc_loss": 0.032953064888715744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6476533346576616e-05, + "grad_norm": 24.497995376586914, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8560324311256409, + "num_tokens": 221253076.0, + "step": 5798 + }, + { + "epoch": 0.7376924055463682, + "ewc_loss": 0.032903458923101425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6451729607069865e-05, + "grad_norm": 24.604312896728516, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8524873852729797, + "num_tokens": 221290348.0, + "step": 5799 + }, + { + "epoch": 0.7378196158249587, + "ewc_loss": 0.0329919159412384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6495958334417082e-05, + "grad_norm": 24.55120086669922, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8635672330856323, + "num_tokens": 221329635.0, + "step": 5800 + }, + { + "epoch": 0.7379468261035491, + "ewc_loss": 0.03289081156253815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6445404980913736e-05, + "grad_norm": 24.46704864501953, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8646773099899292, + "num_tokens": 221368521.0, + "step": 5801 + }, + { + "epoch": 0.7380740363821396, + "ewc_loss": 0.03297683596611023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6488418623339385e-05, + "grad_norm": 24.62592315673828, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8698036074638367, + "num_tokens": 221407090.0, + "step": 5802 + }, + { + "epoch": 0.7382012466607302, + "ewc_loss": 0.03303862735629082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.651931415835861e-05, + "grad_norm": 24.609657287597656, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8671442270278931, + "num_tokens": 221450340.0, + "step": 5803 + }, + { + "epoch": 0.7383284569393207, + "ewc_loss": 0.03294120728969574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6470603441121057e-05, + "grad_norm": 24.512243270874023, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8587633371353149, + "num_tokens": 221491996.0, + "step": 5804 + }, + { + "epoch": 0.7384556672179112, + "ewc_loss": 0.03295155242085457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.647577664698474e-05, + "grad_norm": 24.533458709716797, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8662295341491699, + "num_tokens": 221531466.0, + "step": 5805 + }, + { + "epoch": 0.7385828774965018, + "ewc_loss": 0.032938238233327866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6469119145767763e-05, + "grad_norm": 24.631000518798828, + "learning_rate": 1e-06, + "loss": 0.5083, + 
"mean_token_accuracy": 0.8384045362472534, + "num_tokens": 221567003.0, + "step": 5806 + }, + { + "epoch": 0.7387100877750922, + "ewc_loss": 0.03296036273241043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.648018223932013e-05, + "grad_norm": 24.544191360473633, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8703432679176331, + "num_tokens": 221607396.0, + "step": 5807 + }, + { + "epoch": 0.7388372980536827, + "ewc_loss": 0.032897185534238815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6448591850348748e-05, + "grad_norm": 24.61557388305664, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8497661352157593, + "num_tokens": 221644776.0, + "step": 5808 + }, + { + "epoch": 0.7389645083322732, + "ewc_loss": 0.03294587880373001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.647293902351521e-05, + "grad_norm": 24.576059341430664, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8711638450622559, + "num_tokens": 221685643.0, + "step": 5809 + }, + { + "epoch": 0.7390917186108638, + "ewc_loss": 0.03296630084514618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6483150830026716e-05, + "grad_norm": 24.80403709411621, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8506706357002258, + "num_tokens": 221724673.0, + "step": 5810 + }, + { + "epoch": 0.7392189288894543, + "ewc_loss": 0.03292186185717583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6460930055473e-05, + "grad_norm": 24.593820571899414, + "learning_rate": 1e-06, + "loss": 0.5404, + "mean_token_accuracy": 0.8333747386932373, + "num_tokens": 221766287.0, + "step": 5811 + }, + { + "epoch": 0.7393461391680448, + "ewc_loss": 0.032883819192647934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.644190888328012e-05, + "grad_norm": 24.792726516723633, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8573907613754272, + "num_tokens": 221805357.0, + "step": 5812 + }, + { + "epoch": 
0.7394733494466353, + "ewc_loss": 0.03293601796030998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.64680095622316e-05, + "grad_norm": 24.611053466796875, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8560128211975098, + "num_tokens": 221838513.0, + "step": 5813 + }, + { + "epoch": 0.7396005597252258, + "ewc_loss": 0.03282680734992027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6413403500337154e-05, + "grad_norm": 24.571044921875, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8453717827796936, + "num_tokens": 221882808.0, + "step": 5814 + }, + { + "epoch": 0.7397277700038163, + "ewc_loss": 0.032851025462150574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.642551251279656e-05, + "grad_norm": 24.658361434936523, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8703716993331909, + "num_tokens": 221918505.0, + "step": 5815 + }, + { + "epoch": 0.7398549802824068, + "ewc_loss": 0.03294310346245766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6471551134600304e-05, + "grad_norm": 24.631790161132812, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8581734299659729, + "num_tokens": 221955983.0, + "step": 5816 + }, + { + "epoch": 0.7399821905609973, + "ewc_loss": 0.032864760607481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6432381016784348e-05, + "grad_norm": 24.60568618774414, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8567840456962585, + "num_tokens": 221997363.0, + "step": 5817 + }, + { + "epoch": 0.7401094008395879, + "ewc_loss": 0.03286905959248543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6434529243269935e-05, + "grad_norm": 24.66095733642578, + "learning_rate": 1e-06, + "loss": 0.5009, + "mean_token_accuracy": 0.8437830805778503, + "num_tokens": 222039302.0, + "step": 5818 + }, + { + "epoch": 0.7402366111181784, + "ewc_loss": 0.03288480266928673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6442401829408482e-05, 
+ "grad_norm": 24.59128761291504, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8562241196632385, + "num_tokens": 222075779.0, + "step": 5819 + }, + { + "epoch": 0.7403638213967688, + "ewc_loss": 0.03288208693265915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6441043044324033e-05, + "grad_norm": 24.61850357055664, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.875762939453125, + "num_tokens": 222115120.0, + "step": 5820 + }, + { + "epoch": 0.7404910316753593, + "ewc_loss": 0.032888054847717285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6444028005935252e-05, + "grad_norm": 24.62413215637207, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8500944375991821, + "num_tokens": 222156570.0, + "step": 5821 + }, + { + "epoch": 0.7406182419539499, + "ewc_loss": 0.03291941434144974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6459707694593817e-05, + "grad_norm": 24.608835220336914, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8568666577339172, + "num_tokens": 222189487.0, + "step": 5822 + }, + { + "epoch": 0.7407454522325404, + "ewc_loss": 0.032895736396312714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6447867892566137e-05, + "grad_norm": 24.654178619384766, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8568368554115295, + "num_tokens": 222223163.0, + "step": 5823 + }, + { + "epoch": 0.7408726625111309, + "ewc_loss": 0.032938238233327866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6469119145767763e-05, + "grad_norm": 24.69199562072754, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.869034469127655, + "num_tokens": 222259042.0, + "step": 5824 + }, + { + "epoch": 0.7409998727897215, + "ewc_loss": 0.03287282586097717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.643641371629201e-05, + "grad_norm": 24.709331512451172, + "learning_rate": 1e-06, + "loss": 0.5311, + "mean_token_accuracy": 
0.8386541604995728, + "num_tokens": 222300497.0, + "step": 5825 + }, + { + "epoch": 0.7411270830683119, + "ewc_loss": 0.03292520344257355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.646260170673486e-05, + "grad_norm": 24.7156982421875, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8533575534820557, + "num_tokens": 222333630.0, + "step": 5826 + }, + { + "epoch": 0.7412542933469024, + "ewc_loss": 0.03281363472342491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6406816939706914e-05, + "grad_norm": 24.568437576293945, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8529768586158752, + "num_tokens": 222371715.0, + "step": 5827 + }, + { + "epoch": 0.7413815036254929, + "ewc_loss": 0.032868850976228714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6434425560873933e-05, + "grad_norm": 24.622726440429688, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8590909242630005, + "num_tokens": 222412044.0, + "step": 5828 + }, + { + "epoch": 0.7415087139040835, + "ewc_loss": 0.0329117514193058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6455875083920546e-05, + "grad_norm": 24.618637084960938, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8603339195251465, + "num_tokens": 222452668.0, + "step": 5829 + }, + { + "epoch": 0.741635924182674, + "ewc_loss": 0.032909948378801346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.645497468416579e-05, + "grad_norm": 24.62486457824707, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8597190380096436, + "num_tokens": 222485631.0, + "step": 5830 + }, + { + "epoch": 0.7417631344612645, + "ewc_loss": 0.0329669825732708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.648349098104518e-05, + "grad_norm": 24.657712936401367, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.87275630235672, + "num_tokens": 222522967.0, + "step": 5831 + }, + { + "epoch": 0.7418903447398549, + "ewc_loss": 
0.032892338931560516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6446168956463225e-05, + "grad_norm": 24.598495483398438, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8483975529670715, + "num_tokens": 222563258.0, + "step": 5832 + }, + { + "epoch": 0.7420175550184455, + "ewc_loss": 0.03293218836188316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6466094166389666e-05, + "grad_norm": 24.599794387817383, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8519816398620605, + "num_tokens": 222599248.0, + "step": 5833 + }, + { + "epoch": 0.742144765297036, + "ewc_loss": 0.03299775347113609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.649887599342037e-05, + "grad_norm": 24.707304000854492, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8647701740264893, + "num_tokens": 222642575.0, + "step": 5834 + }, + { + "epoch": 0.7422719755756265, + "ewc_loss": 0.032973211258649826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.648660509090405e-05, + "grad_norm": 24.530860900878906, + "learning_rate": 1e-06, + "loss": 0.527, + "mean_token_accuracy": 0.8335033059120178, + "num_tokens": 222687330.0, + "step": 5835 + }, + { + "epoch": 0.742399185854217, + "ewc_loss": 0.03292658552527428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6463292922708206e-05, + "grad_norm": 24.602184295654297, + "learning_rate": 1e-06, + "loss": 0.5392, + "mean_token_accuracy": 0.8321690559387207, + "num_tokens": 222720020.0, + "step": 5836 + }, + { + "epoch": 0.7425263961328076, + "ewc_loss": 0.032994579523801804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6497289834660478e-05, + "grad_norm": 24.602680206298828, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8709793090820312, + "num_tokens": 222758634.0, + "step": 5837 + }, + { + "epoch": 0.742653606411398, + "ewc_loss": 0.03289863094687462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.644931580813136e-05, + "grad_norm": 
24.654735565185547, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8645877838134766, + "num_tokens": 222794823.0, + "step": 5838 + }, + { + "epoch": 0.7427808166899885, + "ewc_loss": 0.03299955278635025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6499776393175125e-05, + "grad_norm": 24.64244842529297, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8571030497550964, + "num_tokens": 222833836.0, + "step": 5839 + }, + { + "epoch": 0.742908026968579, + "ewc_loss": 0.032911550253629684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.645577503950335e-05, + "grad_norm": 24.58738899230957, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8595068454742432, + "num_tokens": 222875712.0, + "step": 5840 + }, + { + "epoch": 0.7430352372471696, + "ewc_loss": 0.03298094868659973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.649047408136539e-05, + "grad_norm": 24.700815200805664, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8466588258743286, + "num_tokens": 222915494.0, + "step": 5841 + }, + { + "epoch": 0.7431624475257601, + "ewc_loss": 0.03295132517814636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6475662050652318e-05, + "grad_norm": 24.61048698425293, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8736963868141174, + "num_tokens": 222951675.0, + "step": 5842 + }, + { + "epoch": 0.7432896578043506, + "ewc_loss": 0.03292776644229889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6463884094264358e-05, + "grad_norm": 24.70522689819336, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8523412942886353, + "num_tokens": 222993245.0, + "step": 5843 + }, + { + "epoch": 0.743416868082941, + "ewc_loss": 0.03296242654323578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.648121360631194e-05, + "grad_norm": 24.6666259765625, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8638644218444824, + "num_tokens": 
223029876.0, + "step": 5844 + }, + { + "epoch": 0.7435440783615316, + "ewc_loss": 0.03292354941368103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6461774066556245e-05, + "grad_norm": 24.71271514892578, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.863362193107605, + "num_tokens": 223072550.0, + "step": 5845 + }, + { + "epoch": 0.7436712886401221, + "ewc_loss": 0.032897621393203735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6448810129077174e-05, + "grad_norm": 24.662919998168945, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8554282188415527, + "num_tokens": 223112892.0, + "step": 5846 + }, + { + "epoch": 0.7437984989187126, + "ewc_loss": 0.032918527722358704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6459263861179352e-05, + "grad_norm": 24.732513427734375, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8631154894828796, + "num_tokens": 223157093.0, + "step": 5847 + }, + { + "epoch": 0.7439257091973032, + "ewc_loss": 0.0328928641974926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.644643270992674e-05, + "grad_norm": 24.67078399658203, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8483643531799316, + "num_tokens": 223190756.0, + "step": 5848 + }, + { + "epoch": 0.7440529194758937, + "ewc_loss": 0.03287084400653839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6435422367067076e-05, + "grad_norm": 24.759939193725586, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8484541177749634, + "num_tokens": 223226013.0, + "step": 5849 + }, + { + "epoch": 0.7441801297544841, + "ewc_loss": 0.032867301255464554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6433650671388023e-05, + "grad_norm": 24.723737716674805, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8710002899169922, + "num_tokens": 223262858.0, + "step": 5850 + }, + { + "epoch": 0.7443073400330746, + "ewc_loss": 0.03286075219511986, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.643037649046164e-05, + "grad_norm": 24.61617660522461, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8612253665924072, + "num_tokens": 223310528.0, + "step": 5851 + }, + { + "epoch": 0.7444345503116652, + "ewc_loss": 0.032797954976558685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.639897709537763e-05, + "grad_norm": 24.68604850769043, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8587867021560669, + "num_tokens": 223345021.0, + "step": 5852 + }, + { + "epoch": 0.7445617605902557, + "ewc_loss": 0.0329158790409565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.645793963689357e-05, + "grad_norm": 24.839595794677734, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8682597279548645, + "num_tokens": 223382055.0, + "step": 5853 + }, + { + "epoch": 0.7446889708688462, + "ewc_loss": 0.03277469053864479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6387344658141956e-05, + "grad_norm": 24.453649520874023, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8473179340362549, + "num_tokens": 223418441.0, + "step": 5854 + }, + { + "epoch": 0.7448161811474368, + "ewc_loss": 0.03281877562403679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6409387171734124e-05, + "grad_norm": 24.865713119506836, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8563165664672852, + "num_tokens": 223458150.0, + "step": 5855 + }, + { + "epoch": 0.7449433914260272, + "ewc_loss": 0.032955385744571686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6477692042826675e-05, + "grad_norm": 24.593624114990234, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8546104431152344, + "num_tokens": 223496916.0, + "step": 5856 + }, + { + "epoch": 0.7450706017046177, + "ewc_loss": 0.03280385211110115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.640192567720078e-05, + "grad_norm": 24.695268630981445, + 
"learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8639652132987976, + "num_tokens": 223534808.0, + "step": 5857 + }, + { + "epoch": 0.7451978119832082, + "ewc_loss": 0.032885365188121796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6442681953776628e-05, + "grad_norm": 24.698238372802734, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8668437600135803, + "num_tokens": 223571255.0, + "step": 5858 + }, + { + "epoch": 0.7453250222617988, + "ewc_loss": 0.03285019099712372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6425095964223146e-05, + "grad_norm": 24.581314086914062, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8617771863937378, + "num_tokens": 223616608.0, + "step": 5859 + }, + { + "epoch": 0.7454522325403893, + "ewc_loss": 0.03285755589604378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6428777598775923e-05, + "grad_norm": 24.68063735961914, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8576146960258484, + "num_tokens": 223656188.0, + "step": 5860 + }, + { + "epoch": 0.7455794428189798, + "ewc_loss": 0.03290450572967529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6452253476018086e-05, + "grad_norm": 24.611740112304688, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.852996826171875, + "num_tokens": 223693787.0, + "step": 5861 + }, + { + "epoch": 0.7457066530975703, + "ewc_loss": 0.03285960480570793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.642980168981012e-05, + "grad_norm": 24.574026107788086, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8693183660507202, + "num_tokens": 223729350.0, + "step": 5862 + }, + { + "epoch": 0.7458338633761608, + "ewc_loss": 0.03286314755678177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6431573385489173e-05, + "grad_norm": 24.683727264404297, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8464906811714172, + "num_tokens": 223770846.0, + 
"step": 5863 + }, + { + "epoch": 0.7459610736547513, + "ewc_loss": 0.032903432846069336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.645171687414404e-05, + "grad_norm": 24.616586685180664, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8569300174713135, + "num_tokens": 223814194.0, + "step": 5864 + }, + { + "epoch": 0.7460882839333418, + "ewc_loss": 0.03288424760103226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.644212352402974e-05, + "grad_norm": 24.61797332763672, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8506993651390076, + "num_tokens": 223849141.0, + "step": 5865 + }, + { + "epoch": 0.7462154942119323, + "ewc_loss": 0.03288271278142929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6441355910501443e-05, + "grad_norm": 24.604923248291016, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.879592776298523, + "num_tokens": 223888654.0, + "step": 5866 + }, + { + "epoch": 0.7463427044905229, + "ewc_loss": 0.032930757850408554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6465379303554073e-05, + "grad_norm": 24.70248794555664, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8611726760864258, + "num_tokens": 223924509.0, + "step": 5867 + }, + { + "epoch": 0.7464699147691134, + "ewc_loss": 0.03286649286746979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6433246855740435e-05, + "grad_norm": 24.551416397094727, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8431339263916016, + "num_tokens": 223966191.0, + "step": 5868 + }, + { + "epoch": 0.7465971250477038, + "ewc_loss": 0.032905351370573044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.645267548155971e-05, + "grad_norm": 24.633237838745117, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8600873947143555, + "num_tokens": 224001415.0, + "step": 5869 + }, + { + "epoch": 0.7467243353262943, + "ewc_loss": 0.0329444482922554, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6472224160679616e-05, + "grad_norm": 24.668865203857422, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8638837337493896, + "num_tokens": 224042820.0, + "step": 5870 + }, + { + "epoch": 0.7468515456048849, + "ewc_loss": 0.032922327518463135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6461162886116654e-05, + "grad_norm": 24.617420196533203, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8623862862586975, + "num_tokens": 224078411.0, + "step": 5871 + }, + { + "epoch": 0.7469787558834754, + "ewc_loss": 0.032948777079582214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6474388758069836e-05, + "grad_norm": 24.63260269165039, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8646523952484131, + "num_tokens": 224117136.0, + "step": 5872 + }, + { + "epoch": 0.7471059661620659, + "ewc_loss": 0.03292004391551018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.646002237976063e-05, + "grad_norm": 24.627792358398438, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8513678312301636, + "num_tokens": 224155879.0, + "step": 5873 + }, + { + "epoch": 0.7472331764406565, + "ewc_loss": 0.0329832024872303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.649160185479559e-05, + "grad_norm": 24.67575454711914, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8691842555999756, + "num_tokens": 224195186.0, + "step": 5874 + }, + { + "epoch": 0.7473603867192469, + "ewc_loss": 0.03291730582714081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.645865268073976e-05, + "grad_norm": 24.57607078552246, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.85431969165802, + "num_tokens": 224235722.0, + "step": 5875 + }, + { + "epoch": 0.7474875969978374, + "ewc_loss": 0.03290414810180664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6452073396067135e-05, + "grad_norm": 24.643932342529297, + "learning_rate": 1e-06, + "loss": 
0.4332, + "mean_token_accuracy": 0.8659080862998962, + "num_tokens": 224268461.0, + "step": 5876 + }, + { + "epoch": 0.7476148072764279, + "ewc_loss": 0.03296484425663948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6482421415275894e-05, + "grad_norm": 24.558866500854492, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.870620608329773, + "num_tokens": 224302921.0, + "step": 5877 + }, + { + "epoch": 0.7477420175550185, + "ewc_loss": 0.032913003116846085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6456500816275366e-05, + "grad_norm": 24.55338478088379, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8590269088745117, + "num_tokens": 224344081.0, + "step": 5878 + }, + { + "epoch": 0.747869227833609, + "ewc_loss": 0.03295004740357399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6475023585371673e-05, + "grad_norm": 24.617412567138672, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8452659845352173, + "num_tokens": 224384464.0, + "step": 5879 + }, + { + "epoch": 0.7479964381121995, + "ewc_loss": 0.032990943640470505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6495472664246336e-05, + "grad_norm": 24.683042526245117, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8665158152580261, + "num_tokens": 224425542.0, + "step": 5880 + }, + { + "epoch": 0.7481236483907899, + "ewc_loss": 0.0330156646668911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.650783269724343e-05, + "grad_norm": 24.659717559814453, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.874841570854187, + "num_tokens": 224465342.0, + "step": 5881 + }, + { + "epoch": 0.7482508586693805, + "ewc_loss": 0.03293992951512337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6469964975840412e-05, + "grad_norm": 24.626667022705078, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8625076413154602, + "num_tokens": 224504787.0, + "step": 5882 + }, + { + "epoch": 
0.748378068947971, + "ewc_loss": 0.033031973987817764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6515987226739526e-05, + "grad_norm": 24.60100555419922, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8588038682937622, + "num_tokens": 224548534.0, + "step": 5883 + }, + { + "epoch": 0.7485052792265615, + "ewc_loss": 0.03296545147895813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6482725186506286e-05, + "grad_norm": 24.65444564819336, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8501347303390503, + "num_tokens": 224590251.0, + "step": 5884 + }, + { + "epoch": 0.748632489505152, + "ewc_loss": 0.03300464153289795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6502321159350686e-05, + "grad_norm": 24.651893615722656, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8686625361442566, + "num_tokens": 224633745.0, + "step": 5885 + }, + { + "epoch": 0.7487596997837426, + "ewc_loss": 0.03297960385680199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.648980105528608e-05, + "grad_norm": 24.63401985168457, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8425202965736389, + "num_tokens": 224671319.0, + "step": 5886 + }, + { + "epoch": 0.748886910062333, + "ewc_loss": 0.03299670293927193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.649835212447215e-05, + "grad_norm": 24.62838363647461, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8584734201431274, + "num_tokens": 224712670.0, + "step": 5887 + }, + { + "epoch": 0.7490141203409235, + "ewc_loss": 0.032978355884552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6489177141920663e-05, + "grad_norm": 24.688570022583008, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8566792011260986, + "num_tokens": 224747840.0, + "step": 5888 + }, + { + "epoch": 0.749141330619514, + "ewc_loss": 0.03293526545166969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6467633031425066e-05, + 
"grad_norm": 24.553022384643555, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8685015439987183, + "num_tokens": 224783274.0, + "step": 5889 + }, + { + "epoch": 0.7492685408981046, + "ewc_loss": 0.03300285339355469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.650142621656414e-05, + "grad_norm": 24.615407943725586, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8551751971244812, + "num_tokens": 224820456.0, + "step": 5890 + }, + { + "epoch": 0.7493957511766951, + "ewc_loss": 0.03297995775938034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6489979316247627e-05, + "grad_norm": 24.59885025024414, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8630427122116089, + "num_tokens": 224858569.0, + "step": 5891 + }, + { + "epoch": 0.7495229614552856, + "ewc_loss": 0.03297185152769089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6485926607856527e-05, + "grad_norm": 24.55485725402832, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8564013838768005, + "num_tokens": 224901033.0, + "step": 5892 + }, + { + "epoch": 0.749650171733876, + "ewc_loss": 0.033039238303899765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6519619748578407e-05, + "grad_norm": 24.67963218688965, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8733587265014648, + "num_tokens": 224934345.0, + "step": 5893 + }, + { + "epoch": 0.7497773820124666, + "ewc_loss": 0.03300205618143082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6501027857884765e-05, + "grad_norm": 24.604965209960938, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8515160083770752, + "num_tokens": 224972479.0, + "step": 5894 + }, + { + "epoch": 0.7499045922910571, + "ewc_loss": 0.032982975244522095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6491487258463167e-05, + "grad_norm": 24.710338592529297, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 
0.8748195171356201, + "num_tokens": 225014213.0, + "step": 5895 + }, + { + "epoch": 0.7500318025696476, + "ewc_loss": 0.033061571419239044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6530786524526775e-05, + "grad_norm": 24.6046142578125, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.865525484085083, + "num_tokens": 225049929.0, + "step": 5896 + }, + { + "epoch": 0.7501590128482382, + "ewc_loss": 0.03302871063351631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6514355593244545e-05, + "grad_norm": 24.705690383911133, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8433403968811035, + "num_tokens": 225086409.0, + "step": 5897 + }, + { + "epoch": 0.7502862231268287, + "ewc_loss": 0.03306267410516739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.653133767831605e-05, + "grad_norm": 24.688575744628906, + "learning_rate": 1e-06, + "loss": 0.5146, + "mean_token_accuracy": 0.8407478332519531, + "num_tokens": 225124443.0, + "step": 5898 + }, + { + "epoch": 0.7504134334054191, + "ewc_loss": 0.032980237156152725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.64901193784317e-05, + "grad_norm": 24.628032684326172, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8475106954574585, + "num_tokens": 225166665.0, + "step": 5899 + }, + { + "epoch": 0.7505406436840096, + "ewc_loss": 0.03302949666976929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.651474849495571e-05, + "grad_norm": 24.711523056030273, + "learning_rate": 1e-06, + "loss": 0.5353, + "mean_token_accuracy": 0.833838164806366, + "num_tokens": 225203841.0, + "step": 5900 + }, + { + "epoch": 0.7506678539626002, + "ewc_loss": 0.032963547855615616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6481773855048232e-05, + "grad_norm": 24.579200744628906, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.841934323310852, + "num_tokens": 225234844.0, + "step": 5901 + }, + { + "epoch": 0.7507950642411907, + "ewc_loss": 
0.03301343321800232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6506715837749653e-05, + "grad_norm": 24.79962921142578, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8396075963973999, + "num_tokens": 225269828.0, + "step": 5902 + }, + { + "epoch": 0.7509222745197812, + "ewc_loss": 0.03307299315929413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6536496332264505e-05, + "grad_norm": 24.651113510131836, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8626967668533325, + "num_tokens": 225312361.0, + "step": 5903 + }, + { + "epoch": 0.7510494847983717, + "ewc_loss": 0.032953087240457535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6476544260513037e-05, + "grad_norm": 24.627910614013672, + "learning_rate": 1e-06, + "loss": 0.5035, + "mean_token_accuracy": 0.8478355407714844, + "num_tokens": 225353708.0, + "step": 5904 + }, + { + "epoch": 0.7511766950769622, + "ewc_loss": 0.03306511044502258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.653255458222702e-05, + "grad_norm": 24.616806030273438, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8770718574523926, + "num_tokens": 225399730.0, + "step": 5905 + }, + { + "epoch": 0.7513039053555527, + "ewc_loss": 0.03299950808286667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6499754565302283e-05, + "grad_norm": 24.61871337890625, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8519083261489868, + "num_tokens": 225441836.0, + "step": 5906 + }, + { + "epoch": 0.7514311156341432, + "ewc_loss": 0.033046018332242966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6523008525837213e-05, + "grad_norm": 24.63022804260254, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8657814264297485, + "num_tokens": 225471004.0, + "step": 5907 + }, + { + "epoch": 0.7515583259127337, + "ewc_loss": 0.03311452269554138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6557261915295385e-05, + "grad_norm": 
24.748424530029297, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8393751978874207, + "num_tokens": 225505789.0, + "step": 5908 + }, + { + "epoch": 0.7516855361913243, + "ewc_loss": 0.03305203095078468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6526015315321274e-05, + "grad_norm": 24.509435653686523, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8609015941619873, + "num_tokens": 225541138.0, + "step": 5909 + }, + { + "epoch": 0.7518127464699148, + "ewc_loss": 0.03306794539093971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6533973393961787e-05, + "grad_norm": 24.77490234375, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8618456125259399, + "num_tokens": 225581695.0, + "step": 5910 + }, + { + "epoch": 0.7519399567485053, + "ewc_loss": 0.03309785574674606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6548927305848338e-05, + "grad_norm": 24.659225463867188, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8632015585899353, + "num_tokens": 225619532.0, + "step": 5911 + }, + { + "epoch": 0.7520671670270958, + "ewc_loss": 0.03305380046367645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6526899344171397e-05, + "grad_norm": 24.702503204345703, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8503149151802063, + "num_tokens": 225657403.0, + "step": 5912 + }, + { + "epoch": 0.7521943773056863, + "ewc_loss": 0.03305808827280998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6529043932678178e-05, + "grad_norm": 24.603586196899414, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8536860942840576, + "num_tokens": 225693402.0, + "step": 5913 + }, + { + "epoch": 0.7523215875842768, + "ewc_loss": 0.03298570215702057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6492851500515826e-05, + "grad_norm": 24.7213134765625, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8558182716369629, + 
"num_tokens": 225727208.0, + "step": 5914 + }, + { + "epoch": 0.7524487978628673, + "ewc_loss": 0.03310471400618553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6552357919863425e-05, + "grad_norm": 24.542943954467773, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8622314929962158, + "num_tokens": 225762117.0, + "step": 5915 + }, + { + "epoch": 0.7525760081414579, + "ewc_loss": 0.03304021432995796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6520107237738557e-05, + "grad_norm": 24.836854934692383, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8415933847427368, + "num_tokens": 225794737.0, + "step": 5916 + }, + { + "epoch": 0.7527032184200484, + "ewc_loss": 0.03317325934767723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6586629499215633e-05, + "grad_norm": 24.63739776611328, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8570337295532227, + "num_tokens": 225829074.0, + "step": 5917 + }, + { + "epoch": 0.7528304286986388, + "ewc_loss": 0.03302337974309921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.651169077376835e-05, + "grad_norm": 24.66328239440918, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8713128566741943, + "num_tokens": 225871344.0, + "step": 5918 + }, + { + "epoch": 0.7529576389772293, + "ewc_loss": 0.0331314280629158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6565714759053662e-05, + "grad_norm": 24.78948402404785, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8534340858459473, + "num_tokens": 225911321.0, + "step": 5919 + }, + { + "epoch": 0.7530848492558199, + "ewc_loss": 0.0330464243888855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.652321225265041e-05, + "grad_norm": 24.66315269470215, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8671087026596069, + "num_tokens": 225944613.0, + "step": 5920 + }, + { + "epoch": 0.7532120595344104, + "ewc_loss": 0.03298191726207733, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6490957932546735e-05, + "grad_norm": 24.633800506591797, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.863154411315918, + "num_tokens": 225989501.0, + "step": 5921 + }, + { + "epoch": 0.7533392698130009, + "ewc_loss": 0.03312244638800621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6561223674216308e-05, + "grad_norm": 24.72504234313965, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8599403500556946, + "num_tokens": 226024984.0, + "step": 5922 + }, + { + "epoch": 0.7534664800915915, + "ewc_loss": 0.03316200524568558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6581003364990465e-05, + "grad_norm": 24.737993240356445, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8705726861953735, + "num_tokens": 226063255.0, + "step": 5923 + }, + { + "epoch": 0.7535936903701819, + "ewc_loss": 0.03311147168278694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.655573578318581e-05, + "grad_norm": 24.721059799194336, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8459464311599731, + "num_tokens": 226105538.0, + "step": 5924 + }, + { + "epoch": 0.7537209006487724, + "ewc_loss": 0.03308688476681709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6543443052796647e-05, + "grad_norm": 24.7894287109375, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8625327348709106, + "num_tokens": 226137449.0, + "step": 5925 + }, + { + "epoch": 0.7538481109273629, + "ewc_loss": 0.03305706009268761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6528529158676974e-05, + "grad_norm": 24.737689971923828, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8529242277145386, + "num_tokens": 226171078.0, + "step": 5926 + }, + { + "epoch": 0.7539753212059535, + "ewc_loss": 0.03307405114173889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6537025658180937e-05, + "grad_norm": 24.722698211669922, + "learning_rate": 
1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8598954677581787, + "num_tokens": 226208060.0, + "step": 5927 + }, + { + "epoch": 0.754102531484544, + "ewc_loss": 0.03304239735007286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6521198631380685e-05, + "grad_norm": 24.7138671875, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.868208646774292, + "num_tokens": 226245330.0, + "step": 5928 + }, + { + "epoch": 0.7542297417631345, + "ewc_loss": 0.03302621468901634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6513107766513713e-05, + "grad_norm": 24.638099670410156, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.878876268863678, + "num_tokens": 226281974.0, + "step": 5929 + }, + { + "epoch": 0.7543569520417249, + "ewc_loss": 0.03305754065513611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6528771084267646e-05, + "grad_norm": 24.83383560180664, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8473714590072632, + "num_tokens": 226323302.0, + "step": 5930 + }, + { + "epoch": 0.7544841623203155, + "ewc_loss": 0.03310636430978775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.655318192206323e-05, + "grad_norm": 24.70988655090332, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8460015654563904, + "num_tokens": 226361346.0, + "step": 5931 + }, + { + "epoch": 0.754611372598906, + "ewc_loss": 0.03302988409996033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6514941307832487e-05, + "grad_norm": 24.790498733520508, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.854519248008728, + "num_tokens": 226397744.0, + "step": 5932 + }, + { + "epoch": 0.7547385828774965, + "ewc_loss": 0.033110611140728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6555306501686573e-05, + "grad_norm": 24.74091911315918, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8656870126724243, + "num_tokens": 226436745.0, + "step": 5933 + }, + { + 
"epoch": 0.754865793156087, + "ewc_loss": 0.03306412324309349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.653206163609866e-05, + "grad_norm": 24.784162521362305, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8666781187057495, + "num_tokens": 226470302.0, + "step": 5934 + }, + { + "epoch": 0.7549930034346776, + "ewc_loss": 0.033098820596933365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.654941115702968e-05, + "grad_norm": 24.692487716674805, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8603202700614929, + "num_tokens": 226500090.0, + "step": 5935 + }, + { + "epoch": 0.755120213713268, + "ewc_loss": 0.03305887430906296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6529436834389344e-05, + "grad_norm": 24.837512969970703, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8604055047035217, + "num_tokens": 226533359.0, + "step": 5936 + }, + { + "epoch": 0.7552474239918585, + "ewc_loss": 0.033105120062828064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.655255982768722e-05, + "grad_norm": 24.66256332397461, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8479109406471252, + "num_tokens": 226567060.0, + "step": 5937 + }, + { + "epoch": 0.755374634270449, + "ewc_loss": 0.033074211329221725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6537105693714693e-05, + "grad_norm": 24.66008758544922, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8564472198486328, + "num_tokens": 226604270.0, + "step": 5938 + }, + { + "epoch": 0.7555018445490396, + "ewc_loss": 0.03319023922085762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.659511872276198e-05, + "grad_norm": 24.885883331298828, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.856429934501648, + "num_tokens": 226641310.0, + "step": 5939 + }, + { + "epoch": 0.7556290548276301, + "ewc_loss": 0.033149875700473785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.657493703532964e-05, + "grad_norm": 24.721088409423828, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8464596271514893, + "num_tokens": 226678305.0, + "step": 5940 + }, + { + "epoch": 0.7557562651062206, + "ewc_loss": 0.033086229115724564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6543113815714605e-05, + "grad_norm": 24.69942855834961, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8644837141036987, + "num_tokens": 226715628.0, + "step": 5941 + }, + { + "epoch": 0.755883475384811, + "ewc_loss": 0.03315774351358414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.657887150940951e-05, + "grad_norm": 24.66630744934082, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8701213598251343, + "num_tokens": 226755346.0, + "step": 5942 + }, + { + "epoch": 0.7560106856634016, + "ewc_loss": 0.03314944729208946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6574724213569425e-05, + "grad_norm": 24.784086227416992, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.847113847732544, + "num_tokens": 226792243.0, + "step": 5943 + }, + { + "epoch": 0.7561378959419921, + "ewc_loss": 0.03319362923502922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6596814020886086e-05, + "grad_norm": 24.650192260742188, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8726072907447815, + "num_tokens": 226829033.0, + "step": 5944 + }, + { + "epoch": 0.7562651062205826, + "ewc_loss": 0.033137474209070206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6568737919442356e-05, + "grad_norm": 24.70298957824707, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8722924590110779, + "num_tokens": 226870645.0, + "step": 5945 + }, + { + "epoch": 0.7563923164991732, + "ewc_loss": 0.03316282853484154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6581414456595667e-05, + "grad_norm": 24.56937599182129, + "learning_rate": 1e-06, + "loss": 0.4318, + 
"mean_token_accuracy": 0.8623777627944946, + "num_tokens": 226902395.0, + "step": 5946 + }, + { + "epoch": 0.7565195267777637, + "ewc_loss": 0.03322359174489975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.661179521761369e-05, + "grad_norm": 24.755661010742188, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8647803664207458, + "num_tokens": 226942620.0, + "step": 5947 + }, + { + "epoch": 0.7566467370563541, + "ewc_loss": 0.03320750221610069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.660375164647121e-05, + "grad_norm": 24.54758071899414, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8669908046722412, + "num_tokens": 226976472.0, + "step": 5948 + }, + { + "epoch": 0.7567739473349446, + "ewc_loss": 0.03317899629473686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6589498045505024e-05, + "grad_norm": 24.68930435180664, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8485723733901978, + "num_tokens": 227013626.0, + "step": 5949 + }, + { + "epoch": 0.7569011576135352, + "ewc_loss": 0.03322664648294449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6613323168712668e-05, + "grad_norm": 24.615800857543945, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.8457211852073669, + "num_tokens": 227049336.0, + "step": 5950 + }, + { + "epoch": 0.7570283678921257, + "ewc_loss": 0.033250074833631516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6625037460471503e-05, + "grad_norm": 24.60073471069336, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.881879448890686, + "num_tokens": 227087371.0, + "step": 5951 + }, + { + "epoch": 0.7571555781707162, + "ewc_loss": 0.03324054926633835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.662027534621302e-05, + "grad_norm": 24.59417724609375, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8598606586456299, + "num_tokens": 227129188.0, + "step": 5952 + }, + { + "epoch": 
0.7572827884493067, + "ewc_loss": 0.033271536231040955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.663576767896302e-05, + "grad_norm": 24.706649780273438, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8507418036460876, + "num_tokens": 227161807.0, + "step": 5953 + }, + { + "epoch": 0.7574099987278972, + "ewc_loss": 0.03324998915195465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.662499380472582e-05, + "grad_norm": 24.567846298217773, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.847775399684906, + "num_tokens": 227198431.0, + "step": 5954 + }, + { + "epoch": 0.7575372090064877, + "ewc_loss": 0.03322790563106537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6613952539046295e-05, + "grad_norm": 24.640775680541992, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8778940439224243, + "num_tokens": 227236046.0, + "step": 5955 + }, + { + "epoch": 0.7576644192850782, + "ewc_loss": 0.03332908824086189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6664544091327116e-05, + "grad_norm": 24.658031463623047, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8755737543106079, + "num_tokens": 227272925.0, + "step": 5956 + }, + { + "epoch": 0.7577916295636687, + "ewc_loss": 0.03326772525906563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6633863197057508e-05, + "grad_norm": 24.642948150634766, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8629608750343323, + "num_tokens": 227312311.0, + "step": 5957 + }, + { + "epoch": 0.7579188398422593, + "ewc_loss": 0.03330506011843681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6652529666316696e-05, + "grad_norm": 24.73540687561035, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.878638505935669, + "num_tokens": 227349632.0, + "step": 5958 + }, + { + "epoch": 0.7580460501208498, + "ewc_loss": 0.03322060778737068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6610303646302782e-05, + "grad_norm": 24.621633529663086, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8573847413063049, + "num_tokens": 227392982.0, + "step": 5959 + }, + { + "epoch": 0.7581732603994403, + "ewc_loss": 0.03321012109518051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6605061318841763e-05, + "grad_norm": 24.769737243652344, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8557291030883789, + "num_tokens": 227431878.0, + "step": 5960 + }, + { + "epoch": 0.7583004706780307, + "ewc_loss": 0.033274538815021515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.663727016421035e-05, + "grad_norm": 24.660390853881836, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8491370677947998, + "num_tokens": 227470401.0, + "step": 5961 + }, + { + "epoch": 0.7584276809566213, + "ewc_loss": 0.03320489078760147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6602445612079464e-05, + "grad_norm": 24.768030166625977, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8636225461959839, + "num_tokens": 227507189.0, + "step": 5962 + }, + { + "epoch": 0.7585548912352118, + "ewc_loss": 0.03327817842364311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6639089153613895e-05, + "grad_norm": 24.62131690979004, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8624098300933838, + "num_tokens": 227548070.0, + "step": 5963 + }, + { + "epoch": 0.7586821015138023, + "ewc_loss": 0.03316976875066757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6584885088377632e-05, + "grad_norm": 24.727937698364258, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8635202050209045, + "num_tokens": 227584360.0, + "step": 5964 + }, + { + "epoch": 0.7588093117923929, + "ewc_loss": 0.033319614827632904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6659807442920282e-05, + "grad_norm": 24.80638313293457, + "learning_rate": 1e-06, + "loss": 0.4297, + 
"mean_token_accuracy": 0.864581823348999, + "num_tokens": 227618961.0, + "step": 5965 + }, + { + "epoch": 0.7589365220709834, + "ewc_loss": 0.03317004814743996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.65850233315723e-05, + "grad_norm": 24.61135482788086, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8599869608879089, + "num_tokens": 227658304.0, + "step": 5966 + }, + { + "epoch": 0.7590637323495738, + "ewc_loss": 0.03318829461932182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6594147382420488e-05, + "grad_norm": 24.688167572021484, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8574784994125366, + "num_tokens": 227701526.0, + "step": 5967 + }, + { + "epoch": 0.7591909426281643, + "ewc_loss": 0.033215370029211044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6607684301561676e-05, + "grad_norm": 24.692529678344727, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8641157150268555, + "num_tokens": 227744930.0, + "step": 5968 + }, + { + "epoch": 0.7593181529067549, + "ewc_loss": 0.03323889523744583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6619447706034407e-05, + "grad_norm": 24.68918228149414, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8698405623435974, + "num_tokens": 227780607.0, + "step": 5969 + }, + { + "epoch": 0.7594453631853454, + "ewc_loss": 0.03319884464144707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.659942245169077e-05, + "grad_norm": 24.838150024414062, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8560163378715515, + "num_tokens": 227825742.0, + "step": 5970 + }, + { + "epoch": 0.7595725734639359, + "ewc_loss": 0.03323230519890785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6616151697235182e-05, + "grad_norm": 24.634122848510742, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8401365876197815, + "num_tokens": 227865278.0, + "step": 5971 + }, + { + "epoch": 
0.7596997837425264, + "ewc_loss": 0.03312022611498833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.656011227169074e-05, + "grad_norm": 24.797948837280273, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8559743762016296, + "num_tokens": 227906912.0, + "step": 5972 + }, + { + "epoch": 0.7598269940211169, + "ewc_loss": 0.033336956053972244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6668478565406986e-05, + "grad_norm": 24.771121978759766, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8570131063461304, + "num_tokens": 227947067.0, + "step": 5973 + }, + { + "epoch": 0.7599542042997074, + "ewc_loss": 0.033081311732530594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6540656361030415e-05, + "grad_norm": 24.68550682067871, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8415862321853638, + "num_tokens": 227988563.0, + "step": 5974 + }, + { + "epoch": 0.7600814145782979, + "ewc_loss": 0.03321678191423416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6608391888439655e-05, + "grad_norm": 24.773067474365234, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8521014451980591, + "num_tokens": 228028556.0, + "step": 5975 + }, + { + "epoch": 0.7602086248568884, + "ewc_loss": 0.03314073011279106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.657036591495853e-05, + "grad_norm": 24.701683044433594, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.873533308506012, + "num_tokens": 228066265.0, + "step": 5976 + }, + { + "epoch": 0.760335835135479, + "ewc_loss": 0.0331433042883873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6571651940466836e-05, + "grad_norm": 24.62030601501465, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8556662201881409, + "num_tokens": 228103732.0, + "step": 5977 + }, + { + "epoch": 0.7604630454140695, + "ewc_loss": 0.03316779434680939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6583897377131507e-05, + "grad_norm": 24.716829299926758, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8570193648338318, + "num_tokens": 228149609.0, + "step": 5978 + }, + { + "epoch": 0.7605902556926599, + "ewc_loss": 0.03321465849876404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6607329598627985e-05, + "grad_norm": 24.752830505371094, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8623407483100891, + "num_tokens": 228188462.0, + "step": 5979 + }, + { + "epoch": 0.7607174659712505, + "ewc_loss": 0.033167414367198944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6583706383244134e-05, + "grad_norm": 24.687528610229492, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.871729850769043, + "num_tokens": 228229252.0, + "step": 5980 + }, + { + "epoch": 0.760844676249841, + "ewc_loss": 0.0331902913749218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6595146007603034e-05, + "grad_norm": 24.84263801574707, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8560621738433838, + "num_tokens": 228267397.0, + "step": 5981 + }, + { + "epoch": 0.7609718865284315, + "ewc_loss": 0.03312032297253609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6560161384404637e-05, + "grad_norm": 24.723731994628906, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.867037296295166, + "num_tokens": 228307561.0, + "step": 5982 + }, + { + "epoch": 0.761099096807022, + "ewc_loss": 0.033170975744724274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6585487173870206e-05, + "grad_norm": 24.783756256103516, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8609867691993713, + "num_tokens": 228347667.0, + "step": 5983 + }, + { + "epoch": 0.7612263070856126, + "ewc_loss": 0.03314150124788284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.657075154071208e-05, + "grad_norm": 24.663400650024414, + "learning_rate": 1e-06, + "loss": 0.4591, + 
"mean_token_accuracy": 0.8576619625091553, + "num_tokens": 228386820.0, + "step": 5984 + }, + { + "epoch": 0.761353517364203, + "ewc_loss": 0.03313528001308441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6567639249842614e-05, + "grad_norm": 24.832082748413086, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8688077330589294, + "num_tokens": 228427304.0, + "step": 5985 + }, + { + "epoch": 0.7614807276427935, + "ewc_loss": 0.03316803649067879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.658401743043214e-05, + "grad_norm": 24.65628433227539, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8497055768966675, + "num_tokens": 228470779.0, + "step": 5986 + }, + { + "epoch": 0.761607937921384, + "ewc_loss": 0.03306397423148155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6531987057533115e-05, + "grad_norm": 24.75422477722168, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8756589293479919, + "num_tokens": 228500454.0, + "step": 5987 + }, + { + "epoch": 0.7617351481999746, + "ewc_loss": 0.0331878624856472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6593930922681466e-05, + "grad_norm": 24.69634437561035, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.867216944694519, + "num_tokens": 228537374.0, + "step": 5988 + }, + { + "epoch": 0.7618623584785651, + "ewc_loss": 0.03317432850599289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6587164282100275e-05, + "grad_norm": 24.758869171142578, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8600482940673828, + "num_tokens": 228577215.0, + "step": 5989 + }, + { + "epoch": 0.7619895687571556, + "ewc_loss": 0.03313100337982178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6565501937293448e-05, + "grad_norm": 24.699953079223633, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8633030652999878, + "num_tokens": 228611693.0, + "step": 5990 + }, + { + "epoch": 
0.762116779035746, + "ewc_loss": 0.03315184637904167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6575922927586362e-05, + "grad_norm": 24.741575241088867, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8581873178482056, + "num_tokens": 228644231.0, + "step": 5991 + }, + { + "epoch": 0.7622439893143366, + "ewc_loss": 0.03320295736193657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6601477909716778e-05, + "grad_norm": 24.700939178466797, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8702949285507202, + "num_tokens": 228684776.0, + "step": 5992 + }, + { + "epoch": 0.7623711995929271, + "ewc_loss": 0.033216092735528946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.660804628045298e-05, + "grad_norm": 24.757049560546875, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8653762340545654, + "num_tokens": 228724214.0, + "step": 5993 + }, + { + "epoch": 0.7624984098715176, + "ewc_loss": 0.03317040205001831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.658520159253385e-05, + "grad_norm": 24.749900817871094, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8586649894714355, + "num_tokens": 228765098.0, + "step": 5994 + }, + { + "epoch": 0.7626256201501082, + "ewc_loss": 0.03312748298048973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6563741155550815e-05, + "grad_norm": 24.73206329345703, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.86054927110672, + "num_tokens": 228802423.0, + "step": 5995 + }, + { + "epoch": 0.7627528304286987, + "ewc_loss": 0.033175867050886154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6587933714617975e-05, + "grad_norm": 24.791501998901367, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8490526676177979, + "num_tokens": 228840218.0, + "step": 5996 + }, + { + "epoch": 0.7628800407072891, + "ewc_loss": 0.033178847283124924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.658942346693948e-05, + "grad_norm": 24.772369384765625, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8652352094650269, + "num_tokens": 228882810.0, + "step": 5997 + }, + { + "epoch": 0.7630072509858796, + "ewc_loss": 0.03317129611968994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6585647244937718e-05, + "grad_norm": 24.818809509277344, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.866734504699707, + "num_tokens": 228917664.0, + "step": 5998 + }, + { + "epoch": 0.7631344612644702, + "ewc_loss": 0.033175479620695114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6587739082751796e-05, + "grad_norm": 24.785747528076172, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8658512830734253, + "num_tokens": 228951017.0, + "step": 5999 + }, + { + "epoch": 0.7632616715430607, + "ewc_loss": 0.0332132950425148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6606647477601655e-05, + "grad_norm": 24.784337997436523, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8617266416549683, + "num_tokens": 228990949.0, + "step": 6000 + }, + { + "epoch": 0.7633888818216512, + "ewc_loss": 0.033202409744262695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6601205061306246e-05, + "grad_norm": 24.680334091186523, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8643734455108643, + "num_tokens": 229026604.0, + "step": 6001 + }, + { + "epoch": 0.7635160921002417, + "ewc_loss": 0.033215124160051346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.660756242927164e-05, + "grad_norm": 24.836030960083008, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.852910041809082, + "num_tokens": 229061810.0, + "step": 6002 + }, + { + "epoch": 0.7636433023788322, + "ewc_loss": 0.03321341797709465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6606709323241375e-05, + "grad_norm": 24.753019332885742, + "learning_rate": 1e-06, + "loss": 0.5182, + 
"mean_token_accuracy": 0.8370587825775146, + "num_tokens": 229107765.0, + "step": 6003 + }, + { + "epoch": 0.7637705126574227, + "ewc_loss": 0.03316473588347435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6582367607043125e-05, + "grad_norm": 24.81647300720215, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8475222587585449, + "num_tokens": 229147822.0, + "step": 6004 + }, + { + "epoch": 0.7638977229360132, + "ewc_loss": 0.03324847295880318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6624237105133943e-05, + "grad_norm": 24.749910354614258, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8695870637893677, + "num_tokens": 229184421.0, + "step": 6005 + }, + { + "epoch": 0.7640249332146037, + "ewc_loss": 0.033119168132543564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6559584764763713e-05, + "grad_norm": 24.75667953491211, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8597099781036377, + "num_tokens": 229221737.0, + "step": 6006 + }, + { + "epoch": 0.7641521434931943, + "ewc_loss": 0.03324297070503235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.662148497416638e-05, + "grad_norm": 24.768449783325195, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8534246683120728, + "num_tokens": 229254727.0, + "step": 6007 + }, + { + "epoch": 0.7642793537717848, + "ewc_loss": 0.033127717673778534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6563859389862046e-05, + "grad_norm": 24.833520889282227, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.870023787021637, + "num_tokens": 229292648.0, + "step": 6008 + }, + { + "epoch": 0.7644065640503753, + "ewc_loss": 0.03318791091442108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6593956388533115e-05, + "grad_norm": 24.650386810302734, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8638955950737, + "num_tokens": 229335708.0, + "step": 6009 + }, + { + "epoch": 
0.7645337743289657, + "ewc_loss": 0.03318806737661362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6594034605077468e-05, + "grad_norm": 24.8258056640625, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8682215809822083, + "num_tokens": 229373791.0, + "step": 6010 + }, + { + "epoch": 0.7646609846075563, + "ewc_loss": 0.03321860358119011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6609301383141428e-05, + "grad_norm": 24.592655181884766, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8693192601203918, + "num_tokens": 229409891.0, + "step": 6011 + }, + { + "epoch": 0.7647881948861468, + "ewc_loss": 0.03325588256120682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6627940567559563e-05, + "grad_norm": 24.844831466674805, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8622172474861145, + "num_tokens": 229449859.0, + "step": 6012 + }, + { + "epoch": 0.7649154051647373, + "ewc_loss": 0.03325357660651207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6626789147267118e-05, + "grad_norm": 24.679515838623047, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8666150569915771, + "num_tokens": 229484224.0, + "step": 6013 + }, + { + "epoch": 0.7650426154433279, + "ewc_loss": 0.03317603841423988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6588019207119942e-05, + "grad_norm": 24.76896095275879, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8561633825302124, + "num_tokens": 229521104.0, + "step": 6014 + }, + { + "epoch": 0.7651698257219184, + "ewc_loss": 0.033228710293769836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.661435453570448e-05, + "grad_norm": 24.731714248657227, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8445753455162048, + "num_tokens": 229558918.0, + "step": 6015 + }, + { + "epoch": 0.7652970360005088, + "ewc_loss": 0.03325658291578293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6628291632514447e-05, + "grad_norm": 24.88456916809082, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8474974632263184, + "num_tokens": 229596931.0, + "step": 6016 + }, + { + "epoch": 0.7654242462790993, + "ewc_loss": 0.033257681876420975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6628840967314318e-05, + "grad_norm": 24.697351455688477, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8449829816818237, + "num_tokens": 229633903.0, + "step": 6017 + }, + { + "epoch": 0.7655514565576899, + "ewc_loss": 0.033232226967811584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6616113498457707e-05, + "grad_norm": 24.906597137451172, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8565511703491211, + "num_tokens": 229674224.0, + "step": 6018 + }, + { + "epoch": 0.7656786668362804, + "ewc_loss": 0.033234696835279465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6617348592262715e-05, + "grad_norm": 24.62981605529785, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.857886791229248, + "num_tokens": 229710547.0, + "step": 6019 + }, + { + "epoch": 0.7658058771148709, + "ewc_loss": 0.03324735164642334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6623675037408248e-05, + "grad_norm": 24.796232223510742, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8749768137931824, + "num_tokens": 229752020.0, + "step": 6020 + }, + { + "epoch": 0.7659330873934614, + "ewc_loss": 0.03331681340932846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6658406821079552e-05, + "grad_norm": 24.621917724609375, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8581230640411377, + "num_tokens": 229785710.0, + "step": 6021 + }, + { + "epoch": 0.7660602976720519, + "ewc_loss": 0.03325481340289116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6627407603664324e-05, + "grad_norm": 24.772933959960938, + "learning_rate": 1e-06, + "loss": 0.4205, + 
"mean_token_accuracy": 0.866719663143158, + "num_tokens": 229824118.0, + "step": 6022 + }, + { + "epoch": 0.7661875079506424, + "ewc_loss": 0.03338498994708061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6692494682502e-05, + "grad_norm": 24.764686584472656, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8710511922836304, + "num_tokens": 229854231.0, + "step": 6023 + }, + { + "epoch": 0.7663147182292329, + "ewc_loss": 0.03329533711075783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.664766932663042e-05, + "grad_norm": 24.663070678710938, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8500095009803772, + "num_tokens": 229888102.0, + "step": 6024 + }, + { + "epoch": 0.7664419285078234, + "ewc_loss": 0.03326240926980972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6631203834549524e-05, + "grad_norm": 24.734325408935547, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8635642528533936, + "num_tokens": 229927634.0, + "step": 6025 + }, + { + "epoch": 0.766569138786414, + "ewc_loss": 0.033367037773132324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6683519788784906e-05, + "grad_norm": 24.794286727905273, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8677830696105957, + "num_tokens": 229966526.0, + "step": 6026 + }, + { + "epoch": 0.7666963490650045, + "ewc_loss": 0.0333072654902935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6653631973895244e-05, + "grad_norm": 24.644853591918945, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8696171045303345, + "num_tokens": 230002761.0, + "step": 6027 + }, + { + "epoch": 0.7668235593435949, + "ewc_loss": 0.03332574665546417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6662872440065257e-05, + "grad_norm": 24.810930252075195, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8658707141876221, + "num_tokens": 230041303.0, + "step": 6028 + }, + { + "epoch": 
0.7669507696221854, + "ewc_loss": 0.03335678204894066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6678390238666907e-05, + "grad_norm": 24.627803802490234, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8637739419937134, + "num_tokens": 230081117.0, + "step": 6029 + }, + { + "epoch": 0.767077979900776, + "ewc_loss": 0.03327494114637375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.663747025304474e-05, + "grad_norm": 24.734580993652344, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8539899587631226, + "num_tokens": 230119411.0, + "step": 6030 + }, + { + "epoch": 0.7672051901793665, + "ewc_loss": 0.03340144455432892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.670072197157424e-05, + "grad_norm": 24.82516098022461, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8674160838127136, + "num_tokens": 230161111.0, + "step": 6031 + }, + { + "epoch": 0.767332400457957, + "ewc_loss": 0.033272240310907364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.663612056290731e-05, + "grad_norm": 24.692291259765625, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8557115793228149, + "num_tokens": 230196999.0, + "step": 6032 + }, + { + "epoch": 0.7674596107365476, + "ewc_loss": 0.03333978354930878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.666989192017354e-05, + "grad_norm": 24.814422607421875, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8626066446304321, + "num_tokens": 230231495.0, + "step": 6033 + }, + { + "epoch": 0.767586821015138, + "ewc_loss": 0.03332354500889778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6661771951476112e-05, + "grad_norm": 24.6745662689209, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8534946441650391, + "num_tokens": 230267732.0, + "step": 6034 + }, + { + "epoch": 0.7677140312937285, + "ewc_loss": 0.03326727822422981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6633639461360872e-05, 
+ "grad_norm": 24.732332229614258, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8596935272216797, + "num_tokens": 230297559.0, + "step": 6035 + }, + { + "epoch": 0.767841241572319, + "ewc_loss": 0.03339141607284546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6695708836778067e-05, + "grad_norm": 24.831443786621094, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8478521108627319, + "num_tokens": 230336396.0, + "step": 6036 + }, + { + "epoch": 0.7679684518509096, + "ewc_loss": 0.03330978378653526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.66548925335519e-05, + "grad_norm": 24.69192886352539, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8602179884910583, + "num_tokens": 230368152.0, + "step": 6037 + }, + { + "epoch": 0.7680956621295001, + "ewc_loss": 0.03335515409708023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6677577150403522e-05, + "grad_norm": 24.926149368286133, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8658727407455444, + "num_tokens": 230408081.0, + "step": 6038 + }, + { + "epoch": 0.7682228724080906, + "ewc_loss": 0.03338421508669853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6692107237759046e-05, + "grad_norm": 24.7576847076416, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8526405692100525, + "num_tokens": 230441691.0, + "step": 6039 + }, + { + "epoch": 0.768350082686681, + "ewc_loss": 0.03327122703194618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.663561306486372e-05, + "grad_norm": 24.78066062927246, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8577007055282593, + "num_tokens": 230480089.0, + "step": 6040 + }, + { + "epoch": 0.7684772929652716, + "ewc_loss": 0.03331301733851433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.665650779614225e-05, + "grad_norm": 24.829570770263672, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8539356589317322, + 
"num_tokens": 230518655.0, + "step": 6041 + }, + { + "epoch": 0.7686045032438621, + "ewc_loss": 0.03332621604204178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6663107089698315e-05, + "grad_norm": 24.684532165527344, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8626589775085449, + "num_tokens": 230555717.0, + "step": 6042 + }, + { + "epoch": 0.7687317135224526, + "ewc_loss": 0.03331482410430908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6657411833875813e-05, + "grad_norm": 24.888912200927734, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8694431781768799, + "num_tokens": 230596699.0, + "step": 6043 + }, + { + "epoch": 0.7688589238010431, + "ewc_loss": 0.03340154513716698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.670077290327754e-05, + "grad_norm": 24.731115341186523, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8685457706451416, + "num_tokens": 230633284.0, + "step": 6044 + }, + { + "epoch": 0.7689861340796337, + "ewc_loss": 0.033269207924604416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.663460352574475e-05, + "grad_norm": 24.706199645996094, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8641266822814941, + "num_tokens": 230673133.0, + "step": 6045 + }, + { + "epoch": 0.7691133443582241, + "ewc_loss": 0.033364228904247284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6682113709975965e-05, + "grad_norm": 24.754728317260742, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8608722686767578, + "num_tokens": 230719003.0, + "step": 6046 + }, + { + "epoch": 0.7692405546368146, + "ewc_loss": 0.033345386385917664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6672693163855e-05, + "grad_norm": 24.78627586364746, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8699334263801575, + "num_tokens": 230754805.0, + "step": 6047 + }, + { + "epoch": 0.7693677649154052, + "ewc_loss": 0.033293891698122025, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.664694536884781e-05, + "grad_norm": 24.74983024597168, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8477374315261841, + "num_tokens": 230793249.0, + "step": 6048 + }, + { + "epoch": 0.7694949751939957, + "ewc_loss": 0.03331052139401436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6655259969411418e-05, + "grad_norm": 24.768354415893555, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8776484727859497, + "num_tokens": 230826025.0, + "step": 6049 + }, + { + "epoch": 0.7696221854725862, + "ewc_loss": 0.03332235664129257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6661178960930556e-05, + "grad_norm": 24.77330780029297, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8720513582229614, + "num_tokens": 230866615.0, + "step": 6050 + }, + { + "epoch": 0.7697493957511767, + "ewc_loss": 0.03329843655228615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6649219105602242e-05, + "grad_norm": 24.8316707611084, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.8447453379631042, + "num_tokens": 230901587.0, + "step": 6051 + }, + { + "epoch": 0.7698766060297672, + "ewc_loss": 0.03329382836818695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.664691444602795e-05, + "grad_norm": 24.620792388916016, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8662424683570862, + "num_tokens": 230944267.0, + "step": 6052 + }, + { + "epoch": 0.7700038163083577, + "ewc_loss": 0.033303599804639816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6651800251565874e-05, + "grad_norm": 24.8243350982666, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8512239456176758, + "num_tokens": 230980022.0, + "step": 6053 + }, + { + "epoch": 0.7701310265869482, + "ewc_loss": 0.0333535335958004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6676767700118944e-05, + "grad_norm": 24.755146026611328, + "learning_rate": 
1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8667173385620117, + "num_tokens": 231018110.0, + "step": 6054 + }, + { + "epoch": 0.7702582368655387, + "ewc_loss": 0.03330611810088158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6653058992233127e-05, + "grad_norm": 24.726314544677734, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.865231990814209, + "num_tokens": 231056531.0, + "step": 6055 + }, + { + "epoch": 0.7703854471441293, + "ewc_loss": 0.03336023911833763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.668012009758968e-05, + "grad_norm": 24.777578353881836, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8649827241897583, + "num_tokens": 231096270.0, + "step": 6056 + }, + { + "epoch": 0.7705126574227198, + "ewc_loss": 0.033277977257966995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.66389891091967e-05, + "grad_norm": 24.666589736938477, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8537525534629822, + "num_tokens": 231136300.0, + "step": 6057 + }, + { + "epoch": 0.7706398677013102, + "ewc_loss": 0.03338243439793587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6691217751940712e-05, + "grad_norm": 24.81651496887207, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8471006751060486, + "num_tokens": 231169005.0, + "step": 6058 + }, + { + "epoch": 0.7707670779799007, + "ewc_loss": 0.033310506492853165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6655252693453804e-05, + "grad_norm": 24.650423049926758, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8677573800086975, + "num_tokens": 231205274.0, + "step": 6059 + }, + { + "epoch": 0.7708942882584913, + "ewc_loss": 0.033361922949552536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.668096228968352e-05, + "grad_norm": 24.767221450805664, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8656898736953735, + "num_tokens": 231239022.0, + "step": 6060 + }, 
+ { + "epoch": 0.7710214985370818, + "ewc_loss": 0.03337714076042175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6688571122358553e-05, + "grad_norm": 24.704185485839844, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8420112729072571, + "num_tokens": 231282952.0, + "step": 6061 + }, + { + "epoch": 0.7711487088156723, + "ewc_loss": 0.033407725393772125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6703863366274163e-05, + "grad_norm": 24.757610321044922, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8576841950416565, + "num_tokens": 231322980.0, + "step": 6062 + }, + { + "epoch": 0.7712759190942629, + "ewc_loss": 0.0333629846572876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6681491615599953e-05, + "grad_norm": 24.725046157836914, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8522567749023438, + "num_tokens": 231358382.0, + "step": 6063 + }, + { + "epoch": 0.7714031293728534, + "ewc_loss": 0.03339703008532524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.669851553742774e-05, + "grad_norm": 24.660480499267578, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8684844970703125, + "num_tokens": 231401915.0, + "step": 6064 + }, + { + "epoch": 0.7715303396514438, + "ewc_loss": 0.03344475477933884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6722377040423453e-05, + "grad_norm": 24.84589195251465, + "learning_rate": 1e-06, + "loss": 0.5085, + "mean_token_accuracy": 0.8383759260177612, + "num_tokens": 231433241.0, + "step": 6065 + }, + { + "epoch": 0.7716575499300343, + "ewc_loss": 0.03348682448267937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6743411833886057e-05, + "grad_norm": 24.706071853637695, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8542751669883728, + "num_tokens": 231478318.0, + "step": 6066 + }, + { + "epoch": 0.7717847602086249, + "ewc_loss": 0.03340877220034599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.670438541623298e-05, + "grad_norm": 24.794055938720703, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8529785871505737, + "num_tokens": 231518584.0, + "step": 6067 + }, + { + "epoch": 0.7719119704872154, + "ewc_loss": 0.03343578055500984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6717889593564905e-05, + "grad_norm": 24.649200439453125, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8595771789550781, + "num_tokens": 231556689.0, + "step": 6068 + }, + { + "epoch": 0.7720391807658059, + "ewc_loss": 0.03346385434269905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.673192673479207e-05, + "grad_norm": 24.869176864624023, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8589205741882324, + "num_tokens": 231593936.0, + "step": 6069 + }, + { + "epoch": 0.7721663910443964, + "ewc_loss": 0.03350018709897995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.675009298196528e-05, + "grad_norm": 24.70064926147461, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8782329559326172, + "num_tokens": 231634785.0, + "step": 6070 + }, + { + "epoch": 0.7722936013229869, + "ewc_loss": 0.033431779593229294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.671589052421041e-05, + "grad_norm": 24.873977661132812, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8548321723937988, + "num_tokens": 231675798.0, + "step": 6071 + }, + { + "epoch": 0.7724208116015774, + "ewc_loss": 0.03346829116344452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6734145901864395e-05, + "grad_norm": 24.83824920654297, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8569892048835754, + "num_tokens": 231711611.0, + "step": 6072 + }, + { + "epoch": 0.7725480218801679, + "ewc_loss": 0.033365242183208466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6682621208019555e-05, + "grad_norm": 24.698684692382812, + "learning_rate": 1e-06, + "loss": 0.479, + 
"mean_token_accuracy": 0.849834680557251, + "num_tokens": 231746997.0, + "step": 6073 + }, + { + "epoch": 0.7726752321587584, + "ewc_loss": 0.03342384845018387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6711923308321275e-05, + "grad_norm": 24.83633804321289, + "learning_rate": 1e-06, + "loss": 0.508, + "mean_token_accuracy": 0.8394825458526611, + "num_tokens": 231787696.0, + "step": 6074 + }, + { + "epoch": 0.772802442437349, + "ewc_loss": 0.03343888744711876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6719443010515533e-05, + "grad_norm": 24.67576789855957, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8508439064025879, + "num_tokens": 231826127.0, + "step": 6075 + }, + { + "epoch": 0.7729296527159395, + "ewc_loss": 0.03339909389615059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.669954690441955e-05, + "grad_norm": 24.789838790893555, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8538868427276611, + "num_tokens": 231857538.0, + "step": 6076 + }, + { + "epoch": 0.7730568629945299, + "ewc_loss": 0.03347296267747879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6736481484258547e-05, + "grad_norm": 24.671573638916016, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8570404052734375, + "num_tokens": 231893913.0, + "step": 6077 + }, + { + "epoch": 0.7731840732731204, + "ewc_loss": 0.033445682376623154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6722840882721357e-05, + "grad_norm": 24.836904525756836, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8620061874389648, + "num_tokens": 231934274.0, + "step": 6078 + }, + { + "epoch": 0.773311283551711, + "ewc_loss": 0.03349091112613678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6745456377975643e-05, + "grad_norm": 24.789894104003906, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8530323505401611, + "num_tokens": 231983197.0, + "step": 6079 + }, + { + "epoch": 
0.7734384938303015, + "ewc_loss": 0.033491071313619614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.67455364135094e-05, + "grad_norm": 24.85342025756836, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8555572032928467, + "num_tokens": 232021265.0, + "step": 6080 + }, + { + "epoch": 0.773565704108892, + "ewc_loss": 0.033461783081293106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.673089172982145e-05, + "grad_norm": 24.86564064025879, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8572540283203125, + "num_tokens": 232060992.0, + "step": 6081 + }, + { + "epoch": 0.7736929143874826, + "ewc_loss": 0.033403825014829636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.670191159064416e-05, + "grad_norm": 24.781879425048828, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8572924137115479, + "num_tokens": 232096758.0, + "step": 6082 + }, + { + "epoch": 0.773820124666073, + "ewc_loss": 0.03340863808989525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6704319932614453e-05, + "grad_norm": 24.72573471069336, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8559795618057251, + "num_tokens": 232133773.0, + "step": 6083 + }, + { + "epoch": 0.7739473349446635, + "ewc_loss": 0.03344190493226051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6720952771720476e-05, + "grad_norm": 24.77315330505371, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.857329785823822, + "num_tokens": 232173182.0, + "step": 6084 + }, + { + "epoch": 0.774074545223254, + "ewc_loss": 0.03343503177165985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.671751670073718e-05, + "grad_norm": 24.66628646850586, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8675951957702637, + "num_tokens": 232206478.0, + "step": 6085 + }, + { + "epoch": 0.7742017555018446, + "ewc_loss": 0.033428505063056946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6714251614757814e-05, 
+ "grad_norm": 24.746017456054688, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.853920578956604, + "num_tokens": 232242231.0, + "step": 6086 + }, + { + "epoch": 0.7743289657804351, + "ewc_loss": 0.033507153391838074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.675357634667307e-05, + "grad_norm": 24.78969955444336, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.862378716468811, + "num_tokens": 232270636.0, + "step": 6087 + }, + { + "epoch": 0.7744561760590256, + "ewc_loss": 0.03346211090683937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.673105543886777e-05, + "grad_norm": 24.773862838745117, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8651896119117737, + "num_tokens": 232309974.0, + "step": 6088 + }, + { + "epoch": 0.774583386337616, + "ewc_loss": 0.033397696912288666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6698848412488587e-05, + "grad_norm": 24.74489974975586, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.867375910282135, + "num_tokens": 232344118.0, + "step": 6089 + }, + { + "epoch": 0.7747105966162066, + "ewc_loss": 0.033507831394672394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6753916497691534e-05, + "grad_norm": 24.91891098022461, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8489857912063599, + "num_tokens": 232385183.0, + "step": 6090 + }, + { + "epoch": 0.7748378068947971, + "ewc_loss": 0.03345949202775955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6729745766497217e-05, + "grad_norm": 24.77815818786621, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8606618642807007, + "num_tokens": 232421670.0, + "step": 6091 + }, + { + "epoch": 0.7749650171733876, + "ewc_loss": 0.033445172011852264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.672258622420486e-05, + "grad_norm": 24.809221267700195, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8469693064689636, 
+ "num_tokens": 232459563.0, + "step": 6092 + }, + { + "epoch": 0.7750922274519781, + "ewc_loss": 0.033467382192611694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6733691154513508e-05, + "grad_norm": 24.682918548583984, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8601023554801941, + "num_tokens": 232496357.0, + "step": 6093 + }, + { + "epoch": 0.7752194377305687, + "ewc_loss": 0.033468879759311676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.673444057814777e-05, + "grad_norm": 24.771556854248047, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8589462637901306, + "num_tokens": 232530436.0, + "step": 6094 + }, + { + "epoch": 0.7753466480091591, + "ewc_loss": 0.03351641446352005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.67582074936945e-05, + "grad_norm": 24.751239776611328, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8522161245346069, + "num_tokens": 232568585.0, + "step": 6095 + }, + { + "epoch": 0.7754738582877496, + "ewc_loss": 0.03339162468910217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.669581251917407e-05, + "grad_norm": 24.610692977905273, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8664522171020508, + "num_tokens": 232602506.0, + "step": 6096 + }, + { + "epoch": 0.7756010685663401, + "ewc_loss": 0.033558547496795654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6779273209976964e-05, + "grad_norm": 24.84477996826172, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8669030666351318, + "num_tokens": 232642819.0, + "step": 6097 + }, + { + "epoch": 0.7757282788449307, + "ewc_loss": 0.03359734266996384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6798670912976377e-05, + "grad_norm": 24.685623168945312, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8480223417282104, + "num_tokens": 232679505.0, + "step": 6098 + }, + { + "epoch": 0.7758554891235212, + "ewc_loss": 0.03345615044236183, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6728075934224762e-05, + "grad_norm": 24.80063247680664, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8641204237937927, + "num_tokens": 232715701.0, + "step": 6099 + }, + { + "epoch": 0.7759826994021117, + "ewc_loss": 0.03358874097466469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6794370822026394e-05, + "grad_norm": 24.792417526245117, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8602848052978516, + "num_tokens": 232750786.0, + "step": 6100 + }, + { + "epoch": 0.7761099096807021, + "ewc_loss": 0.03348727896809578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.67436392075615e-05, + "grad_norm": 24.81291389465332, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8484910130500793, + "num_tokens": 232791370.0, + "step": 6101 + }, + { + "epoch": 0.7762371199592927, + "ewc_loss": 0.033458683639764786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.672934195084963e-05, + "grad_norm": 24.81978416442871, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8607115149497986, + "num_tokens": 232830895.0, + "step": 6102 + }, + { + "epoch": 0.7763643302378832, + "ewc_loss": 0.033484309911727905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6742154912208207e-05, + "grad_norm": 24.838592529296875, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.858906090259552, + "num_tokens": 232869940.0, + "step": 6103 + }, + { + "epoch": 0.7764915405164737, + "ewc_loss": 0.03350122645497322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6750613212934695e-05, + "grad_norm": 24.932655334472656, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8558014631271362, + "num_tokens": 232910743.0, + "step": 6104 + }, + { + "epoch": 0.7766187507950643, + "ewc_loss": 0.033420562744140625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6710280760889873e-05, + "grad_norm": 24.71727180480957, + 
"learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8595998287200928, + "num_tokens": 232948428.0, + "step": 6105 + }, + { + "epoch": 0.7767459610736548, + "ewc_loss": 0.03339528292417526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6697640603524633e-05, + "grad_norm": 24.808231353759766, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8644823431968689, + "num_tokens": 232985142.0, + "step": 6106 + }, + { + "epoch": 0.7768731713522452, + "ewc_loss": 0.03348608687520027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6743042579037137e-05, + "grad_norm": 24.80490493774414, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8694010972976685, + "num_tokens": 233020810.0, + "step": 6107 + }, + { + "epoch": 0.7770003816308357, + "ewc_loss": 0.03346753492951393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6733767552068457e-05, + "grad_norm": 24.784650802612305, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8511897921562195, + "num_tokens": 233055674.0, + "step": 6108 + }, + { + "epoch": 0.7771275919094263, + "ewc_loss": 0.033516108989715576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.67580546985846e-05, + "grad_norm": 24.808027267456055, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8540018200874329, + "num_tokens": 233084506.0, + "step": 6109 + }, + { + "epoch": 0.7772548021880168, + "ewc_loss": 0.03353673592209816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6768368368502706e-05, + "grad_norm": 24.70361328125, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8759067058563232, + "num_tokens": 233120172.0, + "step": 6110 + }, + { + "epoch": 0.7773820124666073, + "ewc_loss": 0.03345382958650589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.67269154189853e-05, + "grad_norm": 24.754898071289062, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8494740724563599, + "num_tokens": 233160075.0, + 
"step": 6111 + }, + { + "epoch": 0.7775092227451978, + "ewc_loss": 0.03352338820695877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6761694496381097e-05, + "grad_norm": 24.67688751220703, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8520931601524353, + "num_tokens": 233201714.0, + "step": 6112 + }, + { + "epoch": 0.7776364330237884, + "ewc_loss": 0.033512476831674576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.675623752817046e-05, + "grad_norm": 24.861360549926758, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.865289568901062, + "num_tokens": 233237642.0, + "step": 6113 + }, + { + "epoch": 0.7777636433023788, + "ewc_loss": 0.033534325659275055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6767162378528155e-05, + "grad_norm": 24.750465393066406, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8644180297851562, + "num_tokens": 233279712.0, + "step": 6114 + }, + { + "epoch": 0.7778908535809693, + "ewc_loss": 0.03352157399058342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6760786820668727e-05, + "grad_norm": 24.827449798583984, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8644657731056213, + "num_tokens": 233322441.0, + "step": 6115 + }, + { + "epoch": 0.7780180638595598, + "ewc_loss": 0.03358832001686096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6794159819255583e-05, + "grad_norm": 24.81601333618164, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.853421688079834, + "num_tokens": 233371053.0, + "step": 6116 + }, + { + "epoch": 0.7781452741381504, + "ewc_loss": 0.0335223563015461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.676117790339049e-05, + "grad_norm": 24.80582618713379, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8605371713638306, + "num_tokens": 233405372.0, + "step": 6117 + }, + { + "epoch": 0.7782724844167409, + "ewc_loss": 0.03356287628412247, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6781437807367183e-05, + "grad_norm": 24.897640228271484, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8664355278015137, + "num_tokens": 233447209.0, + "step": 6118 + }, + { + "epoch": 0.7783996946953314, + "ewc_loss": 0.033537350594997406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6768675777711906e-05, + "grad_norm": 24.727222442626953, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8621498346328735, + "num_tokens": 233484621.0, + "step": 6119 + }, + { + "epoch": 0.7785269049739219, + "ewc_loss": 0.03352481126785278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6762405721237883e-05, + "grad_norm": 24.936302185058594, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8428205251693726, + "num_tokens": 233520756.0, + "step": 6120 + }, + { + "epoch": 0.7786541152525124, + "ewc_loss": 0.03354611620306969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6773057723185048e-05, + "grad_norm": 24.848628997802734, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8517991304397583, + "num_tokens": 233559120.0, + "step": 6121 + }, + { + "epoch": 0.7787813255311029, + "ewc_loss": 0.033384356647729874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6692178178345785e-05, + "grad_norm": 24.66843032836914, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8700183629989624, + "num_tokens": 233590513.0, + "step": 6122 + }, + { + "epoch": 0.7789085358096934, + "ewc_loss": 0.033516719937324524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6758360288804397e-05, + "grad_norm": 24.92194938659668, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8758590221405029, + "num_tokens": 233625986.0, + "step": 6123 + }, + { + "epoch": 0.779035746088284, + "ewc_loss": 0.03353697806596756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.676848842180334e-05, + "grad_norm": 24.833288192749023, + "learning_rate": 1e-06, + "loss": 
0.4143, + "mean_token_accuracy": 0.8687589168548584, + "num_tokens": 233655097.0, + "step": 6124 + }, + { + "epoch": 0.7791629563668745, + "ewc_loss": 0.033457428216934204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6728714399505407e-05, + "grad_norm": 24.800966262817383, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8487218022346497, + "num_tokens": 233687815.0, + "step": 6125 + }, + { + "epoch": 0.7792901666454649, + "ewc_loss": 0.03353944793343544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6769723515608348e-05, + "grad_norm": 24.93558692932129, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.873078465461731, + "num_tokens": 233722611.0, + "step": 6126 + }, + { + "epoch": 0.7794173769240554, + "ewc_loss": 0.03350945934653282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.675472958595492e-05, + "grad_norm": 24.752595901489258, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8470801115036011, + "num_tokens": 233765345.0, + "step": 6127 + }, + { + "epoch": 0.779544587202646, + "ewc_loss": 0.0334533154964447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6726657122489996e-05, + "grad_norm": 24.814558029174805, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8604584336280823, + "num_tokens": 233804358.0, + "step": 6128 + }, + { + "epoch": 0.7796717974812365, + "ewc_loss": 0.03361978381872177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680989225860685e-05, + "grad_norm": 24.944862365722656, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8480979204177856, + "num_tokens": 233845117.0, + "step": 6129 + }, + { + "epoch": 0.779799007759827, + "ewc_loss": 0.033542051911354065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6771025912021287e-05, + "grad_norm": 24.77254867553711, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8513745665550232, + "num_tokens": 233884091.0, + "step": 6130 + }, + { + "epoch": 
0.7799262180384176, + "ewc_loss": 0.033530622720718384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6765310647315346e-05, + "grad_norm": 24.799358367919922, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8573436737060547, + "num_tokens": 233922590.0, + "step": 6131 + }, + { + "epoch": 0.780053428317008, + "ewc_loss": 0.03357475996017456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6787380445748568e-05, + "grad_norm": 24.843761444091797, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8620136380195618, + "num_tokens": 233954138.0, + "step": 6132 + }, + { + "epoch": 0.7801806385955985, + "ewc_loss": 0.03358868509531021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6794341718195938e-05, + "grad_norm": 24.757843017578125, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8690646290779114, + "num_tokens": 233988035.0, + "step": 6133 + }, + { + "epoch": 0.780307848874189, + "ewc_loss": 0.03356599435210228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6782996681286022e-05, + "grad_norm": 24.828798294067383, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8500866293907166, + "num_tokens": 234023747.0, + "step": 6134 + }, + { + "epoch": 0.7804350591527796, + "ewc_loss": 0.033616337925195694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6808169675641693e-05, + "grad_norm": 24.639318466186523, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.867719829082489, + "num_tokens": 234064182.0, + "step": 6135 + }, + { + "epoch": 0.7805622694313701, + "ewc_loss": 0.03360133245587349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6800666344352067e-05, + "grad_norm": 24.791994094848633, + "learning_rate": 1e-06, + "loss": 0.5013, + "mean_token_accuracy": 0.8424757122993469, + "num_tokens": 234106533.0, + "step": 6136 + }, + { + "epoch": 0.7806894797099606, + "ewc_loss": 0.033606868237257004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6803434846224263e-05, + "grad_norm": 24.701818466186523, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8721475601196289, + "num_tokens": 234145492.0, + "step": 6137 + }, + { + "epoch": 0.780816689988551, + "ewc_loss": 0.033578913658857346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6789457731647417e-05, + "grad_norm": 24.78203773498535, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8629786968231201, + "num_tokens": 234181709.0, + "step": 6138 + }, + { + "epoch": 0.7809439002671416, + "ewc_loss": 0.03363959118723869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6819794836919755e-05, + "grad_norm": 24.745357513427734, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8590136170387268, + "num_tokens": 234217854.0, + "step": 6139 + }, + { + "epoch": 0.7810711105457321, + "ewc_loss": 0.03359033912420273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.679516935837455e-05, + "grad_norm": 24.861759185791016, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.859282910823822, + "num_tokens": 234255769.0, + "step": 6140 + }, + { + "epoch": 0.7811983208243226, + "ewc_loss": 0.03369547426700592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6847736333147623e-05, + "grad_norm": 24.788955688476562, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8458510637283325, + "num_tokens": 234290828.0, + "step": 6141 + }, + { + "epoch": 0.7813255311029131, + "ewc_loss": 0.0335712656378746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.678563239693176e-05, + "grad_norm": 24.777170181274414, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.85819011926651, + "num_tokens": 234326336.0, + "step": 6142 + }, + { + "epoch": 0.7814527413815037, + "ewc_loss": 0.03362742438912392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6813712136354297e-05, + "grad_norm": 24.783405303955078, + "learning_rate": 1e-06, + "loss": 0.4675, + 
"mean_token_accuracy": 0.8544687628746033, + "num_tokens": 234366210.0, + "step": 6143 + }, + { + "epoch": 0.7815799516600941, + "ewc_loss": 0.03359147161245346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6795735064079054e-05, + "grad_norm": 24.708114624023438, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8518307209014893, + "num_tokens": 234398975.0, + "step": 6144 + }, + { + "epoch": 0.7817071619386846, + "ewc_loss": 0.03361115977168083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6805579434731044e-05, + "grad_norm": 24.747325897216797, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8540576100349426, + "num_tokens": 234437746.0, + "step": 6145 + }, + { + "epoch": 0.7818343722172751, + "ewc_loss": 0.03363730013370514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6818650692584924e-05, + "grad_norm": 24.68549156188965, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8642939329147339, + "num_tokens": 234478402.0, + "step": 6146 + }, + { + "epoch": 0.7819615824958657, + "ewc_loss": 0.03366339951753616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6831700122565962e-05, + "grad_norm": 24.869993209838867, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8446716070175171, + "num_tokens": 234509699.0, + "step": 6147 + }, + { + "epoch": 0.7820887927744562, + "ewc_loss": 0.03362775593996048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.681387766439002e-05, + "grad_norm": 24.697834014892578, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8540927767753601, + "num_tokens": 234551515.0, + "step": 6148 + }, + { + "epoch": 0.7822160030530467, + "ewc_loss": 0.03367344290018082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683672053331975e-05, + "grad_norm": 24.81204605102539, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8657006621360779, + "num_tokens": 234582811.0, + "step": 6149 + }, + { + "epoch": 
0.7823432133316371, + "ewc_loss": 0.03371306508779526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.685653296590317e-05, + "grad_norm": 24.87299346923828, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8518855571746826, + "num_tokens": 234625299.0, + "step": 6150 + }, + { + "epoch": 0.7824704236102277, + "ewc_loss": 0.03367778658866882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6838894225656986e-05, + "grad_norm": 24.84473419189453, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8653461337089539, + "num_tokens": 234665641.0, + "step": 6151 + }, + { + "epoch": 0.7825976338888182, + "ewc_loss": 0.0336042121052742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680210516497027e-05, + "grad_norm": 24.827165603637695, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8586028814315796, + "num_tokens": 234704668.0, + "step": 6152 + }, + { + "epoch": 0.7827248441674087, + "ewc_loss": 0.033638909459114075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.681945468590129e-05, + "grad_norm": 24.88941764831543, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8629288673400879, + "num_tokens": 234738080.0, + "step": 6153 + }, + { + "epoch": 0.7828520544459993, + "ewc_loss": 0.03360584005713463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680292007222306e-05, + "grad_norm": 24.677125930786133, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8566391468048096, + "num_tokens": 234781077.0, + "step": 6154 + }, + { + "epoch": 0.7829792647245898, + "ewc_loss": 0.033577896654605865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6788948414614424e-05, + "grad_norm": 24.805675506591797, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8432687520980835, + "num_tokens": 234809374.0, + "step": 6155 + }, + { + "epoch": 0.7831064750031802, + "ewc_loss": 0.03372613340616226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6863066775840707e-05, + "grad_norm": 24.859067916870117, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8477492332458496, + "num_tokens": 234845585.0, + "step": 6156 + }, + { + "epoch": 0.7832336852817707, + "ewc_loss": 0.03361969068646431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680984496488236e-05, + "grad_norm": 24.79807472229004, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8586233258247375, + "num_tokens": 234883891.0, + "step": 6157 + }, + { + "epoch": 0.7833608955603613, + "ewc_loss": 0.03367787227034569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6838936062413268e-05, + "grad_norm": 24.914344787597656, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8578572869300842, + "num_tokens": 234919445.0, + "step": 6158 + }, + { + "epoch": 0.7834881058389518, + "ewc_loss": 0.033662743866443634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6831372704473324e-05, + "grad_norm": 24.839954376220703, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8616619110107422, + "num_tokens": 234951465.0, + "step": 6159 + }, + { + "epoch": 0.7836153161175423, + "ewc_loss": 0.03356454148888588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6782270904514007e-05, + "grad_norm": 24.83260726928711, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8500533103942871, + "num_tokens": 234989970.0, + "step": 6160 + }, + { + "epoch": 0.7837425263961328, + "ewc_loss": 0.0336405411362648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6820271412143484e-05, + "grad_norm": 24.851505279541016, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8598052859306335, + "num_tokens": 235029673.0, + "step": 6161 + }, + { + "epoch": 0.7838697366747234, + "ewc_loss": 0.03364581614732742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6822908946778625e-05, + "grad_norm": 24.789644241333008, + "learning_rate": 1e-06, + "loss": 0.4526, + 
"mean_token_accuracy": 0.8586351871490479, + "num_tokens": 235065389.0, + "step": 6162 + }, + { + "epoch": 0.7839969469533138, + "ewc_loss": 0.03361909091472626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6809544831630774e-05, + "grad_norm": 24.813692092895508, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8700870275497437, + "num_tokens": 235099795.0, + "step": 6163 + }, + { + "epoch": 0.7841241572319043, + "ewc_loss": 0.03362935781478882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6814679838716984e-05, + "grad_norm": 24.7825870513916, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8597880601882935, + "num_tokens": 235137475.0, + "step": 6164 + }, + { + "epoch": 0.7842513675104948, + "ewc_loss": 0.03364605829119682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.682302900007926e-05, + "grad_norm": 24.833677291870117, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8545030355453491, + "num_tokens": 235178454.0, + "step": 6165 + }, + { + "epoch": 0.7843785777890854, + "ewc_loss": 0.0337175652384758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6858783055795357e-05, + "grad_norm": 24.864540100097656, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8774899244308472, + "num_tokens": 235216139.0, + "step": 6166 + }, + { + "epoch": 0.7845057880676759, + "ewc_loss": 0.033618271350860596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6809135559014976e-05, + "grad_norm": 24.736635208129883, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8738946318626404, + "num_tokens": 235253571.0, + "step": 6167 + }, + { + "epoch": 0.7846329983462664, + "ewc_loss": 0.03356962651014328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6784813851700164e-05, + "grad_norm": 24.76752471923828, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8432626724243164, + "num_tokens": 235293950.0, + "step": 6168 + }, + { + "epoch": 
0.7847602086248568, + "ewc_loss": 0.033687420189380646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6843710909597576e-05, + "grad_norm": 24.79485321044922, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8632457256317139, + "num_tokens": 235336419.0, + "step": 6169 + }, + { + "epoch": 0.7848874189034474, + "ewc_loss": 0.03365607187151909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.682803667790722e-05, + "grad_norm": 24.72455406188965, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8645883202552795, + "num_tokens": 235376039.0, + "step": 6170 + }, + { + "epoch": 0.7850146291820379, + "ewc_loss": 0.03364844620227814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6824222257127985e-05, + "grad_norm": 24.78295135498047, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.847287654876709, + "num_tokens": 235412736.0, + "step": 6171 + }, + { + "epoch": 0.7851418394606284, + "ewc_loss": 0.03365716338157654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6828582374728285e-05, + "grad_norm": 24.80475616455078, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8597855567932129, + "num_tokens": 235457318.0, + "step": 6172 + }, + { + "epoch": 0.785269049739219, + "ewc_loss": 0.03362182527780533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.681091271166224e-05, + "grad_norm": 24.75116729736328, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8500868082046509, + "num_tokens": 235491471.0, + "step": 6173 + }, + { + "epoch": 0.7853962600178095, + "ewc_loss": 0.03364473208785057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.682236688793637e-05, + "grad_norm": 24.848880767822266, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8705508708953857, + "num_tokens": 235531324.0, + "step": 6174 + }, + { + "epoch": 0.7855234702963999, + "ewc_loss": 0.033625777810811996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.681288813415449e-05, 
+ "grad_norm": 24.776315689086914, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8467210531234741, + "num_tokens": 235573233.0, + "step": 6175 + }, + { + "epoch": 0.7856506805749904, + "ewc_loss": 0.03368866816163063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6844334822962992e-05, + "grad_norm": 24.882610321044922, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8603732585906982, + "num_tokens": 235614400.0, + "step": 6176 + }, + { + "epoch": 0.785777890853581, + "ewc_loss": 0.0336180105805397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680900459177792e-05, + "grad_norm": 24.791688919067383, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8662199378013611, + "num_tokens": 235652247.0, + "step": 6177 + }, + { + "epoch": 0.7859051011321715, + "ewc_loss": 0.03357037156820297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6785184925538488e-05, + "grad_norm": 24.80512237548828, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8555468320846558, + "num_tokens": 235681810.0, + "step": 6178 + }, + { + "epoch": 0.786032311410762, + "ewc_loss": 0.03362147882580757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6810739907668903e-05, + "grad_norm": 24.812246322631836, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8679230213165283, + "num_tokens": 235718422.0, + "step": 6179 + }, + { + "epoch": 0.7861595216893525, + "ewc_loss": 0.03361121937632561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6805610357550904e-05, + "grad_norm": 24.811908721923828, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8543477654457092, + "num_tokens": 235758886.0, + "step": 6180 + }, + { + "epoch": 0.786286731967943, + "ewc_loss": 0.03362387418746948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6811936802696437e-05, + "grad_norm": 24.83285140991211, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8625555634498596, 
+ "num_tokens": 235792270.0, + "step": 6181 + }, + { + "epoch": 0.7864139422465335, + "ewc_loss": 0.03366183862090111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683091977611184e-05, + "grad_norm": 24.780614852905273, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8498364686965942, + "num_tokens": 235826678.0, + "step": 6182 + }, + { + "epoch": 0.786541152525124, + "ewc_loss": 0.0336027592420578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6801379388198256e-05, + "grad_norm": 24.819196701049805, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8714809417724609, + "num_tokens": 235860437.0, + "step": 6183 + }, + { + "epoch": 0.7866683628037145, + "ewc_loss": 0.03371546417474747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6857731679920107e-05, + "grad_norm": 24.88715171813965, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8645533919334412, + "num_tokens": 235900865.0, + "step": 6184 + }, + { + "epoch": 0.7867955730823051, + "ewc_loss": 0.033629052340984344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6814527043607086e-05, + "grad_norm": 24.85133934020996, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8451249599456787, + "num_tokens": 235937419.0, + "step": 6185 + }, + { + "epoch": 0.7869227833608956, + "ewc_loss": 0.03366636484861374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6833182598929852e-05, + "grad_norm": 24.91533851623535, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8500888347625732, + "num_tokens": 235976425.0, + "step": 6186 + }, + { + "epoch": 0.787049993639486, + "ewc_loss": 0.03366795927286148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6833979316288605e-05, + "grad_norm": 24.88701629638672, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8524210453033447, + "num_tokens": 236012968.0, + "step": 6187 + }, + { + "epoch": 0.7871772039180766, + "ewc_loss": 0.033664949238300323, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6832475012051873e-05, + "grad_norm": 24.9158992767334, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8609869480133057, + "num_tokens": 236052462.0, + "step": 6188 + }, + { + "epoch": 0.7873044141966671, + "ewc_loss": 0.03366250544786453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683125265117269e-05, + "grad_norm": 24.8438663482666, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8752255439758301, + "num_tokens": 236087604.0, + "step": 6189 + }, + { + "epoch": 0.7874316244752576, + "ewc_loss": 0.03364375978708267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6821879398776218e-05, + "grad_norm": 24.86931610107422, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8588669300079346, + "num_tokens": 236126344.0, + "step": 6190 + }, + { + "epoch": 0.7875588347538481, + "ewc_loss": 0.03365402668714523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6827012586873025e-05, + "grad_norm": 24.754987716674805, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8670839071273804, + "num_tokens": 236161913.0, + "step": 6191 + }, + { + "epoch": 0.7876860450324387, + "ewc_loss": 0.03366807848215103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6834039342938922e-05, + "grad_norm": 24.880748748779297, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8619289994239807, + "num_tokens": 236200612.0, + "step": 6192 + }, + { + "epoch": 0.7878132553110291, + "ewc_loss": 0.0336688868701458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683444315858651e-05, + "grad_norm": 24.79570198059082, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8654593825340271, + "num_tokens": 236238136.0, + "step": 6193 + }, + { + "epoch": 0.7879404655896196, + "ewc_loss": 0.03365422785282135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6827114450279623e-05, + "grad_norm": 24.832399368286133, + "learning_rate": 
1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8557454943656921, + "num_tokens": 236281303.0, + "step": 6194 + }, + { + "epoch": 0.7880676758682101, + "ewc_loss": 0.033613186329603195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680659261182882e-05, + "grad_norm": 24.825740814208984, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8569082021713257, + "num_tokens": 236322355.0, + "step": 6195 + }, + { + "epoch": 0.7881948861468007, + "ewc_loss": 0.03359520807862282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6797603166196495e-05, + "grad_norm": 24.838809967041016, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8581348657608032, + "num_tokens": 236354263.0, + "step": 6196 + }, + { + "epoch": 0.7883220964253912, + "ewc_loss": 0.033700838685035706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6850419342517853e-05, + "grad_norm": 24.884157180786133, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8608676195144653, + "num_tokens": 236391581.0, + "step": 6197 + }, + { + "epoch": 0.7884493067039817, + "ewc_loss": 0.03364131227135658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6820657037897035e-05, + "grad_norm": 25.005117416381836, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8679971098899841, + "num_tokens": 236426003.0, + "step": 6198 + }, + { + "epoch": 0.7885765169825721, + "ewc_loss": 0.033713746815919876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6856873116921633e-05, + "grad_norm": 24.84136962890625, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8689841032028198, + "num_tokens": 236464874.0, + "step": 6199 + }, + { + "epoch": 0.7887037272611627, + "ewc_loss": 0.03357598930597305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6787995264166966e-05, + "grad_norm": 24.83452606201172, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8693068027496338, + "num_tokens": 236499826.0, + "step": 6200 + 
}, + { + "epoch": 0.7888309375397532, + "ewc_loss": 0.03370976448059082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6854883142514154e-05, + "grad_norm": 24.7824649810791, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8715336322784424, + "num_tokens": 236538615.0, + "step": 6201 + }, + { + "epoch": 0.7889581478183437, + "ewc_loss": 0.033632077276706696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6816038623801433e-05, + "grad_norm": 24.879714965820312, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.857924222946167, + "num_tokens": 236577860.0, + "step": 6202 + }, + { + "epoch": 0.7890853580969343, + "ewc_loss": 0.03369985148310661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6849926396389492e-05, + "grad_norm": 24.779653549194336, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8742430210113525, + "num_tokens": 236619041.0, + "step": 6203 + }, + { + "epoch": 0.7892125683755248, + "ewc_loss": 0.03366318717598915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6831592802191153e-05, + "grad_norm": 24.974172592163086, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.863235354423523, + "num_tokens": 236655788.0, + "step": 6204 + }, + { + "epoch": 0.7893397786541152, + "ewc_loss": 0.033753663301467896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.687683106865734e-05, + "grad_norm": 24.8056697845459, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.871347188949585, + "num_tokens": 236691532.0, + "step": 6205 + }, + { + "epoch": 0.7894669889327057, + "ewc_loss": 0.033601850271224976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680092464084737e-05, + "grad_norm": 24.89006233215332, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.866523265838623, + "num_tokens": 236733792.0, + "step": 6206 + }, + { + "epoch": 0.7895941992112963, + "ewc_loss": 0.03368663042783737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6843314369907603e-05, + "grad_norm": 24.860702514648438, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8720300793647766, + "num_tokens": 236775454.0, + "step": 6207 + }, + { + "epoch": 0.7897214094898868, + "ewc_loss": 0.0335894450545311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.679472188698128e-05, + "grad_norm": 24.785795211791992, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8695192337036133, + "num_tokens": 236811572.0, + "step": 6208 + }, + { + "epoch": 0.7898486197684773, + "ewc_loss": 0.03363891690969467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.68194583238801e-05, + "grad_norm": 24.807018280029297, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8452130556106567, + "num_tokens": 236851018.0, + "step": 6209 + }, + { + "epoch": 0.7899758300470678, + "ewc_loss": 0.03362957760691643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6814788978081197e-05, + "grad_norm": 24.79926300048828, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8636748790740967, + "num_tokens": 236891871.0, + "step": 6210 + }, + { + "epoch": 0.7901030403256584, + "ewc_loss": 0.03361652046442032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680826062511187e-05, + "grad_norm": 24.81643295288086, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8637720346450806, + "num_tokens": 236926771.0, + "step": 6211 + }, + { + "epoch": 0.7902302506042488, + "ewc_loss": 0.03369925543665886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.684962808212731e-05, + "grad_norm": 24.905311584472656, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8467352390289307, + "num_tokens": 236964768.0, + "step": 6212 + }, + { + "epoch": 0.7903574608828393, + "ewc_loss": 0.03359590470790863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6797952412161976e-05, + "grad_norm": 24.78297233581543, + "learning_rate": 1e-06, + "loss": 0.4756, + 
"mean_token_accuracy": 0.8487827777862549, + "num_tokens": 237005075.0, + "step": 6213 + }, + { + "epoch": 0.7904846711614298, + "ewc_loss": 0.03368442878127098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6842213881318457e-05, + "grad_norm": 24.90159034729004, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8621900081634521, + "num_tokens": 237035517.0, + "step": 6214 + }, + { + "epoch": 0.7906118814400204, + "ewc_loss": 0.033636610954999924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.681830508459825e-05, + "grad_norm": 24.76125717163086, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8768083453178406, + "num_tokens": 237076319.0, + "step": 6215 + }, + { + "epoch": 0.7907390917186109, + "ewc_loss": 0.03362961485981941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6814807167975232e-05, + "grad_norm": 24.755342483520508, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8515646457672119, + "num_tokens": 237117935.0, + "step": 6216 + }, + { + "epoch": 0.7908663019972014, + "ewc_loss": 0.033721696585416794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.686084760876838e-05, + "grad_norm": 24.880332946777344, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.869286298751831, + "num_tokens": 237155789.0, + "step": 6217 + }, + { + "epoch": 0.7909935122757918, + "ewc_loss": 0.033660996705293655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683049777057022e-05, + "grad_norm": 24.78825569152832, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8656775951385498, + "num_tokens": 237198870.0, + "step": 6218 + }, + { + "epoch": 0.7911207225543824, + "ewc_loss": 0.0336616188287735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6830808817758225e-05, + "grad_norm": 24.81880760192871, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8661165237426758, + "num_tokens": 237243084.0, + "step": 6219 + }, + { + "epoch": 
0.7912479328329729, + "ewc_loss": 0.033615369349718094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6807684005470946e-05, + "grad_norm": 24.732065200805664, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8523820638656616, + "num_tokens": 237277245.0, + "step": 6220 + }, + { + "epoch": 0.7913751431115634, + "ewc_loss": 0.033665142953395844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683257141849026e-05, + "grad_norm": 24.888662338256836, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8652704358100891, + "num_tokens": 237317841.0, + "step": 6221 + }, + { + "epoch": 0.791502353390154, + "ewc_loss": 0.033735718578100204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.686785981291905e-05, + "grad_norm": 24.863075256347656, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8496448993682861, + "num_tokens": 237349327.0, + "step": 6222 + }, + { + "epoch": 0.7916295636687445, + "ewc_loss": 0.03366061672568321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683030859567225e-05, + "grad_norm": 24.824443817138672, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.846338152885437, + "num_tokens": 237396179.0, + "step": 6223 + }, + { + "epoch": 0.7917567739473349, + "ewc_loss": 0.03366096317768097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6830481399665587e-05, + "grad_norm": 24.801605224609375, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8569076657295227, + "num_tokens": 237437766.0, + "step": 6224 + }, + { + "epoch": 0.7918839842259254, + "ewc_loss": 0.033635206520557404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6817602954688482e-05, + "grad_norm": 24.806921005249023, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8646214604377747, + "num_tokens": 237479239.0, + "step": 6225 + }, + { + "epoch": 0.792011194504516, + "ewc_loss": 0.03369344770908356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6846723156049848e-05, + "grad_norm": 24.852197647094727, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8733862638473511, + "num_tokens": 237514641.0, + "step": 6226 + }, + { + "epoch": 0.7921384047831065, + "ewc_loss": 0.03365527465939522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.682763650023844e-05, + "grad_norm": 24.785661697387695, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8707588315010071, + "num_tokens": 237555319.0, + "step": 6227 + }, + { + "epoch": 0.792265615061697, + "ewc_loss": 0.03366481512784958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6832407709443942e-05, + "grad_norm": 24.90966033935547, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8631386756896973, + "num_tokens": 237597786.0, + "step": 6228 + }, + { + "epoch": 0.7923928253402875, + "ewc_loss": 0.03373724967241287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6868625607457943e-05, + "grad_norm": 25.095705032348633, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8729277849197388, + "num_tokens": 237636584.0, + "step": 6229 + }, + { + "epoch": 0.792520035618878, + "ewc_loss": 0.033567626029253006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6783813407528214e-05, + "grad_norm": 24.787221908569336, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8514302372932434, + "num_tokens": 237676000.0, + "step": 6230 + }, + { + "epoch": 0.7926472458974685, + "ewc_loss": 0.03352072462439537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.67603629961377e-05, + "grad_norm": 24.82064437866211, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8868629932403564, + "num_tokens": 237713003.0, + "step": 6231 + }, + { + "epoch": 0.792774456176059, + "ewc_loss": 0.03365028649568558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.682514266576618e-05, + "grad_norm": 24.91501235961914, + "learning_rate": 1e-06, + "loss": 0.4349, + 
"mean_token_accuracy": 0.8624428510665894, + "num_tokens": 237742054.0, + "step": 6232 + }, + { + "epoch": 0.7929016664546495, + "ewc_loss": 0.03364595025777817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6822974430397153e-05, + "grad_norm": 24.80528450012207, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8523076772689819, + "num_tokens": 237783147.0, + "step": 6233 + }, + { + "epoch": 0.7930288767332401, + "ewc_loss": 0.03355611115694046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.677805630606599e-05, + "grad_norm": 24.864906311035156, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8536844253540039, + "num_tokens": 237814322.0, + "step": 6234 + }, + { + "epoch": 0.7931560870118306, + "ewc_loss": 0.033612098544836044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.680604873399716e-05, + "grad_norm": 24.732763290405273, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8657097816467285, + "num_tokens": 237849278.0, + "step": 6235 + }, + { + "epoch": 0.793283297290421, + "ewc_loss": 0.03366537392139435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6832687833812088e-05, + "grad_norm": 24.954641342163086, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8645933270454407, + "num_tokens": 237893434.0, + "step": 6236 + }, + { + "epoch": 0.7934105075690115, + "ewc_loss": 0.03364616259932518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6823081750771962e-05, + "grad_norm": 24.872039794921875, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8678760528564453, + "num_tokens": 237932135.0, + "step": 6237 + }, + { + "epoch": 0.7935377178476021, + "ewc_loss": 0.03364802151918411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6824011254357174e-05, + "grad_norm": 24.8735294342041, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8555880188941956, + "num_tokens": 237966320.0, + "step": 6238 + }, + { + "epoch": 
0.7936649281261926, + "ewc_loss": 0.033667515963315964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6833757399581373e-05, + "grad_norm": 24.776628494262695, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8655544519424438, + "num_tokens": 238006365.0, + "step": 6239 + }, + { + "epoch": 0.7937921384047831, + "ewc_loss": 0.033718086779117584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6859043171280064e-05, + "grad_norm": 24.895877838134766, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8568415641784668, + "num_tokens": 238039307.0, + "step": 6240 + }, + { + "epoch": 0.7939193486833737, + "ewc_loss": 0.03371749073266983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6858744857017882e-05, + "grad_norm": 24.927457809448242, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8471542000770569, + "num_tokens": 238078015.0, + "step": 6241 + }, + { + "epoch": 0.7940465589619641, + "ewc_loss": 0.033637866377830505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6818932635942474e-05, + "grad_norm": 24.782569885253906, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8567781448364258, + "num_tokens": 238116790.0, + "step": 6242 + }, + { + "epoch": 0.7941737692405546, + "ewc_loss": 0.03365441784262657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6827209037728608e-05, + "grad_norm": 24.888164520263672, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8668724894523621, + "num_tokens": 238155330.0, + "step": 6243 + }, + { + "epoch": 0.7943009795191451, + "ewc_loss": 0.0336783267557621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683916343608871e-05, + "grad_norm": 24.792875289916992, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8452250957489014, + "num_tokens": 238194492.0, + "step": 6244 + }, + { + "epoch": 0.7944281897977357, + "ewc_loss": 0.03367239236831665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6836196664371528e-05, + "grad_norm": 24.830354690551758, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8600342273712158, + "num_tokens": 238230451.0, + "step": 6245 + }, + { + "epoch": 0.7945554000763262, + "ewc_loss": 0.03369511663913727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6847558072186075e-05, + "grad_norm": 24.8747615814209, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8681995272636414, + "num_tokens": 238266331.0, + "step": 6246 + }, + { + "epoch": 0.7946826103549167, + "ewc_loss": 0.03366784378886223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6833921108627692e-05, + "grad_norm": 24.879173278808594, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8488987684249878, + "num_tokens": 238307753.0, + "step": 6247 + }, + { + "epoch": 0.7948098206335071, + "ewc_loss": 0.03368865326046944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6844327547005378e-05, + "grad_norm": 24.955074310302734, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8610181212425232, + "num_tokens": 238349644.0, + "step": 6248 + }, + { + "epoch": 0.7949370309120977, + "ewc_loss": 0.03363415226340294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6817075447761454e-05, + "grad_norm": 24.85435676574707, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.844491720199585, + "num_tokens": 238386308.0, + "step": 6249 + }, + { + "epoch": 0.7950642411906882, + "ewc_loss": 0.03367463871836662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6837318980833516e-05, + "grad_norm": 24.857250213623047, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8442952036857605, + "num_tokens": 238427935.0, + "step": 6250 + }, + { + "epoch": 0.7951914514692787, + "ewc_loss": 0.03371407464146614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6857036825967953e-05, + "grad_norm": 24.944759368896484, + "learning_rate": 1e-06, + "loss": 0.4301, + 
"mean_token_accuracy": 0.865765392780304, + "num_tokens": 238463639.0, + "step": 6251 + }, + { + "epoch": 0.7953186617478692, + "ewc_loss": 0.03369060531258583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6845302525325678e-05, + "grad_norm": 24.84112548828125, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8728089928627014, + "num_tokens": 238498295.0, + "step": 6252 + }, + { + "epoch": 0.7954458720264598, + "ewc_loss": 0.03367041423916817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6835207134136e-05, + "grad_norm": 24.85393524169922, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8596097230911255, + "num_tokens": 238538109.0, + "step": 6253 + }, + { + "epoch": 0.7955730823050502, + "ewc_loss": 0.033711958676576614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6855979993124492e-05, + "grad_norm": 24.874664306640625, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8466759920120239, + "num_tokens": 238574415.0, + "step": 6254 + }, + { + "epoch": 0.7957002925836407, + "ewc_loss": 0.03370177745819092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6850888641783968e-05, + "grad_norm": 24.965309143066406, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8618555068969727, + "num_tokens": 238616368.0, + "step": 6255 + }, + { + "epoch": 0.7958275028622313, + "ewc_loss": 0.033694732934236526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6847367078298703e-05, + "grad_norm": 24.783416748046875, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8640560507774353, + "num_tokens": 238650634.0, + "step": 6256 + }, + { + "epoch": 0.7959547131408218, + "ewc_loss": 0.0336572602391243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6828629668452777e-05, + "grad_norm": 24.98926544189453, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8544535040855408, + "num_tokens": 238684383.0, + "step": 6257 + }, + { + "epoch": 
0.7960819234194123, + "ewc_loss": 0.033724259585142136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.686212999629788e-05, + "grad_norm": 24.78896713256836, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.857603907585144, + "num_tokens": 238722863.0, + "step": 6258 + }, + { + "epoch": 0.7962091336980028, + "ewc_loss": 0.033742502331733704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.687125040916726e-05, + "grad_norm": 25.014305114746094, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8613433837890625, + "num_tokens": 238765027.0, + "step": 6259 + }, + { + "epoch": 0.7963363439765934, + "ewc_loss": 0.03369956836104393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6849784515216015e-05, + "grad_norm": 24.844453811645508, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8652448654174805, + "num_tokens": 238801950.0, + "step": 6260 + }, + { + "epoch": 0.7964635542551838, + "ewc_loss": 0.03368298336863518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.684149174252525e-05, + "grad_norm": 24.943788528442383, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8517334461212158, + "num_tokens": 238840963.0, + "step": 6261 + }, + { + "epoch": 0.7965907645337743, + "ewc_loss": 0.03374958410859108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6874791981535964e-05, + "grad_norm": 24.96100425720215, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8469931483268738, + "num_tokens": 238876167.0, + "step": 6262 + }, + { + "epoch": 0.7967179748123648, + "ewc_loss": 0.03369372338056564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6846861399244517e-05, + "grad_norm": 24.88140296936035, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.866296648979187, + "num_tokens": 238910293.0, + "step": 6263 + }, + { + "epoch": 0.7968451850909554, + "ewc_loss": 0.03375820443034172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6879102986422367e-05, + "grad_norm": 25.055967330932617, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8572566509246826, + "num_tokens": 238947279.0, + "step": 6264 + }, + { + "epoch": 0.7969723953695459, + "ewc_loss": 0.033692214637994766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6846106518642046e-05, + "grad_norm": 25.031951904296875, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8488805294036865, + "num_tokens": 238984148.0, + "step": 6265 + }, + { + "epoch": 0.7970996056481364, + "ewc_loss": 0.03368603438138962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6843017874634825e-05, + "grad_norm": 24.964866638183594, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.8465594053268433, + "num_tokens": 239022370.0, + "step": 6266 + }, + { + "epoch": 0.7972268159267268, + "ewc_loss": 0.033634770661592484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6817384675960056e-05, + "grad_norm": 24.934133529663086, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8567920923233032, + "num_tokens": 239064542.0, + "step": 6267 + }, + { + "epoch": 0.7973540262053174, + "ewc_loss": 0.033669356256723404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6834677808219567e-05, + "grad_norm": 24.839223861694336, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8595114350318909, + "num_tokens": 239103193.0, + "step": 6268 + }, + { + "epoch": 0.7974812364839079, + "ewc_loss": 0.033659644424915314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6829822925501503e-05, + "grad_norm": 24.939706802368164, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.868009090423584, + "num_tokens": 239137952.0, + "step": 6269 + }, + { + "epoch": 0.7976084467624984, + "ewc_loss": 0.03372383862733841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.686191899352707e-05, + "grad_norm": 24.946046829223633, + "learning_rate": 1e-06, + "loss": 0.4661, + 
"mean_token_accuracy": 0.8491901755332947, + "num_tokens": 239174600.0, + "step": 6270 + }, + { + "epoch": 0.797735657041089, + "ewc_loss": 0.03367343917489052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6836718714330345e-05, + "grad_norm": 24.86338996887207, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8657851219177246, + "num_tokens": 239206157.0, + "step": 6271 + }, + { + "epoch": 0.7978628673196795, + "ewc_loss": 0.03374624624848366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6873123968252912e-05, + "grad_norm": 24.931472778320312, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8645870685577393, + "num_tokens": 239243094.0, + "step": 6272 + }, + { + "epoch": 0.7979900775982699, + "ewc_loss": 0.03374894708395004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6874473658390343e-05, + "grad_norm": 24.906574249267578, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8609143495559692, + "num_tokens": 239281208.0, + "step": 6273 + }, + { + "epoch": 0.7981172878768604, + "ewc_loss": 0.033741023391485214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.687051189946942e-05, + "grad_norm": 24.850326538085938, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8559383153915405, + "num_tokens": 239320159.0, + "step": 6274 + }, + { + "epoch": 0.798244498155451, + "ewc_loss": 0.033679526299238205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.683976370259188e-05, + "grad_norm": 24.88559341430664, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8486948013305664, + "num_tokens": 239360160.0, + "step": 6275 + }, + { + "epoch": 0.7983717084340415, + "ewc_loss": 0.03376096859574318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.688048359937966e-05, + "grad_norm": 24.778892517089844, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8629111051559448, + "num_tokens": 239398101.0, + "step": 6276 + }, + { + "epoch": 
0.798498918712632, + "ewc_loss": 0.03374814614653587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6874073480721563e-05, + "grad_norm": 24.899320602416992, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8744913339614868, + "num_tokens": 239438560.0, + "step": 6277 + }, + { + "epoch": 0.7986261289912225, + "ewc_loss": 0.03384566679596901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6922833310673013e-05, + "grad_norm": 24.81918716430664, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8553414344787598, + "num_tokens": 239475851.0, + "step": 6278 + }, + { + "epoch": 0.798753339269813, + "ewc_loss": 0.03370637074112892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6853186025400646e-05, + "grad_norm": 24.735252380371094, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8649638891220093, + "num_tokens": 239519165.0, + "step": 6279 + }, + { + "epoch": 0.7988805495484035, + "ewc_loss": 0.033785417675971985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.689270902716089e-05, + "grad_norm": 24.991121292114258, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8474037647247314, + "num_tokens": 239560602.0, + "step": 6280 + }, + { + "epoch": 0.799007759826994, + "ewc_loss": 0.033797550946474075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6898775356821716e-05, + "grad_norm": 24.823360443115234, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.860856294631958, + "num_tokens": 239591933.0, + "step": 6281 + }, + { + "epoch": 0.7991349701055845, + "ewc_loss": 0.03373074531555176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6865373254404403e-05, + "grad_norm": 24.94985008239746, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8549127578735352, + "num_tokens": 239625495.0, + "step": 6282 + }, + { + "epoch": 0.7992621803841751, + "ewc_loss": 0.03380095958709717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6900479749892838e-05, + "grad_norm": 24.87603759765625, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8753440976142883, + "num_tokens": 239658249.0, + "step": 6283 + }, + { + "epoch": 0.7993893906627656, + "ewc_loss": 0.03377234563231468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.688617339823395e-05, + "grad_norm": 24.901552200317383, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8511615991592407, + "num_tokens": 239690194.0, + "step": 6284 + }, + { + "epoch": 0.799516600941356, + "ewc_loss": 0.0337861031293869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6893050997168757e-05, + "grad_norm": 24.91705894470215, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.858422040939331, + "num_tokens": 239728315.0, + "step": 6285 + }, + { + "epoch": 0.7996438112199465, + "ewc_loss": 0.03374392166733742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6871961634024046e-05, + "grad_norm": 24.84189224243164, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8648784160614014, + "num_tokens": 239764505.0, + "step": 6286 + }, + { + "epoch": 0.7997710214985371, + "ewc_loss": 0.0337771475315094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.688857446424663e-05, + "grad_norm": 24.770549774169922, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8446133136749268, + "num_tokens": 239809461.0, + "step": 6287 + }, + { + "epoch": 0.7998982317771276, + "ewc_loss": 0.03380228206515312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.690114186203573e-05, + "grad_norm": 24.896133422851562, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8545677661895752, + "num_tokens": 239845232.0, + "step": 6288 + }, + { + "epoch": 0.8000254420557181, + "ewc_loss": 0.03386736288666725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.693368176347576e-05, + "grad_norm": 24.935253143310547, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 
0.8649565577507019, + "num_tokens": 239885489.0, + "step": 6289 + }, + { + "epoch": 0.8001526523343087, + "ewc_loss": 0.03375152125954628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6875761502888054e-05, + "grad_norm": 24.860193252563477, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8468241095542908, + "num_tokens": 239921159.0, + "step": 6290 + }, + { + "epoch": 0.8002798626128991, + "ewc_loss": 0.033891838043928146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6945919924182817e-05, + "grad_norm": 24.87560272216797, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8641787767410278, + "num_tokens": 239960879.0, + "step": 6291 + }, + { + "epoch": 0.8004070728914896, + "ewc_loss": 0.033865198493003845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.693259946478065e-05, + "grad_norm": 24.88705062866211, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8490723967552185, + "num_tokens": 239994302.0, + "step": 6292 + }, + { + "epoch": 0.8005342831700801, + "ewc_loss": 0.03387252613902092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6936262909439392e-05, + "grad_norm": 24.736204147338867, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8661019802093506, + "num_tokens": 240027759.0, + "step": 6293 + }, + { + "epoch": 0.8006614934486707, + "ewc_loss": 0.03384586051106453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.69229297171114e-05, + "grad_norm": 24.992263793945312, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8552531599998474, + "num_tokens": 240065049.0, + "step": 6294 + }, + { + "epoch": 0.8007887037272612, + "ewc_loss": 0.03393292427062988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.696646177151706e-05, + "grad_norm": 24.761974334716797, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8467391729354858, + "num_tokens": 240101453.0, + "step": 6295 + }, + { + "epoch": 0.8009159140058517, + "ewc_loss": 
0.03383822739124298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6919113477342762e-05, + "grad_norm": 24.92823028564453, + "learning_rate": 1e-06, + "loss": 0.5025, + "mean_token_accuracy": 0.838282585144043, + "num_tokens": 240136831.0, + "step": 6296 + }, + { + "epoch": 0.8010431242844421, + "ewc_loss": 0.03393265977501869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6966328985290602e-05, + "grad_norm": 24.821542739868164, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8579527735710144, + "num_tokens": 240173364.0, + "step": 6297 + }, + { + "epoch": 0.8011703345630327, + "ewc_loss": 0.03388594463467598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6942973161349073e-05, + "grad_norm": 24.946144104003906, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8377164006233215, + "num_tokens": 240206718.0, + "step": 6298 + }, + { + "epoch": 0.8012975448416232, + "ewc_loss": 0.03390570357441902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695285209279973e-05, + "grad_norm": 24.822715759277344, + "learning_rate": 1e-06, + "loss": 0.4918, + "mean_token_accuracy": 0.8471362590789795, + "num_tokens": 240245633.0, + "step": 6299 + }, + { + "epoch": 0.8014247551202137, + "ewc_loss": 0.03386704623699188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6933523511397652e-05, + "grad_norm": 24.881595611572266, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8728043437004089, + "num_tokens": 240283887.0, + "step": 6300 + }, + { + "epoch": 0.8015519653988042, + "ewc_loss": 0.03392833098769188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6964166206889786e-05, + "grad_norm": 24.84893035888672, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8516523838043213, + "num_tokens": 240322208.0, + "step": 6301 + }, + { + "epoch": 0.8016791756773948, + "ewc_loss": 0.033900000154972076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.694999991741497e-05, + "grad_norm": 
24.98189353942871, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8622896671295166, + "num_tokens": 240356154.0, + "step": 6302 + }, + { + "epoch": 0.8018063859559852, + "ewc_loss": 0.03388234227895737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.694117054285016e-05, + "grad_norm": 24.878427505493164, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8573808670043945, + "num_tokens": 240389103.0, + "step": 6303 + }, + { + "epoch": 0.8019335962345757, + "ewc_loss": 0.03391801565885544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695900755294133e-05, + "grad_norm": 25.024778366088867, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8760335445404053, + "num_tokens": 240429545.0, + "step": 6304 + }, + { + "epoch": 0.8020608065131662, + "ewc_loss": 0.03398720175027847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6993601093417965e-05, + "grad_norm": 24.969438552856445, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8769668340682983, + "num_tokens": 240468556.0, + "step": 6305 + }, + { + "epoch": 0.8021880167917568, + "ewc_loss": 0.03383440524339676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6917201719479635e-05, + "grad_norm": 24.84949493408203, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8603912591934204, + "num_tokens": 240508534.0, + "step": 6306 + }, + { + "epoch": 0.8023152270703473, + "ewc_loss": 0.03388804942369461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6944024537224323e-05, + "grad_norm": 24.906742095947266, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8598315119743347, + "num_tokens": 240540833.0, + "step": 6307 + }, + { + "epoch": 0.8024424373489378, + "ewc_loss": 0.033850349485874176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6925174350035377e-05, + "grad_norm": 24.862751007080078, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8513638973236084, + 
"num_tokens": 240577768.0, + "step": 6308 + }, + { + "epoch": 0.8025696476275284, + "ewc_loss": 0.033912304788827896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6956151739577763e-05, + "grad_norm": 24.994598388671875, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8546633720397949, + "num_tokens": 240610607.0, + "step": 6309 + }, + { + "epoch": 0.8026968579061188, + "ewc_loss": 0.03393863886594772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.696931940387003e-05, + "grad_norm": 25.041961669921875, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8566684722900391, + "num_tokens": 240646056.0, + "step": 6310 + }, + { + "epoch": 0.8028240681847093, + "ewc_loss": 0.03381907194852829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6909536498133093e-05, + "grad_norm": 24.899534225463867, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.866262674331665, + "num_tokens": 240682513.0, + "step": 6311 + }, + { + "epoch": 0.8029512784632998, + "ewc_loss": 0.03383238613605499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.691619399935007e-05, + "grad_norm": 24.883174896240234, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8609943389892578, + "num_tokens": 240716311.0, + "step": 6312 + }, + { + "epoch": 0.8030784887418904, + "ewc_loss": 0.03388286381959915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.694143247732427e-05, + "grad_norm": 25.004674911499023, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8505550622940063, + "num_tokens": 240758873.0, + "step": 6313 + }, + { + "epoch": 0.8032056990204809, + "ewc_loss": 0.03384850174188614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6924250303418376e-05, + "grad_norm": 24.82678985595703, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8637272119522095, + "num_tokens": 240797163.0, + "step": 6314 + }, + { + "epoch": 0.8033329092990714, + "ewc_loss": 0.033910274505615234, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6955136743490584e-05, + "grad_norm": 25.072999954223633, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8707146644592285, + "num_tokens": 240835719.0, + "step": 6315 + }, + { + "epoch": 0.8034601195776618, + "ewc_loss": 0.03384600579738617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6923002476687543e-05, + "grad_norm": 24.816974639892578, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.853643000125885, + "num_tokens": 240874359.0, + "step": 6316 + }, + { + "epoch": 0.8035873298562524, + "ewc_loss": 0.03379160165786743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.689580130914692e-05, + "grad_norm": 24.950754165649414, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8726515769958496, + "num_tokens": 240909125.0, + "step": 6317 + }, + { + "epoch": 0.8037145401348429, + "ewc_loss": 0.03395752236247063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6978761777863838e-05, + "grad_norm": 24.935529708862305, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8637734651565552, + "num_tokens": 240949391.0, + "step": 6318 + }, + { + "epoch": 0.8038417504134334, + "ewc_loss": 0.033845581114292145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6922791473916732e-05, + "grad_norm": 25.027450561523438, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8489813804626465, + "num_tokens": 240987390.0, + "step": 6319 + }, + { + "epoch": 0.803968960692024, + "ewc_loss": 0.033858440816402435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6929219782468863e-05, + "grad_norm": 24.873764038085938, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8521097898483276, + "num_tokens": 241025359.0, + "step": 6320 + }, + { + "epoch": 0.8040961709706145, + "ewc_loss": 0.03383047506213188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6915237210923806e-05, + "grad_norm": 25.101459503173828, + 
"learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8536580204963684, + "num_tokens": 241060532.0, + "step": 6321 + }, + { + "epoch": 0.8042233812492049, + "ewc_loss": 0.0337897464632988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.689487362455111e-05, + "grad_norm": 24.865331649780273, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8664175868034363, + "num_tokens": 241098547.0, + "step": 6322 + }, + { + "epoch": 0.8043505915277954, + "ewc_loss": 0.03375647962093353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6878238966455683e-05, + "grad_norm": 25.003896713256836, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8586143255233765, + "num_tokens": 241131091.0, + "step": 6323 + }, + { + "epoch": 0.804477801806386, + "ewc_loss": 0.0338967926800251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6948395568761043e-05, + "grad_norm": 24.967660903930664, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.849676251411438, + "num_tokens": 241164495.0, + "step": 6324 + }, + { + "epoch": 0.8046050120849765, + "ewc_loss": 0.0337701179087162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6885058357729577e-05, + "grad_norm": 24.809574127197266, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8535546064376831, + "num_tokens": 241203777.0, + "step": 6325 + }, + { + "epoch": 0.804732222363567, + "ewc_loss": 0.03388339281082153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6941696230787784e-05, + "grad_norm": 25.001266479492188, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8536415696144104, + "num_tokens": 241243219.0, + "step": 6326 + }, + { + "epoch": 0.8048594326421575, + "ewc_loss": 0.03387622535228729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.69381128216628e-05, + "grad_norm": 24.856613159179688, + "learning_rate": 1e-06, + "loss": 0.5054, + "mean_token_accuracy": 0.8438783884048462, + "num_tokens": 241284547.0, + 
"step": 6327 + }, + { + "epoch": 0.804986642920748, + "ewc_loss": 0.03386327624320984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.693163903837558e-05, + "grad_norm": 24.97830581665039, + "learning_rate": 1e-06, + "loss": 0.5203, + "mean_token_accuracy": 0.8410459756851196, + "num_tokens": 241331110.0, + "step": 6328 + }, + { + "epoch": 0.8051138531993385, + "ewc_loss": 0.03390641137957573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695320497674402e-05, + "grad_norm": 24.942960739135742, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8591667413711548, + "num_tokens": 241372720.0, + "step": 6329 + }, + { + "epoch": 0.805241063477929, + "ewc_loss": 0.03384554386138916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6922771465033293e-05, + "grad_norm": 24.93855094909668, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8610418438911438, + "num_tokens": 241408173.0, + "step": 6330 + }, + { + "epoch": 0.8053682737565195, + "ewc_loss": 0.0339079275727272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6953963495325297e-05, + "grad_norm": 25.070772171020508, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8654062747955322, + "num_tokens": 241445021.0, + "step": 6331 + }, + { + "epoch": 0.8054954840351101, + "ewc_loss": 0.03386726602911949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6933632650761865e-05, + "grad_norm": 25.023202896118164, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8647810816764832, + "num_tokens": 241481874.0, + "step": 6332 + }, + { + "epoch": 0.8056226943137006, + "ewc_loss": 0.033861685544252396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6930842321016826e-05, + "grad_norm": 24.97866439819336, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8453245759010315, + "num_tokens": 241523140.0, + "step": 6333 + }, + { + "epoch": 0.805749904592291, + "ewc_loss": 0.033834174275398254, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6917087123147212e-05, + "grad_norm": 24.91002655029297, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.8406080007553101, + "num_tokens": 241563424.0, + "step": 6334 + }, + { + "epoch": 0.8058771148708815, + "ewc_loss": 0.033762842416763306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6881420378922485e-05, + "grad_norm": 24.840499877929688, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8711851835250854, + "num_tokens": 241600004.0, + "step": 6335 + }, + { + "epoch": 0.8060043251494721, + "ewc_loss": 0.03389310464262962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.694655293249525e-05, + "grad_norm": 25.011411666870117, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8574698567390442, + "num_tokens": 241642704.0, + "step": 6336 + }, + { + "epoch": 0.8061315354280626, + "ewc_loss": 0.0338498093187809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6924905139603652e-05, + "grad_norm": 24.83794403076172, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8644731044769287, + "num_tokens": 241679176.0, + "step": 6337 + }, + { + "epoch": 0.8062587457066531, + "ewc_loss": 0.03384682536125183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6923413568292744e-05, + "grad_norm": 24.93667221069336, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8641870617866516, + "num_tokens": 241721168.0, + "step": 6338 + }, + { + "epoch": 0.8063859559852437, + "ewc_loss": 0.033853210508823395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6926605894695967e-05, + "grad_norm": 24.997974395751953, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.873628556728363, + "num_tokens": 241759735.0, + "step": 6339 + }, + { + "epoch": 0.8065131662638341, + "ewc_loss": 0.033809930086135864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6904965377761982e-05, + "grad_norm": 24.938879013061523, + "learning_rate": 1e-06, + "loss": 
0.4533, + "mean_token_accuracy": 0.8598765134811401, + "num_tokens": 241799445.0, + "step": 6340 + }, + { + "epoch": 0.8066403765424246, + "ewc_loss": 0.033829327672719955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.691466422926169e-05, + "grad_norm": 24.983837127685547, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.863657534122467, + "num_tokens": 241839333.0, + "step": 6341 + }, + { + "epoch": 0.8067675868210151, + "ewc_loss": 0.03384123742580414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6920619600568898e-05, + "grad_norm": 24.88450813293457, + "learning_rate": 1e-06, + "loss": 0.4902, + "mean_token_accuracy": 0.847994863986969, + "num_tokens": 241880248.0, + "step": 6342 + }, + { + "epoch": 0.8068947970996057, + "ewc_loss": 0.03378136456012726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.689068267296534e-05, + "grad_norm": 24.9352970123291, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8731206655502319, + "num_tokens": 241926717.0, + "step": 6343 + }, + { + "epoch": 0.8070220073781962, + "ewc_loss": 0.03387941047549248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.69397044373909e-05, + "grad_norm": 24.948627471923828, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8506751656532288, + "num_tokens": 241966080.0, + "step": 6344 + }, + { + "epoch": 0.8071492176567867, + "ewc_loss": 0.033806826919317245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6903413779800758e-05, + "grad_norm": 25.015661239624023, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8562873005867004, + "num_tokens": 242011975.0, + "step": 6345 + }, + { + "epoch": 0.8072764279353771, + "ewc_loss": 0.03381355106830597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.690677527221851e-05, + "grad_norm": 24.867334365844727, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8722368478775024, + "num_tokens": 242046320.0, + "step": 6346 + }, + { + "epoch": 
0.8074036382139677, + "ewc_loss": 0.03372490778565407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6862453776411712e-05, + "grad_norm": 25.000856399536133, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8611410856246948, + "num_tokens": 242085157.0, + "step": 6347 + }, + { + "epoch": 0.8075308484925582, + "ewc_loss": 0.033825915306806564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6912958017201163e-05, + "grad_norm": 24.851919174194336, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8548901081085205, + "num_tokens": 242116548.0, + "step": 6348 + }, + { + "epoch": 0.8076580587711487, + "ewc_loss": 0.03380865603685379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.690432873147074e-05, + "grad_norm": 24.92892074584961, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8647325038909912, + "num_tokens": 242151883.0, + "step": 6349 + }, + { + "epoch": 0.8077852690497392, + "ewc_loss": 0.033862799406051636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.693139893177431e-05, + "grad_norm": 24.887168884277344, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8566457629203796, + "num_tokens": 242190486.0, + "step": 6350 + }, + { + "epoch": 0.8079124793283298, + "ewc_loss": 0.0338326022028923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.691630131972488e-05, + "grad_norm": 24.833263397216797, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8622764348983765, + "num_tokens": 242232720.0, + "step": 6351 + }, + { + "epoch": 0.8080396896069202, + "ewc_loss": 0.03384930640459061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6924654119065963e-05, + "grad_norm": 24.98079490661621, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8610767126083374, + "num_tokens": 242273926.0, + "step": 6352 + }, + { + "epoch": 0.8081668998855107, + "ewc_loss": 0.03387800231575966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.693900048849173e-05, + "grad_norm": 25.007474899291992, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8490437269210815, + "num_tokens": 242313484.0, + "step": 6353 + }, + { + "epoch": 0.8082941101641012, + "ewc_loss": 0.03386204317212105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6931022400967777e-05, + "grad_norm": 24.923322677612305, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8532840609550476, + "num_tokens": 242352101.0, + "step": 6354 + }, + { + "epoch": 0.8084213204426918, + "ewc_loss": 0.03388764336705208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6943820810411125e-05, + "grad_norm": 24.949893951416016, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8749988079071045, + "num_tokens": 242392958.0, + "step": 6355 + }, + { + "epoch": 0.8085485307212823, + "ewc_loss": 0.03385597839951515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6927988326642662e-05, + "grad_norm": 25.017963409423828, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8599643707275391, + "num_tokens": 242432327.0, + "step": 6356 + }, + { + "epoch": 0.8086757409998728, + "ewc_loss": 0.033840931951999664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6920466805459e-05, + "grad_norm": 24.914772033691406, + "learning_rate": 1e-06, + "loss": 0.5126, + "mean_token_accuracy": 0.8381494283676147, + "num_tokens": 242473714.0, + "step": 6357 + }, + { + "epoch": 0.8088029512784632, + "ewc_loss": 0.033821333199739456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6910666090552695e-05, + "grad_norm": 24.952999114990234, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.86676424741745, + "num_tokens": 242513179.0, + "step": 6358 + }, + { + "epoch": 0.8089301615570538, + "ewc_loss": 0.03382245451211929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6911226339288987e-05, + "grad_norm": 24.961929321289062, + "learning_rate": 1e-06, + "loss": 0.445, + 
"mean_token_accuracy": 0.859481155872345, + "num_tokens": 242549398.0, + "step": 6359 + }, + { + "epoch": 0.8090573718356443, + "ewc_loss": 0.033848222345113754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6924112060223706e-05, + "grad_norm": 24.908475875854492, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8542697429656982, + "num_tokens": 242583705.0, + "step": 6360 + }, + { + "epoch": 0.8091845821142348, + "ewc_loss": 0.03377894312143326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.688947122602258e-05, + "grad_norm": 24.896045684814453, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.864017903804779, + "num_tokens": 242625900.0, + "step": 6361 + }, + { + "epoch": 0.8093117923928254, + "ewc_loss": 0.033895716071128845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6947857147897594e-05, + "grad_norm": 25.040992736816406, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8481003046035767, + "num_tokens": 242665766.0, + "step": 6362 + }, + { + "epoch": 0.8094390026714159, + "ewc_loss": 0.033855926245450974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6927962860791013e-05, + "grad_norm": 24.88726806640625, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8622483015060425, + "num_tokens": 242707634.0, + "step": 6363 + }, + { + "epoch": 0.8095662129500064, + "ewc_loss": 0.03380513936281204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6902569768717512e-05, + "grad_norm": 25.046009063720703, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8555248975753784, + "num_tokens": 242746791.0, + "step": 6364 + }, + { + "epoch": 0.8096934232285968, + "ewc_loss": 0.03394140675663948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.697070365480613e-05, + "grad_norm": 24.891658782958984, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8725404143333435, + "num_tokens": 242784861.0, + "step": 6365 + }, + { + "epoch": 
0.8098206335071874, + "ewc_loss": 0.033811021596193314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6905511074583046e-05, + "grad_norm": 25.048593521118164, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8731454014778137, + "num_tokens": 242817612.0, + "step": 6366 + }, + { + "epoch": 0.8099478437857779, + "ewc_loss": 0.03386956453323364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6934782252064906e-05, + "grad_norm": 24.95020294189453, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8671654462814331, + "num_tokens": 242854951.0, + "step": 6367 + }, + { + "epoch": 0.8100750540643684, + "ewc_loss": 0.03377233073115349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6886166122276336e-05, + "grad_norm": 24.984418869018555, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8743126392364502, + "num_tokens": 242894166.0, + "step": 6368 + }, + { + "epoch": 0.8102022643429589, + "ewc_loss": 0.03381795808672905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6908979887375608e-05, + "grad_norm": 24.89548683166504, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.864519476890564, + "num_tokens": 242936230.0, + "step": 6369 + }, + { + "epoch": 0.8103294746215495, + "ewc_loss": 0.033816758543252945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6908379620872438e-05, + "grad_norm": 24.94583511352539, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8704186677932739, + "num_tokens": 242973148.0, + "step": 6370 + }, + { + "epoch": 0.8104566849001399, + "ewc_loss": 0.033809810876846313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6904905351111665e-05, + "grad_norm": 24.88141632080078, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8604484796524048, + "num_tokens": 243013029.0, + "step": 6371 + }, + { + "epoch": 0.8105838951787304, + "ewc_loss": 0.03384024277329445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.6920121197472326e-05, + "grad_norm": 25.083984375, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8597214221954346, + "num_tokens": 243054847.0, + "step": 6372 + }, + { + "epoch": 0.810711105457321, + "ewc_loss": 0.03389473259449005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6947366020758636e-05, + "grad_norm": 25.007219314575195, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8529707193374634, + "num_tokens": 243096524.0, + "step": 6373 + }, + { + "epoch": 0.8108383157359115, + "ewc_loss": 0.03375845402479172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.687922667770181e-05, + "grad_norm": 24.88090705871582, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8529669642448425, + "num_tokens": 243132397.0, + "step": 6374 + }, + { + "epoch": 0.810965526014502, + "ewc_loss": 0.03379208222031593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6896041415748186e-05, + "grad_norm": 25.04295539855957, + "learning_rate": 1e-06, + "loss": 0.4996, + "mean_token_accuracy": 0.8458619117736816, + "num_tokens": 243171790.0, + "step": 6375 + }, + { + "epoch": 0.8110927362930925, + "ewc_loss": 0.0338234007358551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6911701095523313e-05, + "grad_norm": 24.895307540893555, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8584049940109253, + "num_tokens": 243208217.0, + "step": 6376 + }, + { + "epoch": 0.811219946571683, + "ewc_loss": 0.033759959042072296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6879979739314876e-05, + "grad_norm": 25.1223201751709, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8650917410850525, + "num_tokens": 243245260.0, + "step": 6377 + }, + { + "epoch": 0.8113471568502735, + "ewc_loss": 0.033840179443359375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6920090274652466e-05, + "grad_norm": 24.93208122253418, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 
0.8677750825881958, + "num_tokens": 243283924.0, + "step": 6378 + }, + { + "epoch": 0.811474367128864, + "ewc_loss": 0.03377928212285042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.688964039203711e-05, + "grad_norm": 25.11296844482422, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8678939342498779, + "num_tokens": 243321977.0, + "step": 6379 + }, + { + "epoch": 0.8116015774074545, + "ewc_loss": 0.033850960433483124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6925479940255173e-05, + "grad_norm": 24.98922348022461, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8565349578857422, + "num_tokens": 243359349.0, + "step": 6380 + }, + { + "epoch": 0.8117287876860451, + "ewc_loss": 0.033693138509988785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6846568541950546e-05, + "grad_norm": 24.963041305541992, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8625500202178955, + "num_tokens": 243395722.0, + "step": 6381 + }, + { + "epoch": 0.8118559979646356, + "ewc_loss": 0.0338081419467926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6904070434975438e-05, + "grad_norm": 24.948392868041992, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8485385179519653, + "num_tokens": 243429022.0, + "step": 6382 + }, + { + "epoch": 0.811983208243226, + "ewc_loss": 0.0338466577231884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6923328075790778e-05, + "grad_norm": 25.05523109436035, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.838481605052948, + "num_tokens": 243468474.0, + "step": 6383 + }, + { + "epoch": 0.8121104185218165, + "ewc_loss": 0.0338507741689682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6925387171795592e-05, + "grad_norm": 24.970733642578125, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8744698762893677, + "num_tokens": 243503045.0, + "step": 6384 + }, + { + "epoch": 0.8122376288004071, + "ewc_loss": 
0.033833615481853485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6916806998779066e-05, + "grad_norm": 24.976245880126953, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8498455286026001, + "num_tokens": 243543865.0, + "step": 6385 + }, + { + "epoch": 0.8123648390789976, + "ewc_loss": 0.03392348438501358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.696174149401486e-05, + "grad_norm": 25.07547378540039, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8524903059005737, + "num_tokens": 243581258.0, + "step": 6386 + }, + { + "epoch": 0.8124920493575881, + "ewc_loss": 0.033855464309453964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6927731849136762e-05, + "grad_norm": 24.9263858795166, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.853003978729248, + "num_tokens": 243622777.0, + "step": 6387 + }, + { + "epoch": 0.8126192596361786, + "ewc_loss": 0.033876992762088776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6938496628426947e-05, + "grad_norm": 24.928869247436523, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8476853370666504, + "num_tokens": 243664549.0, + "step": 6388 + }, + { + "epoch": 0.8127464699147691, + "ewc_loss": 0.033920373767614365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6960186258074827e-05, + "grad_norm": 24.97189712524414, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8544090986251831, + "num_tokens": 243710408.0, + "step": 6389 + }, + { + "epoch": 0.8128736801933596, + "ewc_loss": 0.03385361656546593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.692680780251976e-05, + "grad_norm": 24.94037628173828, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8610236048698425, + "num_tokens": 243752598.0, + "step": 6390 + }, + { + "epoch": 0.8130008904719501, + "ewc_loss": 0.03393545374274254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6967725969152525e-05, + "grad_norm": 
25.033491134643555, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8568297624588013, + "num_tokens": 243789805.0, + "step": 6391 + }, + { + "epoch": 0.8131281007505406, + "ewc_loss": 0.03392393887042999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6961968867690302e-05, + "grad_norm": 25.114858627319336, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.87027907371521, + "num_tokens": 243824124.0, + "step": 6392 + }, + { + "epoch": 0.8132553110291312, + "ewc_loss": 0.033882107585668564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6941054127528332e-05, + "grad_norm": 24.953184127807617, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8592909574508667, + "num_tokens": 243863448.0, + "step": 6393 + }, + { + "epoch": 0.8133825213077217, + "ewc_loss": 0.03393050283193588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6965252143563703e-05, + "grad_norm": 25.21320915222168, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8631162643432617, + "num_tokens": 243902085.0, + "step": 6394 + }, + { + "epoch": 0.8135097315863121, + "ewc_loss": 0.03388243913650513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6941219655564055e-05, + "grad_norm": 24.932767868041992, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8538325428962708, + "num_tokens": 243945349.0, + "step": 6395 + }, + { + "epoch": 0.8136369418649027, + "ewc_loss": 0.03376640006899834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6883199350559153e-05, + "grad_norm": 25.060855865478516, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8569626808166504, + "num_tokens": 243982221.0, + "step": 6396 + }, + { + "epoch": 0.8137641521434932, + "ewc_loss": 0.03384790197014809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.692395017016679e-05, + "grad_norm": 25.02201271057129, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8463598489761353, + 
"num_tokens": 244012576.0, + "step": 6397 + }, + { + "epoch": 0.8138913624220837, + "ewc_loss": 0.033811304718256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6905652955756523e-05, + "grad_norm": 24.945571899414062, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8664169907569885, + "num_tokens": 244052089.0, + "step": 6398 + }, + { + "epoch": 0.8140185727006742, + "ewc_loss": 0.033872444182634354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6936222891672514e-05, + "grad_norm": 25.085887908935547, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8612487316131592, + "num_tokens": 244088712.0, + "step": 6399 + }, + { + "epoch": 0.8141457829792648, + "ewc_loss": 0.03383324667811394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6916623280849308e-05, + "grad_norm": 25.04922103881836, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8612686991691589, + "num_tokens": 244127655.0, + "step": 6400 + }, + { + "epoch": 0.8142729932578552, + "ewc_loss": 0.03382468596100807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6912343198782764e-05, + "grad_norm": 24.958261489868164, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8648618459701538, + "num_tokens": 244158588.0, + "step": 6401 + }, + { + "epoch": 0.8144002035364457, + "ewc_loss": 0.03384687379002571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.692343721515499e-05, + "grad_norm": 24.955280303955078, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8778513073921204, + "num_tokens": 244193872.0, + "step": 6402 + }, + { + "epoch": 0.8145274138150362, + "ewc_loss": 0.033939819782972336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6969910575426184e-05, + "grad_norm": 25.14871597290039, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8444816470146179, + "num_tokens": 244234055.0, + "step": 6403 + }, + { + "epoch": 0.8146546240936268, + "ewc_loss": 0.03390475735068321, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695237915555481e-05, + "grad_norm": 24.97758674621582, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.859896719455719, + "num_tokens": 244275163.0, + "step": 6404 + }, + { + "epoch": 0.8147818343722173, + "ewc_loss": 0.03385356441140175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.692678233666811e-05, + "grad_norm": 25.035934448242188, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8564708232879639, + "num_tokens": 244315241.0, + "step": 6405 + }, + { + "epoch": 0.8149090446508078, + "ewc_loss": 0.033891137689352036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6945568859227933e-05, + "grad_norm": 25.136940002441406, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8740172386169434, + "num_tokens": 244351947.0, + "step": 6406 + }, + { + "epoch": 0.8150362549293982, + "ewc_loss": 0.03393450379371643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.69672512129182e-05, + "grad_norm": 25.128950119018555, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8583788275718689, + "num_tokens": 244387706.0, + "step": 6407 + }, + { + "epoch": 0.8151634652079888, + "ewc_loss": 0.03387543559074402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.693771810096223e-05, + "grad_norm": 24.945587158203125, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8680922389030457, + "num_tokens": 244428447.0, + "step": 6408 + }, + { + "epoch": 0.8152906754865793, + "ewc_loss": 0.0339030995965004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6951549696386792e-05, + "grad_norm": 25.119840621948242, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8441108465194702, + "num_tokens": 244471191.0, + "step": 6409 + }, + { + "epoch": 0.8154178857651698, + "ewc_loss": 0.033941950649023056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6970974684227258e-05, + "grad_norm": 25.165380477905273, + 
"learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8415218591690063, + "num_tokens": 244509198.0, + "step": 6410 + }, + { + "epoch": 0.8155450960437604, + "ewc_loss": 0.033914707601070404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6957354091573507e-05, + "grad_norm": 25.056068420410156, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8504481315612793, + "num_tokens": 244549677.0, + "step": 6411 + }, + { + "epoch": 0.8156723063223509, + "ewc_loss": 0.03384183347225189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6920916095841676e-05, + "grad_norm": 25.012170791625977, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8462272882461548, + "num_tokens": 244583586.0, + "step": 6412 + }, + { + "epoch": 0.8157995166009414, + "ewc_loss": 0.03385477885603905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6927389879128896e-05, + "grad_norm": 25.057294845581055, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8671228885650635, + "num_tokens": 244621220.0, + "step": 6413 + }, + { + "epoch": 0.8159267268795318, + "ewc_loss": 0.03391466662287712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6957334082690068e-05, + "grad_norm": 25.05268669128418, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8668742179870605, + "num_tokens": 244655941.0, + "step": 6414 + }, + { + "epoch": 0.8160539371581224, + "ewc_loss": 0.03388974443078041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6944872186286375e-05, + "grad_norm": 24.959991455078125, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8761940598487854, + "num_tokens": 244685807.0, + "step": 6415 + }, + { + "epoch": 0.8161811474367129, + "ewc_loss": 0.03391830250620842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695915125310421e-05, + "grad_norm": 24.999155044555664, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8574481010437012, + "num_tokens": 244726866.0, 
+ "step": 6416 + }, + { + "epoch": 0.8163083577153034, + "ewc_loss": 0.03395386412739754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.697693187452387e-05, + "grad_norm": 25.0418701171875, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8505709171295166, + "num_tokens": 244767273.0, + "step": 6417 + }, + { + "epoch": 0.8164355679938939, + "ewc_loss": 0.033914774656295776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695738683338277e-05, + "grad_norm": 25.0301570892334, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8540947437286377, + "num_tokens": 244806111.0, + "step": 6418 + }, + { + "epoch": 0.8165627782724845, + "ewc_loss": 0.0339326374232769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.696631807135418e-05, + "grad_norm": 24.96627426147461, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8699322938919067, + "num_tokens": 244841706.0, + "step": 6419 + }, + { + "epoch": 0.8166899885510749, + "ewc_loss": 0.03391961380839348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6959806089289486e-05, + "grad_norm": 25.03780746459961, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8615868091583252, + "num_tokens": 244883908.0, + "step": 6420 + }, + { + "epoch": 0.8168171988296654, + "ewc_loss": 0.033979084342718124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6989542928058654e-05, + "grad_norm": 25.043989181518555, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8516241908073425, + "num_tokens": 244919090.0, + "step": 6421 + }, + { + "epoch": 0.8169444091082559, + "ewc_loss": 0.03399244695901871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6996224076137878e-05, + "grad_norm": 25.06597328186035, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8730581998825073, + "num_tokens": 244961141.0, + "step": 6422 + }, + { + "epoch": 0.8170716193868465, + "ewc_loss": 0.03393339365720749, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6966696421150118e-05, + "grad_norm": 24.995641708374023, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8426632881164551, + "num_tokens": 245004503.0, + "step": 6423 + }, + { + "epoch": 0.817198829665437, + "ewc_loss": 0.03389745205640793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.694872662483249e-05, + "grad_norm": 25.06857681274414, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8582823276519775, + "num_tokens": 245045042.0, + "step": 6424 + }, + { + "epoch": 0.8173260399440275, + "ewc_loss": 0.033983442932367325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6991722077364102e-05, + "grad_norm": 25.09140968322754, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8585821986198425, + "num_tokens": 245085416.0, + "step": 6425 + }, + { + "epoch": 0.8174532502226179, + "ewc_loss": 0.03391995280981064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695997707429342e-05, + "grad_norm": 25.089128494262695, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8484668731689453, + "num_tokens": 245116137.0, + "step": 6426 + }, + { + "epoch": 0.8175804605012085, + "ewc_loss": 0.03390534222126007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695267201284878e-05, + "grad_norm": 25.030399322509766, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8715775609016418, + "num_tokens": 245159181.0, + "step": 6427 + }, + { + "epoch": 0.817707670779799, + "ewc_loss": 0.033865559846162796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6932779544731602e-05, + "grad_norm": 24.97041130065918, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.851237416267395, + "num_tokens": 245205199.0, + "step": 6428 + }, + { + "epoch": 0.8178348810583895, + "ewc_loss": 0.033944565802812576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6972282537608407e-05, + "grad_norm": 25.150354385375977, + "learning_rate": 1e-06, + "loss": 
0.4337, + "mean_token_accuracy": 0.8635104298591614, + "num_tokens": 245242455.0, + "step": 6429 + }, + { + "epoch": 0.8179620913369801, + "ewc_loss": 0.03393194451928139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6965972463367507e-05, + "grad_norm": 25.011709213256836, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8788586854934692, + "num_tokens": 245280675.0, + "step": 6430 + }, + { + "epoch": 0.8180893016155706, + "ewc_loss": 0.03383379429578781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6916897948249243e-05, + "grad_norm": 25.031034469604492, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8754586577415466, + "num_tokens": 245316947.0, + "step": 6431 + }, + { + "epoch": 0.818216511894161, + "ewc_loss": 0.03394237905740738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6971189324976876e-05, + "grad_norm": 25.042497634887695, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8638445138931274, + "num_tokens": 245355145.0, + "step": 6432 + }, + { + "epoch": 0.8183437221727515, + "ewc_loss": 0.033894285559654236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6947142285062e-05, + "grad_norm": 25.013771057128906, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8706275224685669, + "num_tokens": 245390282.0, + "step": 6433 + }, + { + "epoch": 0.8184709324513421, + "ewc_loss": 0.033828798681497574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.691439865680877e-05, + "grad_norm": 24.942338943481445, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8693230152130127, + "num_tokens": 245430262.0, + "step": 6434 + }, + { + "epoch": 0.8185981427299326, + "ewc_loss": 0.03393159434199333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6965797840384766e-05, + "grad_norm": 25.02150535583496, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.856117844581604, + "num_tokens": 245470874.0, + "step": 6435 + }, + { + "epoch": 
0.8187253530085231, + "ewc_loss": 0.03390440344810486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695220089459326e-05, + "grad_norm": 25.00290298461914, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8659833669662476, + "num_tokens": 245506626.0, + "step": 6436 + }, + { + "epoch": 0.8188525632871136, + "ewc_loss": 0.033952146768569946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6976073311525397e-05, + "grad_norm": 25.01372528076172, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8431782722473145, + "num_tokens": 245546794.0, + "step": 6437 + }, + { + "epoch": 0.8189797735657041, + "ewc_loss": 0.033886704593896866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.694335151114501e-05, + "grad_norm": 24.969558715820312, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8622332811355591, + "num_tokens": 245589544.0, + "step": 6438 + }, + { + "epoch": 0.8191069838442946, + "ewc_loss": 0.03392402082681656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6962010704446584e-05, + "grad_norm": 25.04527473449707, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8758234977722168, + "num_tokens": 245628889.0, + "step": 6439 + }, + { + "epoch": 0.8192341941228851, + "ewc_loss": 0.03395514562726021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.697757215879392e-05, + "grad_norm": 25.07809066772461, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8567582368850708, + "num_tokens": 245674615.0, + "step": 6440 + }, + { + "epoch": 0.8193614044014756, + "ewc_loss": 0.03386499732732773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6932499420363456e-05, + "grad_norm": 24.9114933013916, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8596329689025879, + "num_tokens": 245713966.0, + "step": 6441 + }, + { + "epoch": 0.8194886146800662, + "ewc_loss": 0.03392565995454788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.696282924967818e-05, + "grad_norm": 25.088102340698242, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8645755052566528, + "num_tokens": 245750035.0, + "step": 6442 + }, + { + "epoch": 0.8196158249586567, + "ewc_loss": 0.03396402671933174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6982014130917378e-05, + "grad_norm": 25.02808380126953, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8744536638259888, + "num_tokens": 245788506.0, + "step": 6443 + }, + { + "epoch": 0.8197430352372471, + "ewc_loss": 0.03389905020594597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6949525161180645e-05, + "grad_norm": 24.997604370117188, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8630664944648743, + "num_tokens": 245826547.0, + "step": 6444 + }, + { + "epoch": 0.8198702455158376, + "ewc_loss": 0.03395070508122444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6975352991721593e-05, + "grad_norm": 25.03615951538086, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8508576154708862, + "num_tokens": 245874363.0, + "step": 6445 + }, + { + "epoch": 0.8199974557944282, + "ewc_loss": 0.03390953317284584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6954767488641664e-05, + "grad_norm": 24.986713409423828, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8421158790588379, + "num_tokens": 245917813.0, + "step": 6446 + }, + { + "epoch": 0.8201246660730187, + "ewc_loss": 0.03392898663878441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6964493624982424e-05, + "grad_norm": 24.981163024902344, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8562141060829163, + "num_tokens": 245952177.0, + "step": 6447 + }, + { + "epoch": 0.8202518763516092, + "ewc_loss": 0.0339035801589489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695178980298806e-05, + "grad_norm": 24.970111846923828, + "learning_rate": 1e-06, + "loss": 0.4383, + 
"mean_token_accuracy": 0.8625882267951965, + "num_tokens": 245992897.0, + "step": 6448 + }, + { + "epoch": 0.8203790866301998, + "ewc_loss": 0.033979084342718124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6989542928058654e-05, + "grad_norm": 25.00959014892578, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8684042096138, + "num_tokens": 246029657.0, + "step": 6449 + }, + { + "epoch": 0.8205062969087902, + "ewc_loss": 0.033905427902936935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695271384960506e-05, + "grad_norm": 25.03675079345703, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8491386771202087, + "num_tokens": 246072425.0, + "step": 6450 + }, + { + "epoch": 0.8206335071873807, + "ewc_loss": 0.033962495625019073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6981248336378485e-05, + "grad_norm": 25.00238609313965, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8772106766700745, + "num_tokens": 246113131.0, + "step": 6451 + }, + { + "epoch": 0.8207607174659712, + "ewc_loss": 0.03396526724100113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6982634406303987e-05, + "grad_norm": 25.07109260559082, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8621333837509155, + "num_tokens": 246154548.0, + "step": 6452 + }, + { + "epoch": 0.8208879277445618, + "ewc_loss": 0.033959802240133286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6979900465230457e-05, + "grad_norm": 25.03695297241211, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8559902310371399, + "num_tokens": 246189518.0, + "step": 6453 + }, + { + "epoch": 0.8210151380231523, + "ewc_loss": 0.033908966928720474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695448372629471e-05, + "grad_norm": 25.080406188964844, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8557523488998413, + "num_tokens": 246235248.0, + "step": 6454 + }, + { + "epoch": 
0.8211423483017428, + "ewc_loss": 0.033907290548086166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6953645172179677e-05, + "grad_norm": 24.968231201171875, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8528976440429688, + "num_tokens": 246277016.0, + "step": 6455 + }, + { + "epoch": 0.8212695585803332, + "ewc_loss": 0.03392715007066727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6963575035333633e-05, + "grad_norm": 25.182374954223633, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8571556210517883, + "num_tokens": 246312352.0, + "step": 6456 + }, + { + "epoch": 0.8213967688589238, + "ewc_loss": 0.03390825167298317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6954125385382213e-05, + "grad_norm": 24.922405242919922, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8469464778900146, + "num_tokens": 246355079.0, + "step": 6457 + }, + { + "epoch": 0.8215239791375143, + "ewc_loss": 0.033883724361658096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6941861758823507e-05, + "grad_norm": 25.060993194580078, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8469928503036499, + "num_tokens": 246391073.0, + "step": 6458 + }, + { + "epoch": 0.8216511894161048, + "ewc_loss": 0.03400743007659912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.700371467450168e-05, + "grad_norm": 25.04198455810547, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8659216165542603, + "num_tokens": 246435407.0, + "step": 6459 + }, + { + "epoch": 0.8217783996946953, + "ewc_loss": 0.033895861357450485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6947929907473736e-05, + "grad_norm": 24.997426986694336, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8467433452606201, + "num_tokens": 246475303.0, + "step": 6460 + }, + { + "epoch": 0.8219056099732859, + "ewc_loss": 0.03405360132455826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7026801288011484e-05, + "grad_norm": 25.176586151123047, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8575828075408936, + "num_tokens": 246515237.0, + "step": 6461 + }, + { + "epoch": 0.8220328202518764, + "ewc_loss": 0.03395489975810051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.697745028650388e-05, + "grad_norm": 25.008216857910156, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.854716420173645, + "num_tokens": 246554958.0, + "step": 6462 + }, + { + "epoch": 0.8221600305304668, + "ewc_loss": 0.033912044018507004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.695602259133011e-05, + "grad_norm": 25.002614974975586, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8614399433135986, + "num_tokens": 246592327.0, + "step": 6463 + }, + { + "epoch": 0.8222872408090574, + "ewc_loss": 0.033956222236156464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.697811057965737e-05, + "grad_norm": 25.04700469970703, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.86354660987854, + "num_tokens": 246626965.0, + "step": 6464 + }, + { + "epoch": 0.8224144510876479, + "ewc_loss": 0.033913254737854004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6956626495812088e-05, + "grad_norm": 24.945375442504883, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8556936383247375, + "num_tokens": 246662049.0, + "step": 6465 + }, + { + "epoch": 0.8225416613662384, + "ewc_loss": 0.03399979695677757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.699989843473304e-05, + "grad_norm": 24.94896125793457, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8612163066864014, + "num_tokens": 246696989.0, + "step": 6466 + }, + { + "epoch": 0.8226688716448289, + "ewc_loss": 0.034008774906396866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7004387700580992e-05, + "grad_norm": 25.0761775970459, + "learning_rate": 1e-06, + "loss": 0.4357, + 
"mean_token_accuracy": 0.8640354871749878, + "num_tokens": 246738418.0, + "step": 6467 + }, + { + "epoch": 0.8227960819234195, + "ewc_loss": 0.03399616479873657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6998083083308302e-05, + "grad_norm": 24.975582122802734, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8608279228210449, + "num_tokens": 246776779.0, + "step": 6468 + }, + { + "epoch": 0.8229232922020099, + "ewc_loss": 0.034013472497463226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.700673601590097e-05, + "grad_norm": 24.998449325561523, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8665564656257629, + "num_tokens": 246814029.0, + "step": 6469 + }, + { + "epoch": 0.8230505024806004, + "ewc_loss": 0.03400209918618202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7001049855025485e-05, + "grad_norm": 25.049571990966797, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8514356017112732, + "num_tokens": 246856390.0, + "step": 6470 + }, + { + "epoch": 0.8231777127591909, + "ewc_loss": 0.03402991592884064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7014957848004997e-05, + "grad_norm": 25.030147552490234, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.861114501953125, + "num_tokens": 246895563.0, + "step": 6471 + }, + { + "epoch": 0.8233049230377815, + "ewc_loss": 0.03398469462990761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6992347809718922e-05, + "grad_norm": 24.971158981323242, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8562424778938293, + "num_tokens": 246935956.0, + "step": 6472 + }, + { + "epoch": 0.823432133316372, + "ewc_loss": 0.033999405801296234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.699970380286686e-05, + "grad_norm": 24.98432731628418, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8560139536857605, + "num_tokens": 246984765.0, + "step": 6473 + }, + { + "epoch": 
0.8235593435949625, + "ewc_loss": 0.03405999764800072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7029999071382917e-05, + "grad_norm": 25.001686096191406, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8629722595214844, + "num_tokens": 247017086.0, + "step": 6474 + }, + { + "epoch": 0.8236865538735529, + "ewc_loss": 0.03403661400079727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7018306607496925e-05, + "grad_norm": 25.03508186340332, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.864736795425415, + "num_tokens": 247056195.0, + "step": 6475 + }, + { + "epoch": 0.8238137641521435, + "ewc_loss": 0.03401649370789528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7008247596095316e-05, + "grad_norm": 25.053218841552734, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8728747963905334, + "num_tokens": 247094063.0, + "step": 6476 + }, + { + "epoch": 0.823940974430734, + "ewc_loss": 0.03399600088596344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.6998001228785142e-05, + "grad_norm": 25.05420684814453, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8581905364990234, + "num_tokens": 247131743.0, + "step": 6477 + }, + { + "epoch": 0.8240681847093245, + "ewc_loss": 0.03394637256860733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.697318657534197e-05, + "grad_norm": 25.004989624023438, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8580936193466187, + "num_tokens": 247169477.0, + "step": 6478 + }, + { + "epoch": 0.824195394987915, + "ewc_loss": 0.03404351696372032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7021759049384855e-05, + "grad_norm": 24.9164981842041, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8685455322265625, + "num_tokens": 247209159.0, + "step": 6479 + }, + { + "epoch": 0.8243226052665056, + "ewc_loss": 0.034029293805360794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.701464680081699e-05, + "grad_norm": 25.08635139465332, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8616993427276611, + "num_tokens": 247247603.0, + "step": 6480 + }, + { + "epoch": 0.824449815545096, + "ewc_loss": 0.03405706211924553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7028531146934256e-05, + "grad_norm": 24.999670028686523, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8691399097442627, + "num_tokens": 247290184.0, + "step": 6481 + }, + { + "epoch": 0.8245770258236865, + "ewc_loss": 0.03403386101126671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.701692963251844e-05, + "grad_norm": 25.01020622253418, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8421421051025391, + "num_tokens": 247331862.0, + "step": 6482 + }, + { + "epoch": 0.824704236102277, + "ewc_loss": 0.03402189537882805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7010946976370178e-05, + "grad_norm": 24.999624252319336, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8588298559188843, + "num_tokens": 247368271.0, + "step": 6483 + }, + { + "epoch": 0.8248314463808676, + "ewc_loss": 0.034071289002895355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7035645214491524e-05, + "grad_norm": 24.992488861083984, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.858973503112793, + "num_tokens": 247408077.0, + "step": 6484 + }, + { + "epoch": 0.8249586566594581, + "ewc_loss": 0.03400201350450516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.70010061992798e-05, + "grad_norm": 24.89764404296875, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8676787614822388, + "num_tokens": 247447408.0, + "step": 6485 + }, + { + "epoch": 0.8250858669380486, + "ewc_loss": 0.03407789766788483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7038948499248363e-05, + "grad_norm": 25.00498390197754, + "learning_rate": 1e-06, + "loss": 0.4194, + 
"mean_token_accuracy": 0.870975136756897, + "num_tokens": 247489863.0, + "step": 6486 + }, + { + "epoch": 0.8252130772166391, + "ewc_loss": 0.03412604704499245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7063022824004292e-05, + "grad_norm": 24.98297119140625, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8554947376251221, + "num_tokens": 247532233.0, + "step": 6487 + }, + { + "epoch": 0.8253402874952296, + "ewc_loss": 0.03408091515302658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7040458260453306e-05, + "grad_norm": 25.042572021484375, + "learning_rate": 1e-06, + "loss": 0.5322, + "mean_token_accuracy": 0.8340903520584106, + "num_tokens": 247570719.0, + "step": 6488 + }, + { + "epoch": 0.8254674977738201, + "ewc_loss": 0.034085217863321304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7042608305928297e-05, + "grad_norm": 24.91073989868164, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8650130033493042, + "num_tokens": 247603302.0, + "step": 6489 + }, + { + "epoch": 0.8255947080524106, + "ewc_loss": 0.03405686467885971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7028432921506464e-05, + "grad_norm": 25.00977897644043, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.862974226474762, + "num_tokens": 247642837.0, + "step": 6490 + }, + { + "epoch": 0.8257219183310012, + "ewc_loss": 0.034188155084848404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.709407843009103e-05, + "grad_norm": 25.061626434326172, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.856640100479126, + "num_tokens": 247685246.0, + "step": 6491 + }, + { + "epoch": 0.8258491286095917, + "ewc_loss": 0.03402290120720863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.701145083643496e-05, + "grad_norm": 24.91434097290039, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.852116048336029, + "num_tokens": 247720871.0, + "step": 6492 + }, + { + "epoch": 
0.8259763388881821, + "ewc_loss": 0.03410227969288826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7051139366230927e-05, + "grad_norm": 25.031171798706055, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8609330654144287, + "num_tokens": 247759796.0, + "step": 6493 + }, + { + "epoch": 0.8261035491667726, + "ewc_loss": 0.03418470919132233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.709235402813647e-05, + "grad_norm": 24.971660614013672, + "learning_rate": 1e-06, + "loss": 0.5249, + "mean_token_accuracy": 0.838594913482666, + "num_tokens": 247806486.0, + "step": 6494 + }, + { + "epoch": 0.8262307594453632, + "ewc_loss": 0.03408633545041084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.704316855466459e-05, + "grad_norm": 25.07103157043457, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.867743730545044, + "num_tokens": 247842031.0, + "step": 6495 + }, + { + "epoch": 0.8263579697239537, + "ewc_loss": 0.03413644805550575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.706822331470903e-05, + "grad_norm": 24.912303924560547, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8628175258636475, + "num_tokens": 247883438.0, + "step": 6496 + }, + { + "epoch": 0.8264851800025442, + "ewc_loss": 0.034141816198825836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7070908143068664e-05, + "grad_norm": 25.13610076904297, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8685023188591003, + "num_tokens": 247921491.0, + "step": 6497 + }, + { + "epoch": 0.8266123902811348, + "ewc_loss": 0.0341353677213192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7067683074856177e-05, + "grad_norm": 24.934188842773438, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8695699572563171, + "num_tokens": 247957454.0, + "step": 6498 + }, + { + "epoch": 0.8267396005597252, + "ewc_loss": 0.03404451534152031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7022257452481426e-05, + "grad_norm": 25.047773361206055, + "learning_rate": 1e-06, + "loss": 0.5067, + "mean_token_accuracy": 0.8447399139404297, + "num_tokens": 247997705.0, + "step": 6499 + }, + { + "epoch": 0.8268668108383157, + "ewc_loss": 0.034140221774578094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.707011142570991e-05, + "grad_norm": 24.970874786376953, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8552440404891968, + "num_tokens": 248034536.0, + "step": 6500 + }, + { + "epoch": 0.8269940211169062, + "ewc_loss": 0.034035950899124146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.701797555142548e-05, + "grad_norm": 25.004587173461914, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8603449463844299, + "num_tokens": 248080916.0, + "step": 6501 + }, + { + "epoch": 0.8271212313954968, + "ewc_loss": 0.03418130427598953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.709065145405475e-05, + "grad_norm": 25.02071762084961, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8585031628608704, + "num_tokens": 248119719.0, + "step": 6502 + }, + { + "epoch": 0.8272484416740873, + "ewc_loss": 0.03413627669215202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7068137822207063e-05, + "grad_norm": 24.958587646484375, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8570968508720398, + "num_tokens": 248156221.0, + "step": 6503 + }, + { + "epoch": 0.8273756519526778, + "ewc_loss": 0.03407689556479454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7038448277162388e-05, + "grad_norm": 24.934850692749023, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8627781867980957, + "num_tokens": 248191780.0, + "step": 6504 + }, + { + "epoch": 0.8275028622312682, + "ewc_loss": 0.03412343189120293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7061716789612547e-05, + "grad_norm": 24.99236488342285, + "learning_rate": 1e-06, + "loss": 0.4655, + 
"mean_token_accuracy": 0.854582667350769, + "num_tokens": 248228594.0, + "step": 6505 + }, + { + "epoch": 0.8276300725098588, + "ewc_loss": 0.0341559462249279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7077973097912036e-05, + "grad_norm": 25.056074142456055, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8643660545349121, + "num_tokens": 248260372.0, + "step": 6506 + }, + { + "epoch": 0.8277572827884493, + "ewc_loss": 0.034166451543569565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7083226339309476e-05, + "grad_norm": 24.988582611083984, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8517321348190308, + "num_tokens": 248299361.0, + "step": 6507 + }, + { + "epoch": 0.8278844930670398, + "ewc_loss": 0.03411775827407837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7058879166143015e-05, + "grad_norm": 24.95191764831543, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8517833948135376, + "num_tokens": 248337370.0, + "step": 6508 + }, + { + "epoch": 0.8280117033456303, + "ewc_loss": 0.034163698554039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7081849364330992e-05, + "grad_norm": 24.985652923583984, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8626977205276489, + "num_tokens": 248377541.0, + "step": 6509 + }, + { + "epoch": 0.8281389136242209, + "ewc_loss": 0.03413806110620499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7069030946004204e-05, + "grad_norm": 24.917863845825195, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8422651290893555, + "num_tokens": 248413928.0, + "step": 6510 + }, + { + "epoch": 0.8282661239028114, + "ewc_loss": 0.034229129552841187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7114565707743168e-05, + "grad_norm": 25.08648109436035, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8595890998840332, + "num_tokens": 248453283.0, + "step": 6511 + }, + { + "epoch": 
0.8283933341814018, + "ewc_loss": 0.034223772585392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7111886336351745e-05, + "grad_norm": 25.0438232421875, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8657349348068237, + "num_tokens": 248490286.0, + "step": 6512 + }, + { + "epoch": 0.8285205444599923, + "ewc_loss": 0.03418910503387451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7094553186325356e-05, + "grad_norm": 24.98455238342285, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.8442939519882202, + "num_tokens": 248526349.0, + "step": 6513 + }, + { + "epoch": 0.8286477547385829, + "ewc_loss": 0.034191399812698364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.709569914964959e-05, + "grad_norm": 25.08447265625, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8515192270278931, + "num_tokens": 248561751.0, + "step": 6514 + }, + { + "epoch": 0.8287749650171734, + "ewc_loss": 0.034280020743608475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7140009731519967e-05, + "grad_norm": 24.96316909790039, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8725637197494507, + "num_tokens": 248601921.0, + "step": 6515 + }, + { + "epoch": 0.8289021752957639, + "ewc_loss": 0.03418774530291557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.709387288428843e-05, + "grad_norm": 24.99323844909668, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8710117936134338, + "num_tokens": 248641218.0, + "step": 6516 + }, + { + "epoch": 0.8290293855743545, + "ewc_loss": 0.034263744950294495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7131873391917907e-05, + "grad_norm": 25.13564682006836, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8603953719139099, + "num_tokens": 248672616.0, + "step": 6517 + }, + { + "epoch": 0.8291565958529449, + "ewc_loss": 0.03423592820763588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.711796357994899e-05, + 
"grad_norm": 25.04877471923828, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8633420467376709, + "num_tokens": 248708212.0, + "step": 6518 + }, + { + "epoch": 0.8292838061315354, + "ewc_loss": 0.0342426598072052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7121330529334955e-05, + "grad_norm": 25.02337074279785, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8644740581512451, + "num_tokens": 248746575.0, + "step": 6519 + }, + { + "epoch": 0.8294110164101259, + "ewc_loss": 0.034250445663928986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7125223166658543e-05, + "grad_norm": 25.003089904785156, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8557898998260498, + "num_tokens": 248789423.0, + "step": 6520 + }, + { + "epoch": 0.8295382266887165, + "ewc_loss": 0.03421919047832489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7109594409703277e-05, + "grad_norm": 25.03450584411621, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.862136960029602, + "num_tokens": 248824044.0, + "step": 6521 + }, + { + "epoch": 0.829665436967307, + "ewc_loss": 0.03422966226935387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7114831280196086e-05, + "grad_norm": 25.113178253173828, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8569969534873962, + "num_tokens": 248860805.0, + "step": 6522 + }, + { + "epoch": 0.8297926472458975, + "ewc_loss": 0.03423330560326576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7116652088589035e-05, + "grad_norm": 24.970783233642578, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8666690587997437, + "num_tokens": 248898958.0, + "step": 6523 + }, + { + "epoch": 0.8299198575244879, + "ewc_loss": 0.03422601893544197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7113008652813733e-05, + "grad_norm": 25.07640266418457, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8582257628440857, 
+ "num_tokens": 248938847.0, + "step": 6524 + }, + { + "epoch": 0.8300470678030785, + "ewc_loss": 0.03422088921070099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7110443877754733e-05, + "grad_norm": 24.941749572753906, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8564015030860901, + "num_tokens": 248977944.0, + "step": 6525 + }, + { + "epoch": 0.830174278081669, + "ewc_loss": 0.034213609993457794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7106804079958238e-05, + "grad_norm": 25.113447189331055, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8527317643165588, + "num_tokens": 249014610.0, + "step": 6526 + }, + { + "epoch": 0.8303014883602595, + "ewc_loss": 0.03432215005159378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7161075447802432e-05, + "grad_norm": 24.979127883911133, + "learning_rate": 1e-06, + "loss": 0.5491, + "mean_token_accuracy": 0.8293002247810364, + "num_tokens": 249053869.0, + "step": 6527 + }, + { + "epoch": 0.83042869863885, + "ewc_loss": 0.034183140844106674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.709157004370354e-05, + "grad_norm": 25.09880256652832, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8526951670646667, + "num_tokens": 249096874.0, + "step": 6528 + }, + { + "epoch": 0.8305559089174406, + "ewc_loss": 0.03424953296780586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7124766600318253e-05, + "grad_norm": 24.947879791259766, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8580523133277893, + "num_tokens": 249133141.0, + "step": 6529 + }, + { + "epoch": 0.830683119196031, + "ewc_loss": 0.03414972499012947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7074862626031972e-05, + "grad_norm": 25.07401466369629, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8640609979629517, + "num_tokens": 249163813.0, + "step": 6530 + }, + { + "epoch": 0.8308103294746215, + "ewc_loss": 0.03435708209872246, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717854138405528e-05, + "grad_norm": 25.015422821044922, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8519003391265869, + "num_tokens": 249202935.0, + "step": 6531 + }, + { + "epoch": 0.830937539753212, + "ewc_loss": 0.0342119038105011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7105950973927975e-05, + "grad_norm": 25.105730056762695, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8668431043624878, + "num_tokens": 249241061.0, + "step": 6532 + }, + { + "epoch": 0.8310647500318026, + "ewc_loss": 0.03425871953368187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7129359548562206e-05, + "grad_norm": 24.985761642456055, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8477809429168701, + "num_tokens": 249277863.0, + "step": 6533 + }, + { + "epoch": 0.8311919603103931, + "ewc_loss": 0.0342344269156456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.711721415631473e-05, + "grad_norm": 24.954483032226562, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.859237551689148, + "num_tokens": 249318047.0, + "step": 6534 + }, + { + "epoch": 0.8313191705889836, + "ewc_loss": 0.034267961978912354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7133981600636616e-05, + "grad_norm": 24.98576545715332, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.854426383972168, + "num_tokens": 249362095.0, + "step": 6535 + }, + { + "epoch": 0.831446380867574, + "ewc_loss": 0.034290265291929245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7145132005680352e-05, + "grad_norm": 25.037412643432617, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8733130097389221, + "num_tokens": 249403362.0, + "step": 6536 + }, + { + "epoch": 0.8315735911461646, + "ewc_loss": 0.03429649770259857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7148247934528627e-05, + "grad_norm": 25.00871467590332, + "learning_rate": 
1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8657642602920532, + "num_tokens": 249438051.0, + "step": 6537 + }, + { + "epoch": 0.8317008014247551, + "ewc_loss": 0.03430827707052231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7154137822217308e-05, + "grad_norm": 25.131851196289062, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8631032705307007, + "num_tokens": 249477185.0, + "step": 6538 + }, + { + "epoch": 0.8318280117033456, + "ewc_loss": 0.034267161041498184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7133581422967836e-05, + "grad_norm": 25.051395416259766, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8525999784469604, + "num_tokens": 249516880.0, + "step": 6539 + }, + { + "epoch": 0.8319552219819362, + "ewc_loss": 0.03425593674182892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7127968021668494e-05, + "grad_norm": 25.08592987060547, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8696315884590149, + "num_tokens": 249555864.0, + "step": 6540 + }, + { + "epoch": 0.8320824322605267, + "ewc_loss": 0.03431333601474762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715666803647764e-05, + "grad_norm": 25.04737663269043, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8578587770462036, + "num_tokens": 249590566.0, + "step": 6541 + }, + { + "epoch": 0.8322096425391171, + "ewc_loss": 0.03421712666749954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7108563042711467e-05, + "grad_norm": 25.1302547454834, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8716225028038025, + "num_tokens": 249631523.0, + "step": 6542 + }, + { + "epoch": 0.8323368528177076, + "ewc_loss": 0.03424474224448204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7122370991273783e-05, + "grad_norm": 25.010892868041992, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8678467273712158, + "num_tokens": 249674113.0, + "step": 6543 + }, + 
{ + "epoch": 0.8324640630962982, + "ewc_loss": 0.03425708785653114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7128544641309418e-05, + "grad_norm": 25.072799682617188, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8711186647415161, + "num_tokens": 249712934.0, + "step": 6544 + }, + { + "epoch": 0.8325912733748887, + "ewc_loss": 0.03420533239841461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.710266587906517e-05, + "grad_norm": 25.011608123779297, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8650429248809814, + "num_tokens": 249750295.0, + "step": 6545 + }, + { + "epoch": 0.8327184836534792, + "ewc_loss": 0.034242454916238785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7121226846938953e-05, + "grad_norm": 25.14253044128418, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8644788265228271, + "num_tokens": 249790289.0, + "step": 6546 + }, + { + "epoch": 0.8328456939320698, + "ewc_loss": 0.034191977232694626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7095988368964754e-05, + "grad_norm": 24.966323852539062, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.87242591381073, + "num_tokens": 249822873.0, + "step": 6547 + }, + { + "epoch": 0.8329729042106602, + "ewc_loss": 0.03415543586015701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.707771843939554e-05, + "grad_norm": 25.03373146057129, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8609146475791931, + "num_tokens": 249860564.0, + "step": 6548 + }, + { + "epoch": 0.8331001144892507, + "ewc_loss": 0.03428006172180176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714003155939281e-05, + "grad_norm": 25.020004272460938, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8591749668121338, + "num_tokens": 249901448.0, + "step": 6549 + }, + { + "epoch": 0.8332273247678412, + "ewc_loss": 0.0342569537460804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7128477338701487e-05, + "grad_norm": 25.139816284179688, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.848857045173645, + "num_tokens": 249933524.0, + "step": 6550 + }, + { + "epoch": 0.8333545350464318, + "ewc_loss": 0.03428608551621437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7143041986855678e-05, + "grad_norm": 25.036571502685547, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8555717468261719, + "num_tokens": 249971856.0, + "step": 6551 + }, + { + "epoch": 0.8334817453250223, + "ewc_loss": 0.03418121486902237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7090607798309065e-05, + "grad_norm": 25.010149002075195, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8518449068069458, + "num_tokens": 250013940.0, + "step": 6552 + }, + { + "epoch": 0.8336089556036128, + "ewc_loss": 0.0342816598713398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7140830095740966e-05, + "grad_norm": 25.039487838745117, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8429045081138611, + "num_tokens": 250044826.0, + "step": 6553 + }, + { + "epoch": 0.8337361658822032, + "ewc_loss": 0.034276969730854034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7138485418399796e-05, + "grad_norm": 25.166887283325195, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.844211220741272, + "num_tokens": 250087409.0, + "step": 6554 + }, + { + "epoch": 0.8338633761607938, + "ewc_loss": 0.03424132615327835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7120662960223854e-05, + "grad_norm": 24.92958641052246, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.866213858127594, + "num_tokens": 250126236.0, + "step": 6555 + }, + { + "epoch": 0.8339905864393843, + "ewc_loss": 0.03421974182128906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7109870896092616e-05, + "grad_norm": 25.094207763671875, + "learning_rate": 1e-06, + "loss": 0.4152, + 
"mean_token_accuracy": 0.8683929443359375, + "num_tokens": 250163559.0, + "step": 6556 + }, + { + "epoch": 0.8341177967179748, + "ewc_loss": 0.03426729142665863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.713364508759696e-05, + "grad_norm": 24.96469497680664, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8608556985855103, + "num_tokens": 250203536.0, + "step": 6557 + }, + { + "epoch": 0.8342450069965653, + "ewc_loss": 0.03430594131350517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715297003102023e-05, + "grad_norm": 25.22130012512207, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8648507595062256, + "num_tokens": 250237455.0, + "step": 6558 + }, + { + "epoch": 0.8343722172751559, + "ewc_loss": 0.03435135260224342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7175676475744694e-05, + "grad_norm": 25.089466094970703, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8678396344184875, + "num_tokens": 250274559.0, + "step": 6559 + }, + { + "epoch": 0.8344994275537464, + "ewc_loss": 0.03424980118870735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7124901205534115e-05, + "grad_norm": 25.10967254638672, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8613201379776001, + "num_tokens": 250309392.0, + "step": 6560 + }, + { + "epoch": 0.8346266378323368, + "ewc_loss": 0.03428462892770767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714231439109426e-05, + "grad_norm": 24.972211837768555, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8652822971343994, + "num_tokens": 250352010.0, + "step": 6561 + }, + { + "epoch": 0.8347538481109273, + "ewc_loss": 0.03424591198563576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7122956705861725e-05, + "grad_norm": 25.134790420532227, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8477460145950317, + "num_tokens": 250385745.0, + "step": 6562 + }, + { + "epoch": 
0.8348810583895179, + "ewc_loss": 0.03435726463794708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7178632333525456e-05, + "grad_norm": 25.111316680908203, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8581547737121582, + "num_tokens": 250426055.0, + "step": 6563 + }, + { + "epoch": 0.8350082686681084, + "ewc_loss": 0.03425528109073639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7127640603575855e-05, + "grad_norm": 25.102413177490234, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8576929569244385, + "num_tokens": 250468795.0, + "step": 6564 + }, + { + "epoch": 0.8351354789466989, + "ewc_loss": 0.03432704880833626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7163523807539605e-05, + "grad_norm": 25.085233688354492, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8582465648651123, + "num_tokens": 250509115.0, + "step": 6565 + }, + { + "epoch": 0.8352626892252895, + "ewc_loss": 0.03426888585090637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7134443623945117e-05, + "grad_norm": 24.9349365234375, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8599283695220947, + "num_tokens": 250547770.0, + "step": 6566 + }, + { + "epoch": 0.8353898995038799, + "ewc_loss": 0.034308914095163345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7154457964352332e-05, + "grad_norm": 25.129636764526367, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8565461039543152, + "num_tokens": 250589737.0, + "step": 6567 + }, + { + "epoch": 0.8355171097824704, + "ewc_loss": 0.034356873482465744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7178435882669874e-05, + "grad_norm": 25.10667610168457, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8681221008300781, + "num_tokens": 250623058.0, + "step": 6568 + }, + { + "epoch": 0.8356443200610609, + "ewc_loss": 0.034305162727832794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7152580767287873e-05, + "grad_norm": 25.096607208251953, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8597652912139893, + "num_tokens": 250660526.0, + "step": 6569 + }, + { + "epoch": 0.8357715303396515, + "ewc_loss": 0.03436805307865143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7184027456096373e-05, + "grad_norm": 25.103548049926758, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8437179923057556, + "num_tokens": 250698218.0, + "step": 6570 + }, + { + "epoch": 0.835898740618242, + "ewc_loss": 0.03426993638277054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7134967492893338e-05, + "grad_norm": 25.04859733581543, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8528414964675903, + "num_tokens": 250743410.0, + "step": 6571 + }, + { + "epoch": 0.8360259508968325, + "ewc_loss": 0.03433693200349808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.716846600174904e-05, + "grad_norm": 25.131624221801758, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8686933517456055, + "num_tokens": 250781919.0, + "step": 6572 + }, + { + "epoch": 0.8361531611754229, + "ewc_loss": 0.034293390810489655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7146696336567402e-05, + "grad_norm": 24.976804733276367, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.858573317527771, + "num_tokens": 250817051.0, + "step": 6573 + }, + { + "epoch": 0.8362803714540135, + "ewc_loss": 0.03428976982831955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714488462312147e-05, + "grad_norm": 25.063386917114258, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.857286810874939, + "num_tokens": 250857241.0, + "step": 6574 + }, + { + "epoch": 0.836407581732604, + "ewc_loss": 0.03431635722517967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7158177797682583e-05, + "grad_norm": 25.09189224243164, + "learning_rate": 1e-06, + "loss": 0.4525, + 
"mean_token_accuracy": 0.8607773184776306, + "num_tokens": 250903591.0, + "step": 6575 + }, + { + "epoch": 0.8365347920111945, + "ewc_loss": 0.03425687551498413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.712843732093461e-05, + "grad_norm": 25.01345443725586, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8529917597770691, + "num_tokens": 250945788.0, + "step": 6576 + }, + { + "epoch": 0.836662002289785, + "ewc_loss": 0.03431640565395355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7158203263534233e-05, + "grad_norm": 24.989070892333984, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8534474968910217, + "num_tokens": 250984884.0, + "step": 6577 + }, + { + "epoch": 0.8367892125683756, + "ewc_loss": 0.034322239458560944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7161119103548117e-05, + "grad_norm": 25.038848876953125, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8711596727371216, + "num_tokens": 251024347.0, + "step": 6578 + }, + { + "epoch": 0.836916422846966, + "ewc_loss": 0.03425459936261177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7127300452557392e-05, + "grad_norm": 24.976181030273438, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8817644119262695, + "num_tokens": 251068527.0, + "step": 6579 + }, + { + "epoch": 0.8370436331255565, + "ewc_loss": 0.034237802028656006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7118900359491818e-05, + "grad_norm": 24.948715209960938, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8648860454559326, + "num_tokens": 251109914.0, + "step": 6580 + }, + { + "epoch": 0.837170843404147, + "ewc_loss": 0.03427008166909218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.713504025246948e-05, + "grad_norm": 24.983116149902344, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8521419763565063, + "num_tokens": 251144760.0, + "step": 6581 + }, + { + "epoch": 
0.8372980536827376, + "ewc_loss": 0.034313201904296875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715660073386971e-05, + "grad_norm": 25.010391235351562, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8575345277786255, + "num_tokens": 251184456.0, + "step": 6582 + }, + { + "epoch": 0.8374252639613281, + "ewc_loss": 0.03432188183069229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.716094084258657e-05, + "grad_norm": 25.062055587768555, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8517333269119263, + "num_tokens": 251229493.0, + "step": 6583 + }, + { + "epoch": 0.8375524742399186, + "ewc_loss": 0.03433584049344063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7167920304927975e-05, + "grad_norm": 25.017824172973633, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8650027513504028, + "num_tokens": 251270886.0, + "step": 6584 + }, + { + "epoch": 0.837679684518509, + "ewc_loss": 0.03422711044549942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7113554349634796e-05, + "grad_norm": 24.997447967529297, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8682785034179688, + "num_tokens": 251304833.0, + "step": 6585 + }, + { + "epoch": 0.8378068947970996, + "ewc_loss": 0.034294117242097855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7147058315458708e-05, + "grad_norm": 25.027408599853516, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8626983165740967, + "num_tokens": 251334196.0, + "step": 6586 + }, + { + "epoch": 0.8379341050756901, + "ewc_loss": 0.03426675125956535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7133375877165236e-05, + "grad_norm": 25.173559188842773, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8486863374710083, + "num_tokens": 251380278.0, + "step": 6587 + }, + { + "epoch": 0.8380613153542806, + "ewc_loss": 0.034299395978450775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7149697669083253e-05, + "grad_norm": 25.04667854309082, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8536235094070435, + "num_tokens": 251418432.0, + "step": 6588 + }, + { + "epoch": 0.8381885256328712, + "ewc_loss": 0.034251585602760315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7125792510341853e-05, + "grad_norm": 25.062814712524414, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8579877614974976, + "num_tokens": 251453747.0, + "step": 6589 + }, + { + "epoch": 0.8383157359114617, + "ewc_loss": 0.034301191568374634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7150596249848604e-05, + "grad_norm": 25.050464630126953, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8589566946029663, + "num_tokens": 251492439.0, + "step": 6590 + }, + { + "epoch": 0.8384429461900521, + "ewc_loss": 0.034242045134305954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7121023120125756e-05, + "grad_norm": 25.093318939208984, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8609170913696289, + "num_tokens": 251527945.0, + "step": 6591 + }, + { + "epoch": 0.8385701564686426, + "ewc_loss": 0.034281257539987564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7140628187917173e-05, + "grad_norm": 25.09880828857422, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8678902983665466, + "num_tokens": 251567161.0, + "step": 6592 + }, + { + "epoch": 0.8386973667472332, + "ewc_loss": 0.03423422947525978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7117114111897536e-05, + "grad_norm": 24.977359771728516, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8596013188362122, + "num_tokens": 251601189.0, + "step": 6593 + }, + { + "epoch": 0.8388245770258237, + "ewc_loss": 0.03435945883393288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7179729184135795e-05, + "grad_norm": 25.264726638793945, + "learning_rate": 1e-06, + "loss": 0.4701, + 
"mean_token_accuracy": 0.8578936457633972, + "num_tokens": 251630846.0, + "step": 6594 + }, + { + "epoch": 0.8389517873044142, + "ewc_loss": 0.034289486706256866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7144742741947994e-05, + "grad_norm": 24.951082229614258, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8676756620407104, + "num_tokens": 251665701.0, + "step": 6595 + }, + { + "epoch": 0.8390789975830047, + "ewc_loss": 0.03425915166735649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7129576008301228e-05, + "grad_norm": 25.164852142333984, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8520592451095581, + "num_tokens": 251699909.0, + "step": 6596 + }, + { + "epoch": 0.8392062078615952, + "ewc_loss": 0.034410662949085236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7205331459990703e-05, + "grad_norm": 25.084917068481445, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8581130504608154, + "num_tokens": 251734052.0, + "step": 6597 + }, + { + "epoch": 0.8393334181401857, + "ewc_loss": 0.03426191955804825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7130960259237327e-05, + "grad_norm": 25.002042770385742, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8480395078659058, + "num_tokens": 251772518.0, + "step": 6598 + }, + { + "epoch": 0.8394606284187762, + "ewc_loss": 0.03435709327459335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717854684102349e-05, + "grad_norm": 25.111587524414062, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8574755787849426, + "num_tokens": 251816378.0, + "step": 6599 + }, + { + "epoch": 0.8395878386973668, + "ewc_loss": 0.03437649831175804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7188249330502003e-05, + "grad_norm": 25.111173629760742, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8602578639984131, + "num_tokens": 251851264.0, + "step": 6600 + }, + { + "epoch": 
0.8397150489759573, + "ewc_loss": 0.0344185046851635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7209253201144747e-05, + "grad_norm": 25.14332389831543, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8616518378257751, + "num_tokens": 251888906.0, + "step": 6601 + }, + { + "epoch": 0.8398422592545478, + "ewc_loss": 0.03441714122891426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7208571080118418e-05, + "grad_norm": 25.111968994140625, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.866687536239624, + "num_tokens": 251930641.0, + "step": 6602 + }, + { + "epoch": 0.8399694695331382, + "ewc_loss": 0.03434807434678078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.71740375662921e-05, + "grad_norm": 25.019752502441406, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8409005403518677, + "num_tokens": 251968929.0, + "step": 6603 + }, + { + "epoch": 0.8400966798117288, + "ewc_loss": 0.034434642642736435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7217320419149473e-05, + "grad_norm": 25.158756256103516, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8510985374450684, + "num_tokens": 252006751.0, + "step": 6604 + }, + { + "epoch": 0.8402238900903193, + "ewc_loss": 0.034371014684438705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718550811347086e-05, + "grad_norm": 25.1152286529541, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.85591059923172, + "num_tokens": 252045861.0, + "step": 6605 + }, + { + "epoch": 0.8403511003689098, + "ewc_loss": 0.03439124673604965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.719562351354398e-05, + "grad_norm": 25.041656494140625, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8530535697937012, + "num_tokens": 252084412.0, + "step": 6606 + }, + { + "epoch": 0.8404783106475003, + "ewc_loss": 0.03439944609999657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.719972351565957e-05, 
+ "grad_norm": 25.209665298461914, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8638723492622375, + "num_tokens": 252120576.0, + "step": 6607 + }, + { + "epoch": 0.8406055209260909, + "ewc_loss": 0.034377869218587875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718893508950714e-05, + "grad_norm": 25.053438186645508, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8576480150222778, + "num_tokens": 252161636.0, + "step": 6608 + }, + { + "epoch": 0.8407327312046813, + "ewc_loss": 0.03431682661175728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715841244731564e-05, + "grad_norm": 25.044872283935547, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8596504330635071, + "num_tokens": 252199669.0, + "step": 6609 + }, + { + "epoch": 0.8408599414832718, + "ewc_loss": 0.03444289788603783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7221449525095522e-05, + "grad_norm": 25.17291259765625, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8673425912857056, + "num_tokens": 252236832.0, + "step": 6610 + }, + { + "epoch": 0.8409871517618623, + "ewc_loss": 0.034434862434864044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.721743137750309e-05, + "grad_norm": 25.07151985168457, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8557182550430298, + "num_tokens": 252282518.0, + "step": 6611 + }, + { + "epoch": 0.8411143620404529, + "ewc_loss": 0.03441507741808891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7207537894137204e-05, + "grad_norm": 25.214752197265625, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.852440595626831, + "num_tokens": 252316479.0, + "step": 6612 + }, + { + "epoch": 0.8412415723190434, + "ewc_loss": 0.03436417132616043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7182084775413387e-05, + "grad_norm": 25.11003303527832, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 
0.8536515235900879, + "num_tokens": 252353789.0, + "step": 6613 + }, + { + "epoch": 0.8413687825976339, + "ewc_loss": 0.03431040048599243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715520011202898e-05, + "grad_norm": 25.156192779541016, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8632581830024719, + "num_tokens": 252391609.0, + "step": 6614 + }, + { + "epoch": 0.8414959928762245, + "ewc_loss": 0.03438882157206535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7194410247611813e-05, + "grad_norm": 25.205461502075195, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.871917188167572, + "num_tokens": 252433514.0, + "step": 6615 + }, + { + "epoch": 0.8416232031548149, + "ewc_loss": 0.034338708966970444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.716935366857797e-05, + "grad_norm": 25.090824127197266, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8635547161102295, + "num_tokens": 252469700.0, + "step": 6616 + }, + { + "epoch": 0.8417504134334054, + "ewc_loss": 0.034333132207393646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7166566976811737e-05, + "grad_norm": 25.218761444091797, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8567032814025879, + "num_tokens": 252501658.0, + "step": 6617 + }, + { + "epoch": 0.8418776237119959, + "ewc_loss": 0.034234948456287384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7117474271799438e-05, + "grad_norm": 25.110227584838867, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8617140650749207, + "num_tokens": 252534546.0, + "step": 6618 + }, + { + "epoch": 0.8420048339905865, + "ewc_loss": 0.03431446850299835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7157233742182143e-05, + "grad_norm": 25.0403995513916, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8671959042549133, + "num_tokens": 252573172.0, + "step": 6619 + }, + { + "epoch": 0.842132044269177, + "ewc_loss": 
0.034318987280130386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7159492927021347e-05, + "grad_norm": 25.134464263916016, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8686480522155762, + "num_tokens": 252607258.0, + "step": 6620 + }, + { + "epoch": 0.8422592545477675, + "ewc_loss": 0.03440200909972191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7201004084199667e-05, + "grad_norm": 25.116559982299805, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8704047799110413, + "num_tokens": 252651451.0, + "step": 6621 + }, + { + "epoch": 0.8423864648263579, + "ewc_loss": 0.03430821746587753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715410871838685e-05, + "grad_norm": 25.097904205322266, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8721312284469604, + "num_tokens": 252686348.0, + "step": 6622 + }, + { + "epoch": 0.8425136751049485, + "ewc_loss": 0.034292444586753845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7146221580333076e-05, + "grad_norm": 25.08853530883789, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8545501828193665, + "num_tokens": 252724316.0, + "step": 6623 + }, + { + "epoch": 0.842640885383539, + "ewc_loss": 0.034344881772994995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7172440493595786e-05, + "grad_norm": 25.184921264648438, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8488993644714355, + "num_tokens": 252764612.0, + "step": 6624 + }, + { + "epoch": 0.8427680956621295, + "ewc_loss": 0.03429168462753296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714584323053714e-05, + "grad_norm": 25.050376892089844, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8383026719093323, + "num_tokens": 252809510.0, + "step": 6625 + }, + { + "epoch": 0.84289530594072, + "ewc_loss": 0.034261904656887054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7130952983279712e-05, + "grad_norm": 
25.074434280395508, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8568183779716492, + "num_tokens": 252847598.0, + "step": 6626 + }, + { + "epoch": 0.8430225162193106, + "ewc_loss": 0.03433402255177498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7167010810226202e-05, + "grad_norm": 25.182910919189453, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.845737874507904, + "num_tokens": 252884401.0, + "step": 6627 + }, + { + "epoch": 0.843149726497901, + "ewc_loss": 0.034382615238428116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7191307051689364e-05, + "grad_norm": 25.22865104675293, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8440850973129272, + "num_tokens": 252928348.0, + "step": 6628 + }, + { + "epoch": 0.8432769367764915, + "ewc_loss": 0.03428211435675621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714105746941641e-05, + "grad_norm": 25.088327407836914, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8585089445114136, + "num_tokens": 252965084.0, + "step": 6629 + }, + { + "epoch": 0.843404147055082, + "ewc_loss": 0.03433864936232567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7169324564747512e-05, + "grad_norm": 25.14735984802246, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8636846542358398, + "num_tokens": 253003697.0, + "step": 6630 + }, + { + "epoch": 0.8435313573336726, + "ewc_loss": 0.03439376875758171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7196884073200636e-05, + "grad_norm": 25.126304626464844, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8460677862167358, + "num_tokens": 253037239.0, + "step": 6631 + }, + { + "epoch": 0.8436585676122631, + "ewc_loss": 0.034293804317712784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7146901882370003e-05, + "grad_norm": 25.119096755981445, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8503630757331848, + 
"num_tokens": 253078163.0, + "step": 6632 + }, + { + "epoch": 0.8437857778908536, + "ewc_loss": 0.03433855250477791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7169275452033617e-05, + "grad_norm": 25.042158126831055, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.85733562707901, + "num_tokens": 253118907.0, + "step": 6633 + }, + { + "epoch": 0.843912988169444, + "ewc_loss": 0.034297481179237366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7148740880656987e-05, + "grad_norm": 25.129684448242188, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8777937293052673, + "num_tokens": 253153132.0, + "step": 6634 + }, + { + "epoch": 0.8440401984480346, + "ewc_loss": 0.03434227034449577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717113445920404e-05, + "grad_norm": 25.035200119018555, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8556187748908997, + "num_tokens": 253194446.0, + "step": 6635 + }, + { + "epoch": 0.8441674087266251, + "ewc_loss": 0.03427673876285553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.713836900307797e-05, + "grad_norm": 25.114864349365234, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8624994158744812, + "num_tokens": 253230798.0, + "step": 6636 + }, + { + "epoch": 0.8442946190052156, + "ewc_loss": 0.034326061606407166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7163030861411244e-05, + "grad_norm": 25.063804626464844, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8520749807357788, + "num_tokens": 253269029.0, + "step": 6637 + }, + { + "epoch": 0.8444218292838062, + "ewc_loss": 0.03435640409588814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7178201233036816e-05, + "grad_norm": 25.160926818847656, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8553379774093628, + "num_tokens": 253310656.0, + "step": 6638 + }, + { + "epoch": 0.8445490395623967, + "ewc_loss": 0.03433246538043022, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7166232282761484e-05, + "grad_norm": 25.065990447998047, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8772352933883667, + "num_tokens": 253346882.0, + "step": 6639 + }, + { + "epoch": 0.8446762498409871, + "ewc_loss": 0.03430456668138504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715228245302569e-05, + "grad_norm": 25.183940887451172, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.849822998046875, + "num_tokens": 253389601.0, + "step": 6640 + }, + { + "epoch": 0.8448034601195776, + "ewc_loss": 0.03434355929493904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.71717802004423e-05, + "grad_norm": 25.137813568115234, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.858177661895752, + "num_tokens": 253428725.0, + "step": 6641 + }, + { + "epoch": 0.8449306703981682, + "ewc_loss": 0.03426419198513031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.713209530862514e-05, + "grad_norm": 25.09786033630371, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8727433085441589, + "num_tokens": 253461404.0, + "step": 6642 + }, + { + "epoch": 0.8450578806767587, + "ewc_loss": 0.034308064728975296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7154032320831902e-05, + "grad_norm": 25.079986572265625, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8636042475700378, + "num_tokens": 253500324.0, + "step": 6643 + }, + { + "epoch": 0.8451850909553492, + "ewc_loss": 0.03426245599985123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7131227650679648e-05, + "grad_norm": 25.044288635253906, + "learning_rate": 1e-06, + "loss": 0.4876, + "mean_token_accuracy": 0.8462815284729004, + "num_tokens": 253542500.0, + "step": 6644 + }, + { + "epoch": 0.8453123012339397, + "ewc_loss": 0.034349698573350906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717484883556608e-05, + "grad_norm": 25.158241271972656, + 
"learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8418159484863281, + "num_tokens": 253581887.0, + "step": 6645 + }, + { + "epoch": 0.8454395115125302, + "ewc_loss": 0.03431395813822746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7156979083665647e-05, + "grad_norm": 25.143144607543945, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8679991960525513, + "num_tokens": 253616942.0, + "step": 6646 + }, + { + "epoch": 0.8455667217911207, + "ewc_loss": 0.03430153429508209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7150767234852538e-05, + "grad_norm": 25.0414981842041, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8533598780632019, + "num_tokens": 253652610.0, + "step": 6647 + }, + { + "epoch": 0.8456939320697112, + "ewc_loss": 0.03426138311624527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7130691048805602e-05, + "grad_norm": 25.15764808654785, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8564233779907227, + "num_tokens": 253686269.0, + "step": 6648 + }, + { + "epoch": 0.8458211423483017, + "ewc_loss": 0.03440980613231659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7204902178491466e-05, + "grad_norm": 25.128629684448242, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8536816835403442, + "num_tokens": 253723632.0, + "step": 6649 + }, + { + "epoch": 0.8459483526268923, + "ewc_loss": 0.034252461045980453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7126230886788107e-05, + "grad_norm": 25.052532196044922, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8525668382644653, + "num_tokens": 253755990.0, + "step": 6650 + }, + { + "epoch": 0.8460755629054828, + "ewc_loss": 0.03438684344291687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7193422536365688e-05, + "grad_norm": 25.122766494750977, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8537393808364868, + "num_tokens": 253794200.0, + 
"step": 6651 + }, + { + "epoch": 0.8462027731840732, + "ewc_loss": 0.034392841160297394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.719642023090273e-05, + "grad_norm": 25.06687355041504, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8702678680419922, + "num_tokens": 253840554.0, + "step": 6652 + }, + { + "epoch": 0.8463299834626637, + "ewc_loss": 0.0343439020216465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7171951185446233e-05, + "grad_norm": 25.129562377929688, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8688607811927795, + "num_tokens": 253873642.0, + "step": 6653 + }, + { + "epoch": 0.8464571937412543, + "ewc_loss": 0.03439134731888771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7195674445247278e-05, + "grad_norm": 25.23918914794922, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8657569885253906, + "num_tokens": 253911839.0, + "step": 6654 + }, + { + "epoch": 0.8465844040198448, + "ewc_loss": 0.03436462581157684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718231214908883e-05, + "grad_norm": 25.103500366210938, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8491134643554688, + "num_tokens": 253944649.0, + "step": 6655 + }, + { + "epoch": 0.8467116142984353, + "ewc_loss": 0.034388113766908646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7194057363667525e-05, + "grad_norm": 36.34089660644531, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8586092591285706, + "num_tokens": 253977739.0, + "step": 6656 + }, + { + "epoch": 0.8468388245770259, + "ewc_loss": 0.040627673268318176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0313837012508884e-05, + "grad_norm": 26.67533302307129, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8459228873252869, + "num_tokens": 254021216.0, + "step": 6657 + }, + { + "epoch": 0.8469660348556163, + "ewc_loss": 0.033796172589063644, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.6898085959837772e-05, + "grad_norm": 23.594820022583008, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8560665249824524, + "num_tokens": 254063209.0, + "step": 6658 + }, + { + "epoch": 0.8470932451342068, + "ewc_loss": 0.03818381577730179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.909190723381471e-05, + "grad_norm": 25.81363296508789, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8542973399162292, + "num_tokens": 254102325.0, + "step": 6659 + }, + { + "epoch": 0.8472204554127973, + "ewc_loss": 0.0381774827837944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.908874219225254e-05, + "grad_norm": 25.048004150390625, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8645471334457397, + "num_tokens": 254136991.0, + "step": 6660 + }, + { + "epoch": 0.8473476656913879, + "ewc_loss": 0.038195449858903885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.909772436192725e-05, + "grad_norm": 25.23066520690918, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8588077425956726, + "num_tokens": 254179520.0, + "step": 6661 + }, + { + "epoch": 0.8474748759699784, + "ewc_loss": 0.038893550634384155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9446775695541874e-05, + "grad_norm": 25.22175407409668, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8590277433395386, + "num_tokens": 254220661.0, + "step": 6662 + }, + { + "epoch": 0.8476020862485689, + "ewc_loss": 0.03891613706946373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9458067981759086e-05, + "grad_norm": 25.220277786254883, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8662698864936829, + "num_tokens": 254261853.0, + "step": 6663 + }, + { + "epoch": 0.8477292965271594, + "ewc_loss": 0.0393415130674839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9670756955747493e-05, + "grad_norm": 25.290525436401367, + "learning_rate": 1e-06, + "loss": 
0.4521, + "mean_token_accuracy": 0.8598551154136658, + "num_tokens": 254301577.0, + "step": 6664 + }, + { + "epoch": 0.8478565068057499, + "ewc_loss": 0.03922535106539726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.961267480510287e-05, + "grad_norm": 25.38434600830078, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8721754550933838, + "num_tokens": 254341792.0, + "step": 6665 + }, + { + "epoch": 0.8479837170843404, + "ewc_loss": 0.03920472413301468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.960236295417417e-05, + "grad_norm": 25.392465591430664, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8473148345947266, + "num_tokens": 254376755.0, + "step": 6666 + }, + { + "epoch": 0.8481109273629309, + "ewc_loss": 0.038964029401540756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9482014977256767e-05, + "grad_norm": 25.316650390625, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8707828521728516, + "num_tokens": 254413565.0, + "step": 6667 + }, + { + "epoch": 0.8482381376415215, + "ewc_loss": 0.03877902403473854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9389512090128846e-05, + "grad_norm": 25.3927001953125, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8484776020050049, + "num_tokens": 254449889.0, + "step": 6668 + }, + { + "epoch": 0.848365347920112, + "ewc_loss": 0.038598403334617615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9299201085232198e-05, + "grad_norm": 25.376752853393555, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8726056814193726, + "num_tokens": 254488129.0, + "step": 6669 + }, + { + "epoch": 0.8484925581987025, + "ewc_loss": 0.03836452215909958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9182261894457042e-05, + "grad_norm": 25.471065521240234, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8702316880226135, + "num_tokens": 254522031.0, + "step": 6670 + }, + { + "epoch": 
0.8486197684772929, + "ewc_loss": 0.03808596730232239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.904298369481694e-05, + "grad_norm": 25.396610260009766, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8705152273178101, + "num_tokens": 254559249.0, + "step": 6671 + }, + { + "epoch": 0.8487469787558835, + "ewc_loss": 0.03778477758169174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8892389562097378e-05, + "grad_norm": 25.454734802246094, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8531277179718018, + "num_tokens": 254603968.0, + "step": 6672 + }, + { + "epoch": 0.848874189034474, + "ewc_loss": 0.037589240819215775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8794620700646192e-05, + "grad_norm": 25.365583419799805, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8691886067390442, + "num_tokens": 254642178.0, + "step": 6673 + }, + { + "epoch": 0.8490013993130645, + "ewc_loss": 0.03727599233388901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8637996618053876e-05, + "grad_norm": 25.290847778320312, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.851823091506958, + "num_tokens": 254679129.0, + "step": 6674 + }, + { + "epoch": 0.849128609591655, + "ewc_loss": 0.037096697837114334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8548349544289522e-05, + "grad_norm": 25.528276443481445, + "learning_rate": 1e-06, + "loss": 0.5571, + "mean_token_accuracy": 0.8303138017654419, + "num_tokens": 254714952.0, + "step": 6675 + }, + { + "epoch": 0.8492558198702456, + "ewc_loss": 0.036884959787130356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8442480723024346e-05, + "grad_norm": 25.27696418762207, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8549996614456177, + "num_tokens": 254743323.0, + "step": 6676 + }, + { + "epoch": 0.849383030148836, + "ewc_loss": 0.036587346345186234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8293672837899067e-05, + "grad_norm": 25.511857986450195, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8649480938911438, + "num_tokens": 254784749.0, + "step": 6677 + }, + { + "epoch": 0.8495102404274265, + "ewc_loss": 0.03652956336736679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8264781829202548e-05, + "grad_norm": 25.3309268951416, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8655697107315063, + "num_tokens": 254820102.0, + "step": 6678 + }, + { + "epoch": 0.849637450706017, + "ewc_loss": 0.03621617332100868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8108086806023493e-05, + "grad_norm": 25.539045333862305, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8555415868759155, + "num_tokens": 254859988.0, + "step": 6679 + }, + { + "epoch": 0.8497646609846076, + "ewc_loss": 0.03610197827219963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.805098872864619e-05, + "grad_norm": 25.437549591064453, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8555419445037842, + "num_tokens": 254906792.0, + "step": 6680 + }, + { + "epoch": 0.8498918712631981, + "ewc_loss": 0.03582122176885605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.791061004041694e-05, + "grad_norm": 25.35368537902832, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8620174527168274, + "num_tokens": 254941552.0, + "step": 6681 + }, + { + "epoch": 0.8500190815417886, + "ewc_loss": 0.03568205237388611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7841026419773698e-05, + "grad_norm": 25.315410614013672, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8512282371520996, + "num_tokens": 254974653.0, + "step": 6682 + }, + { + "epoch": 0.850146291820379, + "ewc_loss": 0.03557678684592247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.77883939613821e-05, + "grad_norm": 25.316116333007812, + "learning_rate": 1e-06, + "loss": 0.4925, + 
"mean_token_accuracy": 0.8438297510147095, + "num_tokens": 255018418.0, + "step": 6683 + }, + { + "epoch": 0.8502735020989696, + "ewc_loss": 0.03548005223274231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7740027033141814e-05, + "grad_norm": 25.458383560180664, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8643354177474976, + "num_tokens": 255049626.0, + "step": 6684 + }, + { + "epoch": 0.8504007123775601, + "ewc_loss": 0.03528902307152748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.764451189956162e-05, + "grad_norm": 25.256990432739258, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8526493310928345, + "num_tokens": 255086323.0, + "step": 6685 + }, + { + "epoch": 0.8505279226561506, + "ewc_loss": 0.03523056209087372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.761528073984664e-05, + "grad_norm": 25.327964782714844, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8680107593536377, + "num_tokens": 255124102.0, + "step": 6686 + }, + { + "epoch": 0.8506551329347412, + "ewc_loss": 0.03519872575998306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.759936276357621e-05, + "grad_norm": 25.42038917541504, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8633034825325012, + "num_tokens": 255162176.0, + "step": 6687 + }, + { + "epoch": 0.8507823432133317, + "ewc_loss": 0.03510085865855217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7550430129631422e-05, + "grad_norm": 25.29027557373047, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8626617789268494, + "num_tokens": 255196256.0, + "step": 6688 + }, + { + "epoch": 0.8509095534919221, + "ewc_loss": 0.03500576317310333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7502881746622734e-05, + "grad_norm": 25.435762405395508, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8519474267959595, + "num_tokens": 255236752.0, + "step": 6689 + }, + { + "epoch": 
0.8510367637705126, + "ewc_loss": 0.034995317459106445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7497659428045154e-05, + "grad_norm": 25.233198165893555, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8598471879959106, + "num_tokens": 255268694.0, + "step": 6690 + }, + { + "epoch": 0.8511639740491032, + "ewc_loss": 0.03485236316919327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.742618223943282e-05, + "grad_norm": 25.43428611755371, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8655787706375122, + "num_tokens": 255309445.0, + "step": 6691 + }, + { + "epoch": 0.8512911843276937, + "ewc_loss": 0.034931544214487076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7465772543800995e-05, + "grad_norm": 25.185382843017578, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8619720935821533, + "num_tokens": 255341126.0, + "step": 6692 + }, + { + "epoch": 0.8514183946062842, + "ewc_loss": 0.03469013795256615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7345069863949902e-05, + "grad_norm": 25.25945281982422, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.868253231048584, + "num_tokens": 255379553.0, + "step": 6693 + }, + { + "epoch": 0.8515456048848747, + "ewc_loss": 0.03490308299660683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7451540770707652e-05, + "grad_norm": 25.46357536315918, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8594756126403809, + "num_tokens": 255416588.0, + "step": 6694 + }, + { + "epoch": 0.8516728151634652, + "ewc_loss": 0.03464486077427864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.732243072183337e-05, + "grad_norm": 25.238325119018555, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8475116491317749, + "num_tokens": 255449574.0, + "step": 6695 + }, + { + "epoch": 0.8518000254420557, + "ewc_loss": 0.03467388078570366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7336940800305456e-05, + "grad_norm": 25.455036163330078, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8574800491333008, + "num_tokens": 255494088.0, + "step": 6696 + }, + { + "epoch": 0.8519272357206462, + "ewc_loss": 0.034689560532569885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.734478064463474e-05, + "grad_norm": 25.468843460083008, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8679282665252686, + "num_tokens": 255529785.0, + "step": 6697 + }, + { + "epoch": 0.8520544459992367, + "ewc_loss": 0.03456375002861023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7281874534091912e-05, + "grad_norm": 25.37420654296875, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8461273908615112, + "num_tokens": 255566675.0, + "step": 6698 + }, + { + "epoch": 0.8521816562778273, + "ewc_loss": 0.034547727555036545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7273863704758696e-05, + "grad_norm": 25.403745651245117, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8638713359832764, + "num_tokens": 255606502.0, + "step": 6699 + }, + { + "epoch": 0.8523088665564178, + "ewc_loss": 0.034515224397182465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7257612853427418e-05, + "grad_norm": 25.23868179321289, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8508472442626953, + "num_tokens": 255649179.0, + "step": 6700 + }, + { + "epoch": 0.8524360768350082, + "ewc_loss": 0.03442675620317459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.721337866911199e-05, + "grad_norm": 25.28032112121582, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8592227697372437, + "num_tokens": 255688190.0, + "step": 6701 + }, + { + "epoch": 0.8525632871135987, + "ewc_loss": 0.034515995532274246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.725799847918097e-05, + "grad_norm": 25.397016525268555, + "learning_rate": 1e-06, + "loss": 0.4594, + 
"mean_token_accuracy": 0.8588113784790039, + "num_tokens": 255730401.0, + "step": 6702 + }, + { + "epoch": 0.8526904973921893, + "ewc_loss": 0.034443825483322144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7221913367393427e-05, + "grad_norm": 25.263751983642578, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8547700047492981, + "num_tokens": 255767881.0, + "step": 6703 + }, + { + "epoch": 0.8528177076707798, + "ewc_loss": 0.034398969262838364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7199485228047706e-05, + "grad_norm": 25.23841094970703, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8526082634925842, + "num_tokens": 255808347.0, + "step": 6704 + }, + { + "epoch": 0.8529449179493703, + "ewc_loss": 0.03444444388151169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.722222259559203e-05, + "grad_norm": 25.39154624938965, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.840224027633667, + "num_tokens": 255849538.0, + "step": 6705 + }, + { + "epoch": 0.8530721282279609, + "ewc_loss": 0.034407325088977814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.720366344670765e-05, + "grad_norm": 25.32073402404785, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8707132339477539, + "num_tokens": 255890228.0, + "step": 6706 + }, + { + "epoch": 0.8531993385065513, + "ewc_loss": 0.03434637188911438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717318627925124e-05, + "grad_norm": 25.288665771484375, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.858604371547699, + "num_tokens": 255930189.0, + "step": 6707 + }, + { + "epoch": 0.8533265487851418, + "ewc_loss": 0.034365464001894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718273233564105e-05, + "grad_norm": 25.309139251708984, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.866226315498352, + "num_tokens": 255968310.0, + "step": 6708 + }, + { + "epoch": 
0.8534537590637323, + "ewc_loss": 0.03434642031788826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7173209926113486e-05, + "grad_norm": 25.416568756103516, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8703362941741943, + "num_tokens": 256005872.0, + "step": 6709 + }, + { + "epoch": 0.8535809693423229, + "ewc_loss": 0.03436535596847534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7182677765958942e-05, + "grad_norm": 25.252290725708008, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8642798066139221, + "num_tokens": 256045374.0, + "step": 6710 + }, + { + "epoch": 0.8537081796209134, + "ewc_loss": 0.03430062159895897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7150310668512248e-05, + "grad_norm": 25.283859252929688, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8619986176490784, + "num_tokens": 256084874.0, + "step": 6711 + }, + { + "epoch": 0.8538353898995039, + "ewc_loss": 0.03444152697920799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7220763766090386e-05, + "grad_norm": 25.339439392089844, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.852218747138977, + "num_tokens": 256122251.0, + "step": 6712 + }, + { + "epoch": 0.8539626001780944, + "ewc_loss": 0.03428637608885765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7143187506007962e-05, + "grad_norm": 25.241453170776367, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8573501110076904, + "num_tokens": 256165025.0, + "step": 6713 + }, + { + "epoch": 0.8540898104566849, + "ewc_loss": 0.03435773029923439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717886516416911e-05, + "grad_norm": 25.31488609313965, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8661366701126099, + "num_tokens": 256202220.0, + "step": 6714 + }, + { + "epoch": 0.8542170207352754, + "ewc_loss": 0.03429512307047844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.714756217552349e-05, + "grad_norm": 25.22499656677246, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8616631031036377, + "num_tokens": 256234036.0, + "step": 6715 + }, + { + "epoch": 0.8543442310138659, + "ewc_loss": 0.03429871425032616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714935751806479e-05, + "grad_norm": 25.244365692138672, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.858008861541748, + "num_tokens": 256275253.0, + "step": 6716 + }, + { + "epoch": 0.8544714412924564, + "ewc_loss": 0.03437542915344238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718771454761736e-05, + "grad_norm": 25.274219512939453, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8559291362762451, + "num_tokens": 256311296.0, + "step": 6717 + }, + { + "epoch": 0.854598651571047, + "ewc_loss": 0.03429480269551277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7147402104455978e-05, + "grad_norm": 25.219768524169922, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8656688928604126, + "num_tokens": 256348149.0, + "step": 6718 + }, + { + "epoch": 0.8547258618496375, + "ewc_loss": 0.034339502453804016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7169752027257346e-05, + "grad_norm": 25.337560653686523, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8729261159896851, + "num_tokens": 256387861.0, + "step": 6719 + }, + { + "epoch": 0.8548530721282279, + "ewc_loss": 0.034299712628126144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714985592116136e-05, + "grad_norm": 25.250349044799805, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8685004711151123, + "num_tokens": 256427131.0, + "step": 6720 + }, + { + "epoch": 0.8549802824068184, + "ewc_loss": 0.03433242440223694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7166212273878045e-05, + "grad_norm": 25.389894485473633, + "learning_rate": 1e-06, + "loss": 0.4231, + 
"mean_token_accuracy": 0.8650075197219849, + "num_tokens": 256465041.0, + "step": 6721 + }, + { + "epoch": 0.855107492685409, + "ewc_loss": 0.034298744052648544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7149372069980018e-05, + "grad_norm": 25.309085845947266, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8509655594825745, + "num_tokens": 256502576.0, + "step": 6722 + }, + { + "epoch": 0.8552347029639995, + "ewc_loss": 0.03423386812210083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7116934031946585e-05, + "grad_norm": 25.250259399414062, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8621277809143066, + "num_tokens": 256540493.0, + "step": 6723 + }, + { + "epoch": 0.85536191324259, + "ewc_loss": 0.03425334021449089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7126669263234362e-05, + "grad_norm": 25.233030319213867, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8435015678405762, + "num_tokens": 256578115.0, + "step": 6724 + }, + { + "epoch": 0.8554891235211806, + "ewc_loss": 0.034284286201000214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7142143406090327e-05, + "grad_norm": 25.222030639648438, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.867117702960968, + "num_tokens": 256615957.0, + "step": 6725 + }, + { + "epoch": 0.855616333799771, + "ewc_loss": 0.03430614620447159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7153073713416234e-05, + "grad_norm": 25.24147605895996, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8627533316612244, + "num_tokens": 256649972.0, + "step": 6726 + }, + { + "epoch": 0.8557435440783615, + "ewc_loss": 0.03433571383357048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.716785664029885e-05, + "grad_norm": 25.27971839904785, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8665193319320679, + "num_tokens": 256686683.0, + "step": 6727 + }, + { + "epoch": 
0.855870754356952, + "ewc_loss": 0.03433375060558319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7166874386020936e-05, + "grad_norm": 25.37037467956543, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.857823371887207, + "num_tokens": 256723587.0, + "step": 6728 + }, + { + "epoch": 0.8559979646355426, + "ewc_loss": 0.034302741289138794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7151371139334515e-05, + "grad_norm": 25.298786163330078, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8603948354721069, + "num_tokens": 256765245.0, + "step": 6729 + }, + { + "epoch": 0.8561251749141331, + "ewc_loss": 0.034307174384593964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7153586668428034e-05, + "grad_norm": 25.259681701660156, + "learning_rate": 1e-06, + "loss": 0.355, + "mean_token_accuracy": 0.8869261741638184, + "num_tokens": 256806930.0, + "step": 6730 + }, + { + "epoch": 0.8562523851927236, + "ewc_loss": 0.03434593975543976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7172969819512218e-05, + "grad_norm": 25.394397735595703, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8581904172897339, + "num_tokens": 256844568.0, + "step": 6731 + }, + { + "epoch": 0.856379595471314, + "ewc_loss": 0.03432401642203331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.716200858936645e-05, + "grad_norm": 25.435224533081055, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8579599857330322, + "num_tokens": 256877483.0, + "step": 6732 + }, + { + "epoch": 0.8565068057499046, + "ewc_loss": 0.034221306443214417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.711065306153614e-05, + "grad_norm": 25.177566528320312, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8646351099014282, + "num_tokens": 256921139.0, + "step": 6733 + }, + { + "epoch": 0.8566340160284951, + "ewc_loss": 0.03433974087238312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7169870261568576e-05, + "grad_norm": 25.438087463378906, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8612567186355591, + "num_tokens": 256957861.0, + "step": 6734 + }, + { + "epoch": 0.8567612263070856, + "ewc_loss": 0.03435283526778221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7176416804431938e-05, + "grad_norm": 25.381486892700195, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8609780073165894, + "num_tokens": 256998582.0, + "step": 6735 + }, + { + "epoch": 0.8568884365856761, + "ewc_loss": 0.03426738083362579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.713369056233205e-05, + "grad_norm": 25.221155166625977, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8597203493118286, + "num_tokens": 257037638.0, + "step": 6736 + }, + { + "epoch": 0.8570156468642667, + "ewc_loss": 0.03428934887051582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714467362035066e-05, + "grad_norm": 25.30371856689453, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8626632690429688, + "num_tokens": 257074427.0, + "step": 6737 + }, + { + "epoch": 0.8571428571428571, + "ewc_loss": 0.0343630276620388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7181513612740673e-05, + "grad_norm": 25.34287452697754, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8635135889053345, + "num_tokens": 257113826.0, + "step": 6738 + }, + { + "epoch": 0.8572700674214476, + "ewc_loss": 0.03434857726097107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.717428858682979e-05, + "grad_norm": 25.36683464050293, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8469977974891663, + "num_tokens": 257152387.0, + "step": 6739 + }, + { + "epoch": 0.8573972777000382, + "ewc_loss": 0.034307241439819336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.71536212292267e-05, + "grad_norm": 25.15015411376953, + "learning_rate": 1e-06, + "loss": 0.4459, + 
"mean_token_accuracy": 0.8609671592712402, + "num_tokens": 257192083.0, + "step": 6740 + }, + { + "epoch": 0.8575244879786287, + "ewc_loss": 0.034296147525310516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7148073311545886e-05, + "grad_norm": 25.291067123413086, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8520371317863464, + "num_tokens": 257235425.0, + "step": 6741 + }, + { + "epoch": 0.8576516982572192, + "ewc_loss": 0.034312665462493896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7156333342427388e-05, + "grad_norm": 25.295928955078125, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8564709424972534, + "num_tokens": 257268621.0, + "step": 6742 + }, + { + "epoch": 0.8577789085358097, + "ewc_loss": 0.0343322716653347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7166135876323096e-05, + "grad_norm": 25.31812858581543, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8652726411819458, + "num_tokens": 257306909.0, + "step": 6743 + }, + { + "epoch": 0.8579061188144002, + "ewc_loss": 0.03433942049741745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7169710190501064e-05, + "grad_norm": 25.332090377807617, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8740594983100891, + "num_tokens": 257343388.0, + "step": 6744 + }, + { + "epoch": 0.8580333290929907, + "ewc_loss": 0.034312207251787186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715610414976254e-05, + "grad_norm": 25.22038459777832, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8640554547309875, + "num_tokens": 257377172.0, + "step": 6745 + }, + { + "epoch": 0.8581605393715812, + "ewc_loss": 0.034316834062337875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.715841790428385e-05, + "grad_norm": 25.3941593170166, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8524730205535889, + "num_tokens": 257410312.0, + "step": 6746 + }, + { + "epoch": 
0.8582877496501717, + "ewc_loss": 0.03436990827322006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718495332170278e-05, + "grad_norm": 25.229713439941406, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8628314137458801, + "num_tokens": 257449919.0, + "step": 6747 + }, + { + "epoch": 0.8584149599287623, + "ewc_loss": 0.03431922197341919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7159611161332577e-05, + "grad_norm": 25.366933822631836, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8713903427124023, + "num_tokens": 257488296.0, + "step": 6748 + }, + { + "epoch": 0.8585421702073528, + "ewc_loss": 0.03442857414484024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.721428634482436e-05, + "grad_norm": 25.281103134155273, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8690879344940186, + "num_tokens": 257524678.0, + "step": 6749 + }, + { + "epoch": 0.8586693804859432, + "ewc_loss": 0.03428731858730316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.714365862426348e-05, + "grad_norm": 25.30422019958496, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8649337887763977, + "num_tokens": 257559060.0, + "step": 6750 + }, + { + "epoch": 0.8587965907645337, + "ewc_loss": 0.03443637490272522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7218188077094965e-05, + "grad_norm": 25.41143226623535, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8697872161865234, + "num_tokens": 257594340.0, + "step": 6751 + }, + { + "epoch": 0.8589238010431243, + "ewc_loss": 0.03436596319079399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7182981537189335e-05, + "grad_norm": 25.321331024169922, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8715633153915405, + "num_tokens": 257627936.0, + "step": 6752 + }, + { + "epoch": 0.8590510113217148, + "ewc_loss": 0.03436284139752388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7181420844281092e-05, + "grad_norm": 25.274333953857422, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8577855825424194, + "num_tokens": 257660311.0, + "step": 6753 + }, + { + "epoch": 0.8591782216003053, + "ewc_loss": 0.03438457101583481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.719228566798847e-05, + "grad_norm": 25.389726638793945, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.852752685546875, + "num_tokens": 257697496.0, + "step": 6754 + }, + { + "epoch": 0.8593054318788959, + "ewc_loss": 0.03441176936030388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.720588443276938e-05, + "grad_norm": 25.39763069152832, + "learning_rate": 1e-06, + "loss": 0.503, + "mean_token_accuracy": 0.8424341678619385, + "num_tokens": 257737242.0, + "step": 6755 + }, + { + "epoch": 0.8594326421574863, + "ewc_loss": 0.03436369076371193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718184466881212e-05, + "grad_norm": 25.144691467285156, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8617942333221436, + "num_tokens": 257772946.0, + "step": 6756 + }, + { + "epoch": 0.8595598524360768, + "ewc_loss": 0.03437839820981026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7189198842970654e-05, + "grad_norm": 25.336505889892578, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8555138111114502, + "num_tokens": 257807198.0, + "step": 6757 + }, + { + "epoch": 0.8596870627146673, + "ewc_loss": 0.03445599600672722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7227997886948287e-05, + "grad_norm": 25.228347778320312, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8654506802558899, + "num_tokens": 257848618.0, + "step": 6758 + }, + { + "epoch": 0.8598142729932579, + "ewc_loss": 0.034442514181137085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7221256712218747e-05, + "grad_norm": 25.330808639526367, + "learning_rate": 1e-06, + "loss": 0.3941, + 
"mean_token_accuracy": 0.8718401193618774, + "num_tokens": 257887805.0, + "step": 6759 + }, + { + "epoch": 0.8599414832718484, + "ewc_loss": 0.034505847841501236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7252923498745076e-05, + "grad_norm": 25.289569854736328, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8659416437149048, + "num_tokens": 257929276.0, + "step": 6760 + }, + { + "epoch": 0.8600686935504389, + "ewc_loss": 0.03444451093673706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7222255337401293e-05, + "grad_norm": 25.302249908447266, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8539273738861084, + "num_tokens": 257958629.0, + "step": 6761 + }, + { + "epoch": 0.8601959038290294, + "ewc_loss": 0.03449951112270355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.72497548192041e-05, + "grad_norm": 25.323171615600586, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.856480598449707, + "num_tokens": 257990303.0, + "step": 6762 + }, + { + "epoch": 0.8603231141076199, + "ewc_loss": 0.034466031938791275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.723301647871267e-05, + "grad_norm": 25.38401985168457, + "learning_rate": 1e-06, + "loss": 0.5069, + "mean_token_accuracy": 0.8445322513580322, + "num_tokens": 258028704.0, + "step": 6763 + }, + { + "epoch": 0.8604503243862104, + "ewc_loss": 0.034490108489990234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7245054550585337e-05, + "grad_norm": 25.331586837768555, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8688408136367798, + "num_tokens": 258062757.0, + "step": 6764 + }, + { + "epoch": 0.8605775346648009, + "ewc_loss": 0.03446909412741661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7234546248801053e-05, + "grad_norm": 25.30797576904297, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8597798943519592, + "num_tokens": 258098574.0, + "step": 6765 + }, + { + "epoch": 
0.8607047449433914, + "ewc_loss": 0.03450189530849457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7250948076252826e-05, + "grad_norm": 25.305532455444336, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8696944713592529, + "num_tokens": 258130937.0, + "step": 6766 + }, + { + "epoch": 0.860831955221982, + "ewc_loss": 0.034440428018569946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7220214431290515e-05, + "grad_norm": 25.343502044677734, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.862742006778717, + "num_tokens": 258165142.0, + "step": 6767 + }, + { + "epoch": 0.8609591655005725, + "ewc_loss": 0.03452698886394501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.726349364616908e-05, + "grad_norm": 25.276458740234375, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8775351643562317, + "num_tokens": 258203986.0, + "step": 6768 + }, + { + "epoch": 0.8610863757791629, + "ewc_loss": 0.03449874743819237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7249372831429355e-05, + "grad_norm": 25.292362213134766, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8604211211204529, + "num_tokens": 258238692.0, + "step": 6769 + }, + { + "epoch": 0.8612135860577534, + "ewc_loss": 0.0344427153468132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.722135675663594e-05, + "grad_norm": 25.25032615661621, + "learning_rate": 1e-06, + "loss": 0.5183, + "mean_token_accuracy": 0.8410179615020752, + "num_tokens": 258280942.0, + "step": 6770 + }, + { + "epoch": 0.861340796336344, + "ewc_loss": 0.034517545253038406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.725877336866688e-05, + "grad_norm": 25.226709365844727, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8648732900619507, + "num_tokens": 258318835.0, + "step": 6771 + }, + { + "epoch": 0.8614680066149345, + "ewc_loss": 0.034551169723272324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.727558446873445e-05, + "grad_norm": 25.34899139404297, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8508097529411316, + "num_tokens": 258359461.0, + "step": 6772 + }, + { + "epoch": 0.861595216893525, + "ewc_loss": 0.034551989287137985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7275995560339652e-05, + "grad_norm": 25.25016212463379, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8561725616455078, + "num_tokens": 258394582.0, + "step": 6773 + }, + { + "epoch": 0.8617224271721156, + "ewc_loss": 0.034555647522211075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7277823644690216e-05, + "grad_norm": 25.449331283569336, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8621766567230225, + "num_tokens": 258438526.0, + "step": 6774 + }, + { + "epoch": 0.861849637450706, + "ewc_loss": 0.03449611738324165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7248059521079995e-05, + "grad_norm": 25.29183006286621, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8478367328643799, + "num_tokens": 258479332.0, + "step": 6775 + }, + { + "epoch": 0.8619768477292965, + "ewc_loss": 0.03449138626456261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.724569301586598e-05, + "grad_norm": 25.4066162109375, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8514934778213501, + "num_tokens": 258513202.0, + "step": 6776 + }, + { + "epoch": 0.862104058007887, + "ewc_loss": 0.034525465220212936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.72627333085984e-05, + "grad_norm": 25.37995719909668, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8623582124710083, + "num_tokens": 258549145.0, + "step": 6777 + }, + { + "epoch": 0.8622312682864776, + "ewc_loss": 0.03444880247116089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7224401744897477e-05, + "grad_norm": 25.321617126464844, + "learning_rate": 1e-06, + "loss": 0.4705, + 
"mean_token_accuracy": 0.8514122366905212, + "num_tokens": 258586421.0, + "step": 6778 + }, + { + "epoch": 0.8623584785650681, + "ewc_loss": 0.034497711807489395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7248856238438748e-05, + "grad_norm": 25.435205459594727, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8598978519439697, + "num_tokens": 258620014.0, + "step": 6779 + }, + { + "epoch": 0.8624856888436586, + "ewc_loss": 0.03437209501862526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7186048353323713e-05, + "grad_norm": 25.34937286376953, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8662638664245605, + "num_tokens": 258656124.0, + "step": 6780 + }, + { + "epoch": 0.862612899122249, + "ewc_loss": 0.03445639833807945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.722819979477208e-05, + "grad_norm": 25.383533477783203, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8490874767303467, + "num_tokens": 258692537.0, + "step": 6781 + }, + { + "epoch": 0.8627401094008396, + "ewc_loss": 0.03445199131965637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7225995179614983e-05, + "grad_norm": 25.46443748474121, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8639477491378784, + "num_tokens": 258732737.0, + "step": 6782 + }, + { + "epoch": 0.8628673196794301, + "ewc_loss": 0.034470099955797195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7235050108865835e-05, + "grad_norm": 25.519514083862305, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8480951189994812, + "num_tokens": 258770634.0, + "step": 6783 + }, + { + "epoch": 0.8629945299580206, + "ewc_loss": 0.034364890307188034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718244493531529e-05, + "grad_norm": 25.45700454711914, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8606805801391602, + "num_tokens": 258809643.0, + "step": 6784 + }, + { + "epoch": 
0.8631217402366111, + "ewc_loss": 0.03441077843308449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7205389667651616e-05, + "grad_norm": 25.61141014099121, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.866342306137085, + "num_tokens": 258845336.0, + "step": 6785 + }, + { + "epoch": 0.8632489505152017, + "ewc_loss": 0.03439180925488472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7195905456901528e-05, + "grad_norm": 25.48169708251953, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8624356985092163, + "num_tokens": 258882206.0, + "step": 6786 + }, + { + "epoch": 0.8633761607937921, + "ewc_loss": 0.03425464779138565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.712732409941964e-05, + "grad_norm": 25.512596130371094, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8682966828346252, + "num_tokens": 258918661.0, + "step": 6787 + }, + { + "epoch": 0.8635033710723826, + "ewc_loss": 0.03440231829881668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.720115869829897e-05, + "grad_norm": 25.33726692199707, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8557276725769043, + "num_tokens": 258957452.0, + "step": 6788 + }, + { + "epoch": 0.8636305813509731, + "ewc_loss": 0.03426365926861763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.713182973617222e-05, + "grad_norm": 25.409460067749023, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8520050644874573, + "num_tokens": 259003507.0, + "step": 6789 + }, + { + "epoch": 0.8637577916295637, + "ewc_loss": 0.034365907311439514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.718295425234828e-05, + "grad_norm": 25.554399490356445, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8552019596099854, + "num_tokens": 259042284.0, + "step": 6790 + }, + { + "epoch": 0.8638850019081542, + "ewc_loss": 0.03432067483663559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7160336938104592e-05, + "grad_norm": 25.22356605529785, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8743586540222168, + "num_tokens": 259082929.0, + "step": 6791 + }, + { + "epoch": 0.8640122121867447, + "ewc_loss": 0.03438003361225128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.719001738820225e-05, + "grad_norm": 25.80927848815918, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.876059889793396, + "num_tokens": 259125920.0, + "step": 6792 + }, + { + "epoch": 0.8641394224653351, + "ewc_loss": 0.034421853721141815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.721092667139601e-05, + "grad_norm": 25.66332244873047, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8616066575050354, + "num_tokens": 259163140.0, + "step": 6793 + }, + { + "epoch": 0.8642666327439257, + "ewc_loss": 0.034107137471437454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7053569536074065e-05, + "grad_norm": 25.11527442932129, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8586563467979431, + "num_tokens": 259199038.0, + "step": 6794 + }, + { + "epoch": 0.8643938430225162, + "ewc_loss": 0.03423473984003067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7117370589403436e-05, + "grad_norm": 25.822355270385742, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8587215542793274, + "num_tokens": 259235713.0, + "step": 6795 + }, + { + "epoch": 0.8645210533011067, + "ewc_loss": 0.03438977897167206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7194888641824946e-05, + "grad_norm": 25.255964279174805, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8502953052520752, + "num_tokens": 259278314.0, + "step": 6796 + }, + { + "epoch": 0.8646482635796973, + "ewc_loss": 0.0341331772506237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7066588043235242e-05, + "grad_norm": 25.44843101501465, + "learning_rate": 1e-06, + "loss": 0.4213, + 
"mean_token_accuracy": 0.868833601474762, + "num_tokens": 259313039.0, + "step": 6797 + }, + { + "epoch": 0.8647754738582878, + "ewc_loss": 0.03435490280389786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7177451809402555e-05, + "grad_norm": 25.506662368774414, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8674492835998535, + "num_tokens": 259348845.0, + "step": 6798 + }, + { + "epoch": 0.8649026841368782, + "ewc_loss": 0.03418617695569992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7093088899855502e-05, + "grad_norm": 25.336666107177734, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8548489809036255, + "num_tokens": 259387345.0, + "step": 6799 + }, + { + "epoch": 0.8650298944154687, + "ewc_loss": 0.03428385406732559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7141926946351305e-05, + "grad_norm": 25.464929580688477, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8598823547363281, + "num_tokens": 259427835.0, + "step": 6800 + }, + { + "epoch": 0.8651571046940593, + "ewc_loss": 0.034318361431360245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7159180060843937e-05, + "grad_norm": 25.557266235351562, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8643707036972046, + "num_tokens": 259460453.0, + "step": 6801 + }, + { + "epoch": 0.8652843149726498, + "ewc_loss": 0.034222494810819626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.71112478710711e-05, + "grad_norm": 25.306550979614258, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8663269281387329, + "num_tokens": 259499296.0, + "step": 6802 + }, + { + "epoch": 0.8654115252512403, + "ewc_loss": 0.034250225871801376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7125112208304927e-05, + "grad_norm": 25.32621955871582, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8573336601257324, + "num_tokens": 259537649.0, + "step": 6803 + }, + { + "epoch": 
0.8655387355298308, + "ewc_loss": 0.03430283069610596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.71514147950802e-05, + "grad_norm": 25.228816986083984, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8523378968238831, + "num_tokens": 259583136.0, + "step": 6804 + }, + { + "epoch": 0.8656659458084213, + "ewc_loss": 0.03432362899184227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.716181395750027e-05, + "grad_norm": 25.57122802734375, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8645657300949097, + "num_tokens": 259620422.0, + "step": 6805 + }, + { + "epoch": 0.8657931560870118, + "ewc_loss": 0.03439515456557274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7197577108163387e-05, + "grad_norm": 25.26226234436035, + "learning_rate": 1e-06, + "loss": 0.4942, + "mean_token_accuracy": 0.8482316732406616, + "num_tokens": 259663948.0, + "step": 6806 + }, + { + "epoch": 0.8659203663656023, + "ewc_loss": 0.03428531438112259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7142656361102127e-05, + "grad_norm": 25.34438133239746, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8616941571235657, + "num_tokens": 259701986.0, + "step": 6807 + }, + { + "epoch": 0.8660475766441929, + "ewc_loss": 0.03444303572177887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7221518646692857e-05, + "grad_norm": 25.423982620239258, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8753668069839478, + "num_tokens": 259740684.0, + "step": 6808 + }, + { + "epoch": 0.8661747869227834, + "ewc_loss": 0.03435009345412254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7175047105411068e-05, + "grad_norm": 25.230344772338867, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8447249531745911, + "num_tokens": 259781659.0, + "step": 6809 + }, + { + "epoch": 0.8663019972013739, + "ewc_loss": 0.034420568495988846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7210284568136558e-05, + "grad_norm": 25.327173233032227, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8541188836097717, + "num_tokens": 259818554.0, + "step": 6810 + }, + { + "epoch": 0.8664292074799644, + "ewc_loss": 0.034450728446245193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7225363990291953e-05, + "grad_norm": 25.326810836791992, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8627923727035522, + "num_tokens": 259856652.0, + "step": 6811 + }, + { + "epoch": 0.8665564177585549, + "ewc_loss": 0.03437337279319763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7186686818604358e-05, + "grad_norm": 25.357881546020508, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8785187005996704, + "num_tokens": 259898255.0, + "step": 6812 + }, + { + "epoch": 0.8666836280371454, + "ewc_loss": 0.03441393002867699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7206964912475087e-05, + "grad_norm": 25.263349533081055, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8653106689453125, + "num_tokens": 259935305.0, + "step": 6813 + }, + { + "epoch": 0.8668108383157359, + "ewc_loss": 0.034421082586050034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7210541045642458e-05, + "grad_norm": 25.465179443359375, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8639108538627625, + "num_tokens": 259975245.0, + "step": 6814 + }, + { + "epoch": 0.8669380485943264, + "ewc_loss": 0.03442682698369026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7213413229910657e-05, + "grad_norm": 25.268882751464844, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8658283948898315, + "num_tokens": 260014150.0, + "step": 6815 + }, + { + "epoch": 0.867065258872917, + "ewc_loss": 0.03441907837986946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7209538782481104e-05, + "grad_norm": 25.40785026550293, + "learning_rate": 1e-06, + "loss": 0.479, + 
"mean_token_accuracy": 0.8510240316390991, + "num_tokens": 260054655.0, + "step": 6816 + }, + { + "epoch": 0.8671924691515075, + "ewc_loss": 0.034442201256752014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.722110027913004e-05, + "grad_norm": 25.35284423828125, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8606696724891663, + "num_tokens": 260091528.0, + "step": 6817 + }, + { + "epoch": 0.8673196794300979, + "ewc_loss": 0.03440447151660919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7202235540025868e-05, + "grad_norm": 25.394060134887695, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8750754594802856, + "num_tokens": 260123069.0, + "step": 6818 + }, + { + "epoch": 0.8674468897086884, + "ewc_loss": 0.03443290665745735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7216452761203982e-05, + "grad_norm": 25.536876678466797, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.845780611038208, + "num_tokens": 260154482.0, + "step": 6819 + }, + { + "epoch": 0.867574099987279, + "ewc_loss": 0.03441028669476509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7205144104082137e-05, + "grad_norm": 25.26639747619629, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8627843260765076, + "num_tokens": 260193983.0, + "step": 6820 + }, + { + "epoch": 0.8677013102658695, + "ewc_loss": 0.0344049371778965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.720246837066952e-05, + "grad_norm": 25.52259635925293, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8630840182304382, + "num_tokens": 260228705.0, + "step": 6821 + }, + { + "epoch": 0.86782852054446, + "ewc_loss": 0.034531038254499435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.726552000036463e-05, + "grad_norm": 25.333873748779297, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8594935536384583, + "num_tokens": 260262821.0, + "step": 6822 + }, + { + "epoch": 
0.8679557308230506, + "ewc_loss": 0.03440086171030998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.720043110253755e-05, + "grad_norm": 25.4502010345459, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8590885400772095, + "num_tokens": 260298949.0, + "step": 6823 + }, + { + "epoch": 0.868082941101641, + "ewc_loss": 0.03449518606066704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7247593859792687e-05, + "grad_norm": 25.201204299926758, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8683913946151733, + "num_tokens": 260340028.0, + "step": 6824 + }, + { + "epoch": 0.8682101513802315, + "ewc_loss": 0.0344446562230587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7222328096977435e-05, + "grad_norm": 25.405099868774414, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8599305152893066, + "num_tokens": 260381328.0, + "step": 6825 + }, + { + "epoch": 0.868337361658822, + "ewc_loss": 0.03459157049655914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.729578616505023e-05, + "grad_norm": 25.323320388793945, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8610762357711792, + "num_tokens": 260423421.0, + "step": 6826 + }, + { + "epoch": 0.8684645719374126, + "ewc_loss": 0.03449133411049843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7245667550014332e-05, + "grad_norm": 25.350542068481445, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8589153289794922, + "num_tokens": 260461985.0, + "step": 6827 + }, + { + "epoch": 0.8685917822160031, + "ewc_loss": 0.03458692505955696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7293463315581903e-05, + "grad_norm": 25.486879348754883, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8447215557098389, + "num_tokens": 260503041.0, + "step": 6828 + }, + { + "epoch": 0.8687189924945936, + "ewc_loss": 0.0344865620136261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.724328103591688e-05, + "grad_norm": 25.324745178222656, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8685202598571777, + "num_tokens": 260545553.0, + "step": 6829 + }, + { + "epoch": 0.868846202773184, + "ewc_loss": 0.034470606595277786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7235302948392928e-05, + "grad_norm": 25.322669982910156, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8678048849105835, + "num_tokens": 260577160.0, + "step": 6830 + }, + { + "epoch": 0.8689734130517746, + "ewc_loss": 0.034533899277448654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.726694972603582e-05, + "grad_norm": 25.4665470123291, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.872589111328125, + "num_tokens": 260614748.0, + "step": 6831 + }, + { + "epoch": 0.8691006233303651, + "ewc_loss": 0.034513797610998154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7256897990591824e-05, + "grad_norm": 25.293622970581055, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.861823320388794, + "num_tokens": 260651762.0, + "step": 6832 + }, + { + "epoch": 0.8692278336089556, + "ewc_loss": 0.034514494240283966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7257247236557305e-05, + "grad_norm": 25.39067268371582, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8486168384552002, + "num_tokens": 260694438.0, + "step": 6833 + }, + { + "epoch": 0.8693550438875461, + "ewc_loss": 0.03455295041203499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7276475773542188e-05, + "grad_norm": 25.481409072875977, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8691206574440002, + "num_tokens": 260730256.0, + "step": 6834 + }, + { + "epoch": 0.8694822541661367, + "ewc_loss": 0.03446114435791969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7230571756954305e-05, + "grad_norm": 25.32256507873535, + "learning_rate": 1e-06, + "loss": 0.5134, + 
"mean_token_accuracy": 0.8400630950927734, + "num_tokens": 260766190.0, + "step": 6835 + }, + { + "epoch": 0.8696094644447271, + "ewc_loss": 0.03446436673402786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7232183381565847e-05, + "grad_norm": 25.335512161254883, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8752413988113403, + "num_tokens": 260801733.0, + "step": 6836 + }, + { + "epoch": 0.8697366747233176, + "ewc_loss": 0.034539997577667236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.726999835227616e-05, + "grad_norm": 25.387800216674805, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8715049624443054, + "num_tokens": 260843461.0, + "step": 6837 + }, + { + "epoch": 0.8698638850019081, + "ewc_loss": 0.03451511263847351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7257556464755908e-05, + "grad_norm": 25.331348419189453, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8589270114898682, + "num_tokens": 260875006.0, + "step": 6838 + }, + { + "epoch": 0.8699910952804987, + "ewc_loss": 0.034461427479982376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.723071363812778e-05, + "grad_norm": 25.326128005981445, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.854864776134491, + "num_tokens": 260917553.0, + "step": 6839 + }, + { + "epoch": 0.8701183055590892, + "ewc_loss": 0.034568529576063156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.728426468616817e-05, + "grad_norm": 25.40546226501465, + "learning_rate": 1e-06, + "loss": 0.5201, + "mean_token_accuracy": 0.8364182710647583, + "num_tokens": 260955293.0, + "step": 6840 + }, + { + "epoch": 0.8702455158376797, + "ewc_loss": 0.03454055264592171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7270276657654904e-05, + "grad_norm": 25.381235122680664, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8599894046783447, + "num_tokens": 260983966.0, + "step": 6841 + }, + { + "epoch": 
0.8703727261162701, + "ewc_loss": 0.03454700484871864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.727350172586739e-05, + "grad_norm": 25.2401123046875, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8733016848564148, + "num_tokens": 261019434.0, + "step": 6842 + }, + { + "epoch": 0.8704999363948607, + "ewc_loss": 0.034548960626125336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7274480342166498e-05, + "grad_norm": 25.457740783691406, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8492320775985718, + "num_tokens": 261053761.0, + "step": 6843 + }, + { + "epoch": 0.8706271466734512, + "ewc_loss": 0.0346679762005806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.73339885805035e-05, + "grad_norm": 25.39710807800293, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8508374094963074, + "num_tokens": 261093490.0, + "step": 6844 + }, + { + "epoch": 0.8707543569520417, + "ewc_loss": 0.03453167900443077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7265840142499655e-05, + "grad_norm": 25.311328887939453, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8616336584091187, + "num_tokens": 261129518.0, + "step": 6845 + }, + { + "epoch": 0.8708815672306323, + "ewc_loss": 0.03465920686721802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7329602997051552e-05, + "grad_norm": 25.40103530883789, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.843552827835083, + "num_tokens": 261164282.0, + "step": 6846 + }, + { + "epoch": 0.8710087775092228, + "ewc_loss": 0.03459791839122772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7298958482570015e-05, + "grad_norm": 25.32413673400879, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.8400388956069946, + "num_tokens": 261205055.0, + "step": 6847 + }, + { + "epoch": 0.8711359877878132, + "ewc_loss": 0.034591346979141235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.729567338770721e-05, 
+ "grad_norm": 25.295244216918945, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8542452454566956, + "num_tokens": 261242329.0, + "step": 6848 + }, + { + "epoch": 0.8712631980664037, + "ewc_loss": 0.03463906794786453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7319533071713522e-05, + "grad_norm": 25.35765266418457, + "learning_rate": 1e-06, + "loss": 0.5379, + "mean_token_accuracy": 0.8341000080108643, + "num_tokens": 261280330.0, + "step": 6849 + }, + { + "epoch": 0.8713904083449943, + "ewc_loss": 0.03463312238454819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7316560843028128e-05, + "grad_norm": 25.256690979003906, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8714958429336548, + "num_tokens": 261318232.0, + "step": 6850 + }, + { + "epoch": 0.8715176186235848, + "ewc_loss": 0.03462095558643341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.731047814246267e-05, + "grad_norm": 25.43869972229004, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8770929574966431, + "num_tokens": 261351483.0, + "step": 6851 + }, + { + "epoch": 0.8716448289021753, + "ewc_loss": 0.03471526503562927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.735763180477079e-05, + "grad_norm": 25.334169387817383, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8631772994995117, + "num_tokens": 261387058.0, + "step": 6852 + }, + { + "epoch": 0.8717720391807658, + "ewc_loss": 0.03460497781634331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7302489141002297e-05, + "grad_norm": 25.34182357788086, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8762564659118652, + "num_tokens": 261430110.0, + "step": 6853 + }, + { + "epoch": 0.8718992494593563, + "ewc_loss": 0.034702450037002563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.73512253240915e-05, + "grad_norm": 25.342456817626953, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 
0.8679556846618652, + "num_tokens": 261470707.0, + "step": 6854 + }, + { + "epoch": 0.8720264597379468, + "ewc_loss": 0.03466026484966278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7330132322967984e-05, + "grad_norm": 25.42186737060547, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8675986528396606, + "num_tokens": 261507146.0, + "step": 6855 + }, + { + "epoch": 0.8721536700165373, + "ewc_loss": 0.034657906740903854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7328953617834486e-05, + "grad_norm": 25.360042572021484, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8458341360092163, + "num_tokens": 261549723.0, + "step": 6856 + }, + { + "epoch": 0.8722808802951278, + "ewc_loss": 0.03462350368499756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7311751435045153e-05, + "grad_norm": 25.29474449157715, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.866037905216217, + "num_tokens": 261591030.0, + "step": 6857 + }, + { + "epoch": 0.8724080905737184, + "ewc_loss": 0.03461506962776184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7307535017607734e-05, + "grad_norm": 25.27285385131836, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8704398274421692, + "num_tokens": 261623919.0, + "step": 6858 + }, + { + "epoch": 0.8725353008523089, + "ewc_loss": 0.034738149493932724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7369075067108497e-05, + "grad_norm": 25.433908462524414, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8767508864402771, + "num_tokens": 261655277.0, + "step": 6859 + }, + { + "epoch": 0.8726625111308994, + "ewc_loss": 0.03468484804034233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7342423234367743e-05, + "grad_norm": 25.41967010498047, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8485409021377563, + "num_tokens": 261697462.0, + "step": 6860 + }, + { + "epoch": 0.8727897214094898, + "ewc_loss": 
0.034534748643636703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7267373550566845e-05, + "grad_norm": 25.25611686706543, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8577374815940857, + "num_tokens": 261739134.0, + "step": 6861 + }, + { + "epoch": 0.8729169316880804, + "ewc_loss": 0.03467070683836937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.733535282255616e-05, + "grad_norm": 25.323911666870117, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8635387420654297, + "num_tokens": 261776295.0, + "step": 6862 + }, + { + "epoch": 0.8730441419666709, + "ewc_loss": 0.03464100509881973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7320502593065612e-05, + "grad_norm": 25.348119735717773, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8646754026412964, + "num_tokens": 261812388.0, + "step": 6863 + }, + { + "epoch": 0.8731713522452614, + "ewc_loss": 0.03460180386900902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7300901163253002e-05, + "grad_norm": 25.363967895507812, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8624756932258606, + "num_tokens": 261855537.0, + "step": 6864 + }, + { + "epoch": 0.873298562523852, + "ewc_loss": 0.034542836248874664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.727141898300033e-05, + "grad_norm": 25.1256160736084, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8687615990638733, + "num_tokens": 261899922.0, + "step": 6865 + }, + { + "epoch": 0.8734257728024425, + "ewc_loss": 0.03473114222288132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7365571693517268e-05, + "grad_norm": 25.577117919921875, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8731639385223389, + "num_tokens": 261934475.0, + "step": 6866 + }, + { + "epoch": 0.8735529830810329, + "ewc_loss": 0.0346909761428833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7345488231512718e-05, + "grad_norm": 
25.185508728027344, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8617146015167236, + "num_tokens": 261973395.0, + "step": 6867 + }, + { + "epoch": 0.8736801933596234, + "ewc_loss": 0.03453559800982475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7267799194087274e-05, + "grad_norm": 25.36511993408203, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8444517850875854, + "num_tokens": 262011644.0, + "step": 6868 + }, + { + "epoch": 0.873807403638214, + "ewc_loss": 0.03468097373843193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7340486010652967e-05, + "grad_norm": 25.2902889251709, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.856521725654602, + "num_tokens": 262044038.0, + "step": 6869 + }, + { + "epoch": 0.8739346139168045, + "ewc_loss": 0.034604355692863464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.730217809381429e-05, + "grad_norm": 25.29020118713379, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.859959602355957, + "num_tokens": 262081023.0, + "step": 6870 + }, + { + "epoch": 0.874061824195395, + "ewc_loss": 0.03463972359895706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7319862308795564e-05, + "grad_norm": 25.310009002685547, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8541141748428345, + "num_tokens": 262123981.0, + "step": 6871 + }, + { + "epoch": 0.8741890344739855, + "ewc_loss": 0.03464985638856888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.732492819428444e-05, + "grad_norm": 25.321550369262695, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8517665863037109, + "num_tokens": 262161542.0, + "step": 6872 + }, + { + "epoch": 0.874316244752576, + "ewc_loss": 0.03473825380206108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.73691278178012e-05, + "grad_norm": 25.31471061706543, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.850715160369873, + "num_tokens": 
262202482.0, + "step": 6873 + }, + { + "epoch": 0.8744434550311665, + "ewc_loss": 0.03464696928858757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7323483916698024e-05, + "grad_norm": 25.38017463684082, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8627678751945496, + "num_tokens": 262239499.0, + "step": 6874 + }, + { + "epoch": 0.874570665309757, + "ewc_loss": 0.03473624587059021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7368123735650443e-05, + "grad_norm": 25.33403778076172, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8614279627799988, + "num_tokens": 262281324.0, + "step": 6875 + }, + { + "epoch": 0.8746978755883476, + "ewc_loss": 0.034678731113672256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7339365513180383e-05, + "grad_norm": 25.39971160888672, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8525130748748779, + "num_tokens": 262318772.0, + "step": 6876 + }, + { + "epoch": 0.8748250858669381, + "ewc_loss": 0.03472238406538963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.736119156703353e-05, + "grad_norm": 25.35914421081543, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8729497790336609, + "num_tokens": 262353264.0, + "step": 6877 + }, + { + "epoch": 0.8749522961455286, + "ewc_loss": 0.03471171855926514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7355860109091736e-05, + "grad_norm": 25.38768768310547, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8581843376159668, + "num_tokens": 262393360.0, + "step": 6878 + }, + { + "epoch": 0.875079506424119, + "ewc_loss": 0.03466632589697838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.733316275931429e-05, + "grad_norm": 25.278501510620117, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8713159561157227, + "num_tokens": 262432010.0, + "step": 6879 + }, + { + "epoch": 0.8752067167027096, + "ewc_loss": 0.03466283157467842, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 1.7331416529486887e-05, + "grad_norm": 25.329252243041992, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8604068160057068, + "num_tokens": 262468828.0, + "step": 6880 + }, + { + "epoch": 0.8753339269813001, + "ewc_loss": 0.03468892350792885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7344462321489118e-05, + "grad_norm": 25.315086364746094, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8683195114135742, + "num_tokens": 262515141.0, + "step": 6881 + }, + { + "epoch": 0.8754611372598906, + "ewc_loss": 0.03473411872982979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7367059626849368e-05, + "grad_norm": 25.36685562133789, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8577227592468262, + "num_tokens": 262553137.0, + "step": 6882 + }, + { + "epoch": 0.8755883475384811, + "ewc_loss": 0.03467639535665512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7338197721983306e-05, + "grad_norm": 25.274999618530273, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8831585645675659, + "num_tokens": 262593847.0, + "step": 6883 + }, + { + "epoch": 0.8757155578170717, + "ewc_loss": 0.034724436700344086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.736221747705713e-05, + "grad_norm": 25.468761444091797, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8500180244445801, + "num_tokens": 262635048.0, + "step": 6884 + }, + { + "epoch": 0.8758427680956621, + "ewc_loss": 0.0347367599606514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7368380213156343e-05, + "grad_norm": 25.31316375732422, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8523739576339722, + "num_tokens": 262671200.0, + "step": 6885 + }, + { + "epoch": 0.8759699783742526, + "ewc_loss": 0.034701842814683914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.735092155286111e-05, + "grad_norm": 25.461444854736328, + "learning_rate": 1e-06, + 
"loss": 0.4322, + "mean_token_accuracy": 0.8629496097564697, + "num_tokens": 262708304.0, + "step": 6886 + }, + { + "epoch": 0.8760971886528431, + "ewc_loss": 0.03471462428569794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7357311662635766e-05, + "grad_norm": 25.352663040161133, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8581063747406006, + "num_tokens": 262750680.0, + "step": 6887 + }, + { + "epoch": 0.8762243989314337, + "ewc_loss": 0.03471720591187477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7358603145112284e-05, + "grad_norm": 25.491910934448242, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8559558391571045, + "num_tokens": 262784845.0, + "step": 6888 + }, + { + "epoch": 0.8763516092100242, + "ewc_loss": 0.03471221402287483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.735610749165062e-05, + "grad_norm": 25.440868377685547, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8608385324478149, + "num_tokens": 262822646.0, + "step": 6889 + }, + { + "epoch": 0.8764788194886147, + "ewc_loss": 0.03463783860206604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7318920072284527e-05, + "grad_norm": 25.306427001953125, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8574550151824951, + "num_tokens": 262863663.0, + "step": 6890 + }, + { + "epoch": 0.8766060297672051, + "ewc_loss": 0.03465300053358078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7326499801129103e-05, + "grad_norm": 25.33935546875, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.847864031791687, + "num_tokens": 262903726.0, + "step": 6891 + }, + { + "epoch": 0.8767332400457957, + "ewc_loss": 0.03469392657279968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7346963431918994e-05, + "grad_norm": 25.401397705078125, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8497208952903748, + "num_tokens": 262942748.0, + "step": 6892 + }, + { + 
"epoch": 0.8768604503243862, + "ewc_loss": 0.0347292385995388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.736461854306981e-05, + "grad_norm": 25.350156784057617, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8401473760604858, + "num_tokens": 262977639.0, + "step": 6893 + }, + { + "epoch": 0.8769876606029767, + "ewc_loss": 0.034679606556892395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7339803889626637e-05, + "grad_norm": 25.391357421875, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8634117841720581, + "num_tokens": 263015970.0, + "step": 6894 + }, + { + "epoch": 0.8771148708815673, + "ewc_loss": 0.0347631461918354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.738157334330026e-05, + "grad_norm": 25.31011962890625, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8703032732009888, + "num_tokens": 263058618.0, + "step": 6895 + }, + { + "epoch": 0.8772420811601578, + "ewc_loss": 0.03470493480563164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7352467693854123e-05, + "grad_norm": 25.335100173950195, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8439221382141113, + "num_tokens": 263098628.0, + "step": 6896 + }, + { + "epoch": 0.8773692914387482, + "ewc_loss": 0.034775782376527786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7387890693498775e-05, + "grad_norm": 25.467830657958984, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8635789752006531, + "num_tokens": 263136793.0, + "step": 6897 + }, + { + "epoch": 0.8774965017173387, + "ewc_loss": 0.034728482365608215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7364242012263276e-05, + "grad_norm": 25.29265785217285, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8644381761550903, + "num_tokens": 263170996.0, + "step": 6898 + }, + { + "epoch": 0.8776237119959293, + "ewc_loss": 0.03469667583703995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7348338587908074e-05, + "grad_norm": 25.33803367614746, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8520240187644958, + "num_tokens": 263199961.0, + "step": 6899 + }, + { + "epoch": 0.8777509222745198, + "ewc_loss": 0.03476674109697342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7383370504830964e-05, + "grad_norm": 25.292831420898438, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8588613867759705, + "num_tokens": 263241541.0, + "step": 6900 + }, + { + "epoch": 0.8778781325531103, + "ewc_loss": 0.034734439104795456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.736721969791688e-05, + "grad_norm": 25.431358337402344, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8736162185668945, + "num_tokens": 263278778.0, + "step": 6901 + }, + { + "epoch": 0.8780053428317008, + "ewc_loss": 0.034776460379362106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.738823084451724e-05, + "grad_norm": 25.34300994873047, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8640187978744507, + "num_tokens": 263310155.0, + "step": 6902 + }, + { + "epoch": 0.8781325531102913, + "ewc_loss": 0.034701090306043625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7350545022054575e-05, + "grad_norm": 25.377674102783203, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8566147089004517, + "num_tokens": 263355793.0, + "step": 6903 + }, + { + "epoch": 0.8782597633888818, + "ewc_loss": 0.03473857790231705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7369289707858115e-05, + "grad_norm": 25.37552261352539, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8709614276885986, + "num_tokens": 263394446.0, + "step": 6904 + }, + { + "epoch": 0.8783869736674723, + "ewc_loss": 0.03472564369440079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7362821381539106e-05, + "grad_norm": 25.305442810058594, + "learning_rate": 1e-06, + "loss": 0.4424, + 
"mean_token_accuracy": 0.8642861843109131, + "num_tokens": 263438479.0, + "step": 6905 + }, + { + "epoch": 0.8785141839460628, + "ewc_loss": 0.03476418927311897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7382095393259078e-05, + "grad_norm": 25.26862335205078, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8463174104690552, + "num_tokens": 263478514.0, + "step": 6906 + }, + { + "epoch": 0.8786413942246534, + "ewc_loss": 0.0348140113055706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.740700645314064e-05, + "grad_norm": 25.349123001098633, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8408492803573608, + "num_tokens": 263516734.0, + "step": 6907 + }, + { + "epoch": 0.8787686045032439, + "ewc_loss": 0.03470798209309578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.735399018798489e-05, + "grad_norm": 25.271175384521484, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8513653874397278, + "num_tokens": 263560105.0, + "step": 6908 + }, + { + "epoch": 0.8788958147818343, + "ewc_loss": 0.03480402007699013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7402009689249098e-05, + "grad_norm": 25.360612869262695, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8634462356567383, + "num_tokens": 263597033.0, + "step": 6909 + }, + { + "epoch": 0.8790230250604248, + "ewc_loss": 0.034861527383327484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.743076427374035e-05, + "grad_norm": 25.40666389465332, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8674431443214417, + "num_tokens": 263631106.0, + "step": 6910 + }, + { + "epoch": 0.8791502353390154, + "ewc_loss": 0.034765731543302536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7382866644766182e-05, + "grad_norm": 25.220582962036133, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8545503616333008, + "num_tokens": 263668227.0, + "step": 6911 + }, + { + "epoch": 
0.8792774456176059, + "ewc_loss": 0.03474334999918938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7371674402966164e-05, + "grad_norm": 25.336366653442383, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.862067461013794, + "num_tokens": 263704335.0, + "step": 6912 + }, + { + "epoch": 0.8794046558961964, + "ewc_loss": 0.03480972349643707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7404861864633858e-05, + "grad_norm": 25.402267456054688, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8668112754821777, + "num_tokens": 263742532.0, + "step": 6913 + }, + { + "epoch": 0.879531866174787, + "ewc_loss": 0.03482288494706154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7411442968295887e-05, + "grad_norm": 25.338687896728516, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8617234230041504, + "num_tokens": 263782321.0, + "step": 6914 + }, + { + "epoch": 0.8796590764533775, + "ewc_loss": 0.0348086841404438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7404341633664444e-05, + "grad_norm": 25.444137573242188, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8687477111816406, + "num_tokens": 263818037.0, + "step": 6915 + }, + { + "epoch": 0.8797862867319679, + "ewc_loss": 0.03482172265648842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7410860891686752e-05, + "grad_norm": 25.357467651367188, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.846531867980957, + "num_tokens": 263854901.0, + "step": 6916 + }, + { + "epoch": 0.8799134970105584, + "ewc_loss": 0.03481973707675934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7409867723472416e-05, + "grad_norm": 25.45219612121582, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8656994104385376, + "num_tokens": 263896241.0, + "step": 6917 + }, + { + "epoch": 0.880040707289149, + "ewc_loss": 0.034821510314941406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7410755390301347e-05, + "grad_norm": 25.3316707611084, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8625754117965698, + "num_tokens": 263934673.0, + "step": 6918 + }, + { + "epoch": 0.8801679175677395, + "ewc_loss": 0.034785348922014236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.73926746356301e-05, + "grad_norm": 25.411561965942383, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8519889116287231, + "num_tokens": 263973456.0, + "step": 6919 + }, + { + "epoch": 0.88029512784633, + "ewc_loss": 0.034856900572776794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.742845051921904e-05, + "grad_norm": 25.416034698486328, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.843647837638855, + "num_tokens": 264006050.0, + "step": 6920 + }, + { + "epoch": 0.8804223381249205, + "ewc_loss": 0.03475179523229599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7375898096361198e-05, + "grad_norm": 25.408409118652344, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8709036111831665, + "num_tokens": 264042736.0, + "step": 6921 + }, + { + "epoch": 0.880549548403511, + "ewc_loss": 0.034811172634363174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.740558582241647e-05, + "grad_norm": 25.388774871826172, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8511513471603394, + "num_tokens": 264084433.0, + "step": 6922 + }, + { + "epoch": 0.8806767586821015, + "ewc_loss": 0.034788016229867935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.73940079548629e-05, + "grad_norm": 25.34859275817871, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8687931895256042, + "num_tokens": 264118331.0, + "step": 6923 + }, + { + "epoch": 0.880803968960692, + "ewc_loss": 0.034790001809597015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7395001123077236e-05, + "grad_norm": 25.42698860168457, + "learning_rate": 1e-06, + "loss": 0.4385, + 
"mean_token_accuracy": 0.8633937835693359, + "num_tokens": 264152015.0, + "step": 6924 + }, + { + "epoch": 0.8809311792392825, + "ewc_loss": 0.034835994243621826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7417996787116863e-05, + "grad_norm": 25.30031967163086, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8673295378684998, + "num_tokens": 264187085.0, + "step": 6925 + }, + { + "epoch": 0.8810583895178731, + "ewc_loss": 0.03478594869375229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7392974768881686e-05, + "grad_norm": 25.42365837097168, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8580443859100342, + "num_tokens": 264225558.0, + "step": 6926 + }, + { + "epoch": 0.8811855997964636, + "ewc_loss": 0.03485140576958656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7425702026230283e-05, + "grad_norm": 25.378812789916992, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8515082597732544, + "num_tokens": 264264220.0, + "step": 6927 + }, + { + "epoch": 0.881312810075054, + "ewc_loss": 0.034807585179805756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7403792298864573e-05, + "grad_norm": 25.36086654663086, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8642024397850037, + "num_tokens": 264302097.0, + "step": 6928 + }, + { + "epoch": 0.8814400203536445, + "ewc_loss": 0.03485066816210747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7425334590370767e-05, + "grad_norm": 25.37908935546875, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8631623387336731, + "num_tokens": 264341966.0, + "step": 6929 + }, + { + "epoch": 0.8815672306322351, + "ewc_loss": 0.03480314835906029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.740157495078165e-05, + "grad_norm": 25.444379806518555, + "learning_rate": 1e-06, + "loss": 0.5324, + "mean_token_accuracy": 0.8374783396720886, + "num_tokens": 264384298.0, + "step": 6930 + }, + { + "epoch": 
0.8816944409108256, + "ewc_loss": 0.03486260399222374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.74313026946038e-05, + "grad_norm": 25.34638023376465, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8609420657157898, + "num_tokens": 264422513.0, + "step": 6931 + }, + { + "epoch": 0.8818216511894161, + "ewc_loss": 0.034782931208610535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7391465007676743e-05, + "grad_norm": 25.442832946777344, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8737573623657227, + "num_tokens": 264461819.0, + "step": 6932 + }, + { + "epoch": 0.8819488614680067, + "ewc_loss": 0.034866753965616226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7433376342523843e-05, + "grad_norm": 25.360490798950195, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8505129218101501, + "num_tokens": 264502618.0, + "step": 6933 + }, + { + "epoch": 0.8820760717465971, + "ewc_loss": 0.03476547449827194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.738273749651853e-05, + "grad_norm": 25.36209487915039, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.854285717010498, + "num_tokens": 264534382.0, + "step": 6934 + }, + { + "epoch": 0.8822032820251876, + "ewc_loss": 0.03487035632133484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7435178961022757e-05, + "grad_norm": 25.395565032958984, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8498719334602356, + "num_tokens": 264575297.0, + "step": 6935 + }, + { + "epoch": 0.8823304923037781, + "ewc_loss": 0.034794945269823074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7397473129676655e-05, + "grad_norm": 25.446584701538086, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8663578629493713, + "num_tokens": 264611822.0, + "step": 6936 + }, + { + "epoch": 0.8824577025823687, + "ewc_loss": 0.03475483879446983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7377418771502562e-05, + "grad_norm": 25.313350677490234, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8563448786735535, + "num_tokens": 264657483.0, + "step": 6937 + }, + { + "epoch": 0.8825849128609592, + "ewc_loss": 0.03479841724038124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7399208445567638e-05, + "grad_norm": 25.470111846923828, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8656891584396362, + "num_tokens": 264690633.0, + "step": 6938 + }, + { + "epoch": 0.8827121231395497, + "ewc_loss": 0.034826092422008514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.741304549796041e-05, + "grad_norm": 25.37203598022461, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8748878836631775, + "num_tokens": 264733268.0, + "step": 6939 + }, + { + "epoch": 0.8828393334181401, + "ewc_loss": 0.03475799784064293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7378999473294243e-05, + "grad_norm": 25.50070571899414, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8534342646598816, + "num_tokens": 264774769.0, + "step": 6940 + }, + { + "epoch": 0.8829665436967307, + "ewc_loss": 0.03476778790354729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7383894373779185e-05, + "grad_norm": 25.33171844482422, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8567805290222168, + "num_tokens": 264806658.0, + "step": 6941 + }, + { + "epoch": 0.8830937539753212, + "ewc_loss": 0.0347660630941391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7383032172801904e-05, + "grad_norm": 25.42584991455078, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8476579785346985, + "num_tokens": 264841054.0, + "step": 6942 + }, + { + "epoch": 0.8832209642539117, + "ewc_loss": 0.03483039513230324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7415197362424806e-05, + "grad_norm": 25.435827255249023, + "learning_rate": 1e-06, + "loss": 0.4656, + 
"mean_token_accuracy": 0.8547784686088562, + "num_tokens": 264880241.0, + "step": 6943 + }, + { + "epoch": 0.8833481745325023, + "ewc_loss": 0.03477957099676132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.738978608045727e-05, + "grad_norm": 25.394433975219727, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8619425296783447, + "num_tokens": 264922449.0, + "step": 6944 + }, + { + "epoch": 0.8834753848110928, + "ewc_loss": 0.03481375798583031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.740687912388239e-05, + "grad_norm": 25.370899200439453, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8603147268295288, + "num_tokens": 264961605.0, + "step": 6945 + }, + { + "epoch": 0.8836025950896832, + "ewc_loss": 0.03480764478445053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7403823221684434e-05, + "grad_norm": 25.338085174560547, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8688347339630127, + "num_tokens": 264992924.0, + "step": 6946 + }, + { + "epoch": 0.8837298053682737, + "ewc_loss": 0.03487365320324898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.743682696542237e-05, + "grad_norm": 25.490121841430664, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8605057001113892, + "num_tokens": 265031849.0, + "step": 6947 + }, + { + "epoch": 0.8838570156468643, + "ewc_loss": 0.034874700009822845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.743735083437059e-05, + "grad_norm": 25.326950073242188, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8730347752571106, + "num_tokens": 265069714.0, + "step": 6948 + }, + { + "epoch": 0.8839842259254548, + "ewc_loss": 0.034795962274074554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7397980627720244e-05, + "grad_norm": 25.43389129638672, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8556567430496216, + "num_tokens": 265103325.0, + "step": 6949 + }, + { + "epoch": 
0.8841114362040453, + "ewc_loss": 0.034891705960035324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7445852790842764e-05, + "grad_norm": 25.426828384399414, + "learning_rate": 1e-06, + "loss": 0.5091, + "mean_token_accuracy": 0.8388287425041199, + "num_tokens": 265139133.0, + "step": 6950 + }, + { + "epoch": 0.8842386464826358, + "ewc_loss": 0.03487754613161087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7438773284084164e-05, + "grad_norm": 25.430252075195312, + "learning_rate": 1e-06, + "loss": 0.5258, + "mean_token_accuracy": 0.8358725905418396, + "num_tokens": 265183536.0, + "step": 6951 + }, + { + "epoch": 0.8843658567612263, + "ewc_loss": 0.034872617572546005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.743630855344236e-05, + "grad_norm": 25.353364944458008, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8518766760826111, + "num_tokens": 265226288.0, + "step": 6952 + }, + { + "epoch": 0.8844930670398168, + "ewc_loss": 0.03489766642451286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7448834114475176e-05, + "grad_norm": 25.570964813232422, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8592775464057922, + "num_tokens": 265265846.0, + "step": 6953 + }, + { + "epoch": 0.8846202773184073, + "ewc_loss": 0.034880198538303375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.744009932735935e-05, + "grad_norm": 25.307512283325195, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8736722469329834, + "num_tokens": 265300612.0, + "step": 6954 + }, + { + "epoch": 0.8847474875969978, + "ewc_loss": 0.03484511747956276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7422558812540956e-05, + "grad_norm": 25.473953247070312, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8701465725898743, + "num_tokens": 265341440.0, + "step": 6955 + }, + { + "epoch": 0.8848746978755884, + "ewc_loss": 0.034956321120262146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7478159861639142e-05, + "grad_norm": 25.541038513183594, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8502703905105591, + "num_tokens": 265384854.0, + "step": 6956 + }, + { + "epoch": 0.8850019081541789, + "ewc_loss": 0.03484393656253815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7421967640984803e-05, + "grad_norm": 25.499866485595703, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8537238240242004, + "num_tokens": 265420611.0, + "step": 6957 + }, + { + "epoch": 0.8851291184327693, + "ewc_loss": 0.03488987684249878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.744493783917278e-05, + "grad_norm": 25.516847610473633, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8591635227203369, + "num_tokens": 265457720.0, + "step": 6958 + }, + { + "epoch": 0.8852563287113598, + "ewc_loss": 0.03479158133268356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7395790564478375e-05, + "grad_norm": 25.449506759643555, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8691403865814209, + "num_tokens": 265493434.0, + "step": 6959 + }, + { + "epoch": 0.8853835389899504, + "ewc_loss": 0.03481375053524971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7406875485903583e-05, + "grad_norm": 25.56410789489746, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8619394302368164, + "num_tokens": 265529015.0, + "step": 6960 + }, + { + "epoch": 0.8855107492685409, + "ewc_loss": 0.03474530950188637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7372654838254675e-05, + "grad_norm": 25.41377067565918, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8449788093566895, + "num_tokens": 265568698.0, + "step": 6961 + }, + { + "epoch": 0.8856379595471314, + "ewc_loss": 0.03474506735801697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7372532965964638e-05, + "grad_norm": 25.3625545501709, + "learning_rate": 1e-06, + "loss": 0.4533, + 
"mean_token_accuracy": 0.8609280586242676, + "num_tokens": 265605631.0, + "step": 6962 + }, + { + "epoch": 0.885765169825722, + "ewc_loss": 0.034828029572963715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.74140150193125e-05, + "grad_norm": 25.49190330505371, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8616746068000793, + "num_tokens": 265647071.0, + "step": 6963 + }, + { + "epoch": 0.8858923801043125, + "ewc_loss": 0.03478076681494713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7390382708981633e-05, + "grad_norm": 25.426109313964844, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8594663143157959, + "num_tokens": 265688680.0, + "step": 6964 + }, + { + "epoch": 0.8860195903829029, + "ewc_loss": 0.034771405160427094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.738570244924631e-05, + "grad_norm": 25.49043083190918, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8663491010665894, + "num_tokens": 265727707.0, + "step": 6965 + }, + { + "epoch": 0.8861468006614934, + "ewc_loss": 0.0348052978515625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7402648154529743e-05, + "grad_norm": 25.394304275512695, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8574215769767761, + "num_tokens": 265765860.0, + "step": 6966 + }, + { + "epoch": 0.886274010940084, + "ewc_loss": 0.034704968333244324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7352484064758755e-05, + "grad_norm": 25.405366897583008, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8626946210861206, + "num_tokens": 265802988.0, + "step": 6967 + }, + { + "epoch": 0.8864012212186745, + "ewc_loss": 0.034793782979249954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.739689105306752e-05, + "grad_norm": 25.340579986572266, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8547775149345398, + "num_tokens": 265840614.0, + "step": 6968 + }, + { + "epoch": 
0.886528431497265, + "ewc_loss": 0.03479470685124397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.739735307637602e-05, + "grad_norm": 25.38578987121582, + "learning_rate": 1e-06, + "loss": 0.5274, + "mean_token_accuracy": 0.8342491388320923, + "num_tokens": 265883032.0, + "step": 6969 + }, + { + "epoch": 0.8866556417758555, + "ewc_loss": 0.034853916615247726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7426958947908133e-05, + "grad_norm": 25.4471435546875, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8719403743743896, + "num_tokens": 265918309.0, + "step": 6970 + }, + { + "epoch": 0.886782852054446, + "ewc_loss": 0.03484984114766121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.742491986078676e-05, + "grad_norm": 25.424095153808594, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8626549243927002, + "num_tokens": 265956406.0, + "step": 6971 + }, + { + "epoch": 0.8869100623330365, + "ewc_loss": 0.03479327633976936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7396638213540427e-05, + "grad_norm": 25.473236083984375, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8688035011291504, + "num_tokens": 265988413.0, + "step": 6972 + }, + { + "epoch": 0.887037272611627, + "ewc_loss": 0.034796539694070816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7398269847035408e-05, + "grad_norm": 25.39055633544922, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8594471216201782, + "num_tokens": 266022036.0, + "step": 6973 + }, + { + "epoch": 0.8871644828902175, + "ewc_loss": 0.03479955345392227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7399775970261544e-05, + "grad_norm": 25.48834228515625, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8580697774887085, + "num_tokens": 266053735.0, + "step": 6974 + }, + { + "epoch": 0.8872916931688081, + "ewc_loss": 0.03484411537647247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.742205859045498e-05, 
+ "grad_norm": 25.362140655517578, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8743543028831482, + "num_tokens": 266094198.0, + "step": 6975 + }, + { + "epoch": 0.8874189034473986, + "ewc_loss": 0.03479153290390968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.739576691761613e-05, + "grad_norm": 25.579330444335938, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8688184022903442, + "num_tokens": 266132213.0, + "step": 6976 + }, + { + "epoch": 0.887546113725989, + "ewc_loss": 0.034838758409023285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7419379219063558e-05, + "grad_norm": 25.431846618652344, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8598363399505615, + "num_tokens": 266168001.0, + "step": 6977 + }, + { + "epoch": 0.8876733240045795, + "ewc_loss": 0.034780941903591156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7390471839462407e-05, + "grad_norm": 25.525745391845703, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8311884999275208, + "num_tokens": 266208771.0, + "step": 6978 + }, + { + "epoch": 0.8878005342831701, + "ewc_loss": 0.03483697399497032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7418486095266417e-05, + "grad_norm": 25.407428741455078, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8651411533355713, + "num_tokens": 266248090.0, + "step": 6979 + }, + { + "epoch": 0.8879277445617606, + "ewc_loss": 0.03479265049099922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7396325347363017e-05, + "grad_norm": 25.467056274414062, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8495684862136841, + "num_tokens": 266283748.0, + "step": 6980 + }, + { + "epoch": 0.8880549548403511, + "ewc_loss": 0.03481684625148773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7408423445886e-05, + "grad_norm": 25.385902404785156, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 
0.8674430847167969, + "num_tokens": 266316613.0, + "step": 6981 + }, + { + "epoch": 0.8881821651189417, + "ewc_loss": 0.03484566509723663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.742283347994089e-05, + "grad_norm": 25.485000610351562, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.862578272819519, + "num_tokens": 266357515.0, + "step": 6982 + }, + { + "epoch": 0.8883093753975321, + "ewc_loss": 0.034836120903491974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7418060451745987e-05, + "grad_norm": 25.467483520507812, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8723946809768677, + "num_tokens": 266393593.0, + "step": 6983 + }, + { + "epoch": 0.8884365856761226, + "ewc_loss": 0.03489864990115166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7449325241614133e-05, + "grad_norm": 25.55268669128418, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8713833093643188, + "num_tokens": 266436104.0, + "step": 6984 + }, + { + "epoch": 0.8885637959547131, + "ewc_loss": 0.03483382984995842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7416914488421753e-05, + "grad_norm": 25.40965461730957, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8729357719421387, + "num_tokens": 266472809.0, + "step": 6985 + }, + { + "epoch": 0.8886910062333037, + "ewc_loss": 0.034796204417943954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7398102500010282e-05, + "grad_norm": 25.375286102294922, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8708866834640503, + "num_tokens": 266512726.0, + "step": 6986 + }, + { + "epoch": 0.8888182165118942, + "ewc_loss": 0.03489820659160614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.74491033249069e-05, + "grad_norm": 25.42886734008789, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.849234938621521, + "num_tokens": 266546098.0, + "step": 6987 + }, + { + "epoch": 0.8889454267904847, + "ewc_loss": 
0.03485463187098503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7427315469831228e-05, + "grad_norm": 25.331865310668945, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8511626720428467, + "num_tokens": 266588478.0, + "step": 6988 + }, + { + "epoch": 0.8890726370690751, + "ewc_loss": 0.034952159970998764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.747608075675089e-05, + "grad_norm": 25.433996200561523, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8494420051574707, + "num_tokens": 266628975.0, + "step": 6989 + }, + { + "epoch": 0.8891998473476657, + "ewc_loss": 0.034947119653224945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7473559637437575e-05, + "grad_norm": 25.481765747070312, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8690584301948547, + "num_tokens": 266667533.0, + "step": 6990 + }, + { + "epoch": 0.8893270576262562, + "ewc_loss": 0.034870993345975876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7435497284168378e-05, + "grad_norm": 25.352428436279297, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8437262773513794, + "num_tokens": 266706990.0, + "step": 6991 + }, + { + "epoch": 0.8894542679048467, + "ewc_loss": 0.034864626824855804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7432314052712172e-05, + "grad_norm": 25.49288558959961, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8488253355026245, + "num_tokens": 266747314.0, + "step": 6992 + }, + { + "epoch": 0.8895814781834372, + "ewc_loss": 0.034950487315654755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7475244021625258e-05, + "grad_norm": 25.38133430480957, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8517577648162842, + "num_tokens": 266791031.0, + "step": 6993 + }, + { + "epoch": 0.8897086884620278, + "ewc_loss": 0.03484022617340088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7420112271793187e-05, + "grad_norm": 
25.582340240478516, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8589515089988708, + "num_tokens": 266823236.0, + "step": 6994 + }, + { + "epoch": 0.8898358987406182, + "ewc_loss": 0.034950848668813705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.747542410157621e-05, + "grad_norm": 25.46324920654297, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8576348423957825, + "num_tokens": 266867993.0, + "step": 6995 + }, + { + "epoch": 0.8899631090192087, + "ewc_loss": 0.034806638956069946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.740331936161965e-05, + "grad_norm": 25.269468307495117, + "learning_rate": 1e-06, + "loss": 0.5034, + "mean_token_accuracy": 0.8424566984176636, + "num_tokens": 266908573.0, + "step": 6996 + }, + { + "epoch": 0.8900903192977992, + "ewc_loss": 0.034917186945676804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.74585929926252e-05, + "grad_norm": 25.460620880126953, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.858846127986908, + "num_tokens": 266951167.0, + "step": 6997 + }, + { + "epoch": 0.8902175295763898, + "ewc_loss": 0.03489578887820244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7447893696953543e-05, + "grad_norm": 25.368711471557617, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8587666153907776, + "num_tokens": 266988976.0, + "step": 6998 + }, + { + "epoch": 0.8903447398549803, + "ewc_loss": 0.03488551825284958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7442758689867333e-05, + "grad_norm": 25.40229606628418, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8591073751449585, + "num_tokens": 267022450.0, + "step": 6999 + }, + { + "epoch": 0.8904719501335708, + "ewc_loss": 0.034917283803224564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7458642105339095e-05, + "grad_norm": 25.488224029541016, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8540128469467163, + 
"num_tokens": 267059933.0, + "step": 7000 + }, + { + "epoch": 0.8905991604121613, + "ewc_loss": 0.03491830453276634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7459151422372088e-05, + "grad_norm": 25.348690032958984, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8514794111251831, + "num_tokens": 267092518.0, + "step": 7001 + }, + { + "epoch": 0.8907263706907518, + "ewc_loss": 0.034852515906095505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7426258636987768e-05, + "grad_norm": 25.333709716796875, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8546146750450134, + "num_tokens": 267130008.0, + "step": 7002 + }, + { + "epoch": 0.8908535809693423, + "ewc_loss": 0.034938957542181015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.746947964420542e-05, + "grad_norm": 25.497539520263672, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8626817464828491, + "num_tokens": 267171039.0, + "step": 7003 + }, + { + "epoch": 0.8909807912479328, + "ewc_loss": 0.034984927624464035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7492464394308627e-05, + "grad_norm": 25.38620948791504, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8625538349151611, + "num_tokens": 267211197.0, + "step": 7004 + }, + { + "epoch": 0.8911080015265234, + "ewc_loss": 0.03491636738181114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.74581837200094e-05, + "grad_norm": 25.463729858398438, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8659681677818298, + "num_tokens": 267247471.0, + "step": 7005 + }, + { + "epoch": 0.8912352118051139, + "ewc_loss": 0.03494380787014961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7471904357080348e-05, + "grad_norm": 25.50965118408203, + "learning_rate": 1e-06, + "loss": 0.5435, + "mean_token_accuracy": 0.8325440883636475, + "num_tokens": 267287276.0, + "step": 7006 + }, + { + "epoch": 0.8913624220837043, + "ewc_loss": 0.03492647036910057, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7463235053583048e-05, + "grad_norm": 25.395416259765625, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8574610352516174, + "num_tokens": 267323827.0, + "step": 7007 + }, + { + "epoch": 0.8914896323622948, + "ewc_loss": 0.03493037074804306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.746518501022365e-05, + "grad_norm": 25.62009048461914, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8684625625610352, + "num_tokens": 267364053.0, + "step": 7008 + }, + { + "epoch": 0.8916168426408854, + "ewc_loss": 0.034965094178915024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7482547264080495e-05, + "grad_norm": 25.289213180541992, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8419625759124756, + "num_tokens": 267404393.0, + "step": 7009 + }, + { + "epoch": 0.8917440529194759, + "ewc_loss": 0.034866925328969955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.743346183502581e-05, + "grad_norm": 25.54852294921875, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.860603392124176, + "num_tokens": 267444589.0, + "step": 7010 + }, + { + "epoch": 0.8918712631980664, + "ewc_loss": 0.03497720882296562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7488604498794302e-05, + "grad_norm": 25.404775619506836, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8605238199234009, + "num_tokens": 267484460.0, + "step": 7011 + }, + { + "epoch": 0.891998473476657, + "ewc_loss": 0.034827329218387604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7413663954357617e-05, + "grad_norm": 25.48714828491211, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8524602651596069, + "num_tokens": 267517390.0, + "step": 7012 + }, + { + "epoch": 0.8921256837552475, + "ewc_loss": 0.0349702462553978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7485122953075916e-05, + "grad_norm": 25.437484741210938, + 
"learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8597934246063232, + "num_tokens": 267555813.0, + "step": 7013 + }, + { + "epoch": 0.8922528940338379, + "ewc_loss": 0.034914348274469376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7457174180890433e-05, + "grad_norm": 25.396604537963867, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8568068742752075, + "num_tokens": 267595818.0, + "step": 7014 + }, + { + "epoch": 0.8923801043124284, + "ewc_loss": 0.034936029464006424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7468015357735567e-05, + "grad_norm": 25.43863296508789, + "learning_rate": 1e-06, + "loss": 0.5319, + "mean_token_accuracy": 0.833459734916687, + "num_tokens": 267633106.0, + "step": 7015 + }, + { + "epoch": 0.892507314591019, + "ewc_loss": 0.03490818291902542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7454090993851423e-05, + "grad_norm": 25.339691162109375, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8620898723602295, + "num_tokens": 267670166.0, + "step": 7016 + }, + { + "epoch": 0.8926345248696095, + "ewc_loss": 0.034934211522340775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7467105863033794e-05, + "grad_norm": 25.434228897094727, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8572077751159668, + "num_tokens": 267707284.0, + "step": 7017 + }, + { + "epoch": 0.8927617351482, + "ewc_loss": 0.03491799905896187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.745899862726219e-05, + "grad_norm": 25.323110580444336, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.860710620880127, + "num_tokens": 267742349.0, + "step": 7018 + }, + { + "epoch": 0.8928889454267905, + "ewc_loss": 0.03496376425027847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7481881513958797e-05, + "grad_norm": 25.580764770507812, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8644572496414185, + "num_tokens": 267776003.0, + 
"step": 7019 + }, + { + "epoch": 0.893016155705381, + "ewc_loss": 0.035048142075538635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.752407115418464e-05, + "grad_norm": 25.36863899230957, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8545316457748413, + "num_tokens": 267815887.0, + "step": 7020 + }, + { + "epoch": 0.8931433659839715, + "ewc_loss": 0.03490437567234039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7452188330935314e-05, + "grad_norm": 25.50237464904785, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8704235553741455, + "num_tokens": 267855874.0, + "step": 7021 + }, + { + "epoch": 0.893270576262562, + "ewc_loss": 0.03506055474281311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7530277546029538e-05, + "grad_norm": 25.40266227722168, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.869411289691925, + "num_tokens": 267897901.0, + "step": 7022 + }, + { + "epoch": 0.8933977865411525, + "ewc_loss": 0.03493957221508026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7469785234425217e-05, + "grad_norm": 25.454559326171875, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8703781962394714, + "num_tokens": 267938967.0, + "step": 7023 + }, + { + "epoch": 0.8935249968197431, + "ewc_loss": 0.035005584359169006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.750279261614196e-05, + "grad_norm": 25.402833938598633, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8560988306999207, + "num_tokens": 267975196.0, + "step": 7024 + }, + { + "epoch": 0.8936522070983336, + "ewc_loss": 0.03498354181647301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7491771359345876e-05, + "grad_norm": 25.466949462890625, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8731514811515808, + "num_tokens": 268014407.0, + "step": 7025 + }, + { + "epoch": 0.893779417376924, + "ewc_loss": 0.03500838577747345, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.750419323798269e-05, + "grad_norm": 25.445188522338867, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.874472439289093, + "num_tokens": 268043892.0, + "step": 7026 + }, + { + "epoch": 0.8939066276555145, + "ewc_loss": 0.03496161475777626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.74808064912213e-05, + "grad_norm": 25.33150291442871, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8588688373565674, + "num_tokens": 268079651.0, + "step": 7027 + }, + { + "epoch": 0.8940338379341051, + "ewc_loss": 0.035035390406847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.751769559632521e-05, + "grad_norm": 25.524368286132812, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8627454042434692, + "num_tokens": 268114021.0, + "step": 7028 + }, + { + "epoch": 0.8941610482126956, + "ewc_loss": 0.035065192729234695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.753259675751906e-05, + "grad_norm": 25.397544860839844, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8585183024406433, + "num_tokens": 268155315.0, + "step": 7029 + }, + { + "epoch": 0.8942882584912861, + "ewc_loss": 0.03502248600125313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7511243640910834e-05, + "grad_norm": 25.424440383911133, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8562402725219727, + "num_tokens": 268199312.0, + "step": 7030 + }, + { + "epoch": 0.8944154687698767, + "ewc_loss": 0.03499811142683029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7499056411907077e-05, + "grad_norm": 25.33191680908203, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.866639256477356, + "num_tokens": 268236955.0, + "step": 7031 + }, + { + "epoch": 0.8945426790484671, + "ewc_loss": 0.034970834851264954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.748541762935929e-05, + "grad_norm": 25.386348724365234, + "learning_rate": 1e-06, + "loss": 0.4473, + 
"mean_token_accuracy": 0.8567572832107544, + "num_tokens": 268272183.0, + "step": 7032 + }, + { + "epoch": 0.8946698893270576, + "ewc_loss": 0.03504597023129463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7522985217510723e-05, + "grad_norm": 25.53394317626953, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8502734899520874, + "num_tokens": 268305407.0, + "step": 7033 + }, + { + "epoch": 0.8947970996056481, + "ewc_loss": 0.03501828387379646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.750914270814974e-05, + "grad_norm": 25.433822631835938, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8702707290649414, + "num_tokens": 268340443.0, + "step": 7034 + }, + { + "epoch": 0.8949243098842387, + "ewc_loss": 0.03493572026491165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7467860743636265e-05, + "grad_norm": 25.37285614013672, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8777885437011719, + "num_tokens": 268375597.0, + "step": 7035 + }, + { + "epoch": 0.8950515201628292, + "ewc_loss": 0.03501256927847862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7506285075796768e-05, + "grad_norm": 25.487329483032227, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.854045569896698, + "num_tokens": 268412872.0, + "step": 7036 + }, + { + "epoch": 0.8951787304414197, + "ewc_loss": 0.03504178300499916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.752089156070724e-05, + "grad_norm": 25.368547439575195, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8605829477310181, + "num_tokens": 268452021.0, + "step": 7037 + }, + { + "epoch": 0.8953059407200101, + "ewc_loss": 0.03504503145813942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7522515918244608e-05, + "grad_norm": 25.418350219726562, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8679773807525635, + "num_tokens": 268486354.0, + "step": 7038 + }, + { + "epoch": 
0.8954331509986007, + "ewc_loss": 0.0350319966673851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7515998479211703e-05, + "grad_norm": 25.306949615478516, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8531492948532104, + "num_tokens": 268519918.0, + "step": 7039 + }, + { + "epoch": 0.8955603612771912, + "ewc_loss": 0.03500070422887802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.75003515323624e-05, + "grad_norm": 25.409809112548828, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8502262830734253, + "num_tokens": 268554259.0, + "step": 7040 + }, + { + "epoch": 0.8956875715557817, + "ewc_loss": 0.03515828773379326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7579144696355797e-05, + "grad_norm": 25.37432289123535, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8542693853378296, + "num_tokens": 268595785.0, + "step": 7041 + }, + { + "epoch": 0.8958147818343722, + "ewc_loss": 0.035071734338998795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7535867300466634e-05, + "grad_norm": 25.335317611694336, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8447710275650024, + "num_tokens": 268640805.0, + "step": 7042 + }, + { + "epoch": 0.8959419921129628, + "ewc_loss": 0.03509114682674408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7545573427923955e-05, + "grad_norm": 25.34637451171875, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8656431436538696, + "num_tokens": 268682446.0, + "step": 7043 + }, + { + "epoch": 0.8960692023915532, + "ewc_loss": 0.03512488678097725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.756244273565244e-05, + "grad_norm": 25.38552474975586, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.868108868598938, + "num_tokens": 268720262.0, + "step": 7044 + }, + { + "epoch": 0.8961964126701437, + "ewc_loss": 0.03507626801729202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7538133761263452e-05, + "grad_norm": 25.39276885986328, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8474460244178772, + "num_tokens": 268760519.0, + "step": 7045 + }, + { + "epoch": 0.8963236229487342, + "ewc_loss": 0.03510821983218193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7554109945194796e-05, + "grad_norm": 25.31014060974121, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.852209210395813, + "num_tokens": 268796621.0, + "step": 7046 + }, + { + "epoch": 0.8964508332273248, + "ewc_loss": 0.03509918227791786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7549591575516388e-05, + "grad_norm": 25.547765731811523, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.867885172367096, + "num_tokens": 268830527.0, + "step": 7047 + }, + { + "epoch": 0.8965780435059153, + "ewc_loss": 0.035134684294462204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7567341274116188e-05, + "grad_norm": 25.241514205932617, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8644152879714966, + "num_tokens": 268862112.0, + "step": 7048 + }, + { + "epoch": 0.8967052537845058, + "ewc_loss": 0.03507104516029358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.753552169247996e-05, + "grad_norm": 25.56170082092285, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8583357930183411, + "num_tokens": 268901089.0, + "step": 7049 + }, + { + "epoch": 0.8968324640630962, + "ewc_loss": 0.03518400341272354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.759200131346006e-05, + "grad_norm": 25.333465576171875, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8620550632476807, + "num_tokens": 268942766.0, + "step": 7050 + }, + { + "epoch": 0.8969596743416868, + "ewc_loss": 0.0350198969244957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.750994852045551e-05, + "grad_norm": 25.34532356262207, + "learning_rate": 1e-06, + "loss": 0.424, + 
"mean_token_accuracy": 0.8660399913787842, + "num_tokens": 268978642.0, + "step": 7051 + }, + { + "epoch": 0.8970868846202773, + "ewc_loss": 0.03514973446726799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7574866433278657e-05, + "grad_norm": 25.44672393798828, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8654674887657166, + "num_tokens": 269016048.0, + "step": 7052 + }, + { + "epoch": 0.8972140948988678, + "ewc_loss": 0.035114891827106476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.75574459717609e-05, + "grad_norm": 25.400426864624023, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8609662055969238, + "num_tokens": 269054381.0, + "step": 7053 + }, + { + "epoch": 0.8973413051774584, + "ewc_loss": 0.03504994139075279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7524971553939395e-05, + "grad_norm": 25.372953414916992, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8639423251152039, + "num_tokens": 269090672.0, + "step": 7054 + }, + { + "epoch": 0.8974685154560489, + "ewc_loss": 0.03510301932692528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7551508790347725e-05, + "grad_norm": 25.34254264831543, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8631761074066162, + "num_tokens": 269130231.0, + "step": 7055 + }, + { + "epoch": 0.8975957257346393, + "ewc_loss": 0.035104453563690186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7552227291162126e-05, + "grad_norm": 25.39592933654785, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8635830283164978, + "num_tokens": 269166282.0, + "step": 7056 + }, + { + "epoch": 0.8977229360132298, + "ewc_loss": 0.03516648709774017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7583242879481986e-05, + "grad_norm": 25.430522918701172, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8637211918830872, + "num_tokens": 269205326.0, + "step": 7057 + }, + { + "epoch": 
0.8978501462918204, + "ewc_loss": 0.03508118540048599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7540593034937046e-05, + "grad_norm": 25.486385345458984, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8622927665710449, + "num_tokens": 269243903.0, + "step": 7058 + }, + { + "epoch": 0.8979773565704109, + "ewc_loss": 0.035164229571819305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7582115106051788e-05, + "grad_norm": 25.43951416015625, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8535134792327881, + "num_tokens": 269288330.0, + "step": 7059 + }, + { + "epoch": 0.8981045668490014, + "ewc_loss": 0.035096459090709686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7548229152453132e-05, + "grad_norm": 25.39143180847168, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8500397205352783, + "num_tokens": 269328731.0, + "step": 7060 + }, + { + "epoch": 0.898231777127592, + "ewc_loss": 0.035111550241708755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.755577432049904e-05, + "grad_norm": 25.382768630981445, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8616081476211548, + "num_tokens": 269369576.0, + "step": 7061 + }, + { + "epoch": 0.8983589874061825, + "ewc_loss": 0.035065848380327225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7532924175611697e-05, + "grad_norm": 25.44195556640625, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8597339391708374, + "num_tokens": 269407734.0, + "step": 7062 + }, + { + "epoch": 0.8984861976847729, + "ewc_loss": 0.03510542958974838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7552714780322276e-05, + "grad_norm": 25.430627822875977, + "learning_rate": 1e-06, + "loss": 0.5143, + "mean_token_accuracy": 0.8386774659156799, + "num_tokens": 269444042.0, + "step": 7063 + }, + { + "epoch": 0.8986134079633634, + "ewc_loss": 0.03513704240322113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.756852179823909e-05, + "grad_norm": 25.482053756713867, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.882152259349823, + "num_tokens": 269480979.0, + "step": 7064 + }, + { + "epoch": 0.898740618241954, + "ewc_loss": 0.0351015105843544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7550755728734657e-05, + "grad_norm": 25.447582244873047, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8606914281845093, + "num_tokens": 269515594.0, + "step": 7065 + }, + { + "epoch": 0.8988678285205445, + "ewc_loss": 0.0350913368165493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.754566801537294e-05, + "grad_norm": 25.519935607910156, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.863280177116394, + "num_tokens": 269556974.0, + "step": 7066 + }, + { + "epoch": 0.898995038799135, + "ewc_loss": 0.03511103242635727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7555516024003737e-05, + "grad_norm": 25.611129760742188, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8530235290527344, + "num_tokens": 269591088.0, + "step": 7067 + }, + { + "epoch": 0.8991222490777255, + "ewc_loss": 0.03506387025117874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7531934645376168e-05, + "grad_norm": 25.4576358795166, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8694591522216797, + "num_tokens": 269627921.0, + "step": 7068 + }, + { + "epoch": 0.899249459356316, + "ewc_loss": 0.03503960371017456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7519801986054517e-05, + "grad_norm": 25.485689163208008, + "learning_rate": 1e-06, + "loss": 0.4967, + "mean_token_accuracy": 0.8447883129119873, + "num_tokens": 269666586.0, + "step": 7069 + }, + { + "epoch": 0.8993766696349065, + "ewc_loss": 0.035048019140958786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.752400930854492e-05, + "grad_norm": 25.402278900146484, + "learning_rate": 1e-06, + "loss": 0.4321, + 
"mean_token_accuracy": 0.8634707927703857, + "num_tokens": 269701205.0, + "step": 7070 + }, + { + "epoch": 0.899503879913497, + "ewc_loss": 0.03504202887415886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7521015251986682e-05, + "grad_norm": 25.419958114624023, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8727410435676575, + "num_tokens": 269739982.0, + "step": 7071 + }, + { + "epoch": 0.8996310901920875, + "ewc_loss": 0.03508546203374863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7542730347486213e-05, + "grad_norm": 25.48616600036621, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8621247410774231, + "num_tokens": 269773048.0, + "step": 7072 + }, + { + "epoch": 0.8997583004706781, + "ewc_loss": 0.03511948883533478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.755974517436698e-05, + "grad_norm": 25.439075469970703, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8602108955383301, + "num_tokens": 269811343.0, + "step": 7073 + }, + { + "epoch": 0.8998855107492686, + "ewc_loss": 0.03509462997317314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.754731420078315e-05, + "grad_norm": 25.41510772705078, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8655582666397095, + "num_tokens": 269851601.0, + "step": 7074 + }, + { + "epoch": 0.900012721027859, + "ewc_loss": 0.03515930101275444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7579650375409983e-05, + "grad_norm": 25.51118278503418, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8692082166671753, + "num_tokens": 269891396.0, + "step": 7075 + }, + { + "epoch": 0.9001399313064495, + "ewc_loss": 0.03507060185074806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.753530159476213e-05, + "grad_norm": 25.52457618713379, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8601385354995728, + "num_tokens": 269930292.0, + "step": 7076 + }, + { + "epoch": 
0.9002671415850401, + "ewc_loss": 0.03506811335682869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7534057406010106e-05, + "grad_norm": 25.353023529052734, + "learning_rate": 1e-06, + "loss": 0.5181, + "mean_token_accuracy": 0.8342379331588745, + "num_tokens": 269968464.0, + "step": 7077 + }, + { + "epoch": 0.9003943518636306, + "ewc_loss": 0.035072360187768936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7536180166644044e-05, + "grad_norm": 25.529434204101562, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8480813503265381, + "num_tokens": 270011569.0, + "step": 7078 + }, + { + "epoch": 0.9005215621422211, + "ewc_loss": 0.03514288365840912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.757144127623178e-05, + "grad_norm": 25.39546775817871, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8563523292541504, + "num_tokens": 270050608.0, + "step": 7079 + }, + { + "epoch": 0.9006487724208116, + "ewc_loss": 0.03503860533237457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7519301763968542e-05, + "grad_norm": 25.42838478088379, + "learning_rate": 1e-06, + "loss": 0.4881, + "mean_token_accuracy": 0.8462992906570435, + "num_tokens": 270096100.0, + "step": 7080 + }, + { + "epoch": 0.9007759826994021, + "ewc_loss": 0.03518594801425934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7592974472790956e-05, + "grad_norm": 25.42560577392578, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8709761500358582, + "num_tokens": 270140915.0, + "step": 7081 + }, + { + "epoch": 0.9009031929779926, + "ewc_loss": 0.0350676029920578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7533800928504206e-05, + "grad_norm": 25.481502532958984, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8610220551490784, + "num_tokens": 270184233.0, + "step": 7082 + }, + { + "epoch": 0.9010304032565831, + "ewc_loss": 0.03514119237661362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7570595446159132e-05, + "grad_norm": 25.342859268188477, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8590579628944397, + "num_tokens": 270223889.0, + "step": 7083 + }, + { + "epoch": 0.9011576135351737, + "ewc_loss": 0.03506609797477722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7533049685880542e-05, + "grad_norm": 25.432954788208008, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8645662665367126, + "num_tokens": 270266357.0, + "step": 7084 + }, + { + "epoch": 0.9012848238137642, + "ewc_loss": 0.03514348343014717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7571741409483366e-05, + "grad_norm": 25.423173904418945, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.868600606918335, + "num_tokens": 270298252.0, + "step": 7085 + }, + { + "epoch": 0.9014120340923547, + "ewc_loss": 0.035153940320014954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.757697100401856e-05, + "grad_norm": 25.47635841369629, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8538507223129272, + "num_tokens": 270343187.0, + "step": 7086 + }, + { + "epoch": 0.9015392443709451, + "ewc_loss": 0.03509913757443428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7549567928654142e-05, + "grad_norm": 25.36018180847168, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8494179248809814, + "num_tokens": 270385094.0, + "step": 7087 + }, + { + "epoch": 0.9016664546495357, + "ewc_loss": 0.03511616587638855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.755808261805214e-05, + "grad_norm": 25.460384368896484, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8477413654327393, + "num_tokens": 270421772.0, + "step": 7088 + }, + { + "epoch": 0.9017936649281262, + "ewc_loss": 0.035110004246234894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7555001250002533e-05, + "grad_norm": 25.381397247314453, + "learning_rate": 1e-06, + "loss": 0.5123, + 
"mean_token_accuracy": 0.8428264260292053, + "num_tokens": 270465033.0, + "step": 7089 + }, + { + "epoch": 0.9019208752067167, + "ewc_loss": 0.0350748710334301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.753743526933249e-05, + "grad_norm": 25.39181900024414, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8829449415206909, + "num_tokens": 270501370.0, + "step": 7090 + }, + { + "epoch": 0.9020480854853072, + "ewc_loss": 0.03515135869383812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7575679521542042e-05, + "grad_norm": 25.387544631958008, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8641915321350098, + "num_tokens": 270543571.0, + "step": 7091 + }, + { + "epoch": 0.9021752957638978, + "ewc_loss": 0.035121142864227295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.756057099555619e-05, + "grad_norm": 25.45353126525879, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8699339628219604, + "num_tokens": 270580635.0, + "step": 7092 + }, + { + "epoch": 0.9023025060424882, + "ewc_loss": 0.03518923744559288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7594618839211762e-05, + "grad_norm": 25.484203338623047, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8732731938362122, + "num_tokens": 270616534.0, + "step": 7093 + }, + { + "epoch": 0.9024297163210787, + "ewc_loss": 0.03511776402592659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7558881154400297e-05, + "grad_norm": 25.524017333984375, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.841023862361908, + "num_tokens": 270653998.0, + "step": 7094 + }, + { + "epoch": 0.9025569265996692, + "ewc_loss": 0.03507663309574127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7538315660203807e-05, + "grad_norm": 25.358583450317383, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8601127862930298, + "num_tokens": 270688023.0, + "step": 7095 + }, + { + "epoch": 
0.9026841368782598, + "ewc_loss": 0.035113122314214706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7556561942910776e-05, + "grad_norm": 25.570629119873047, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8430157899856567, + "num_tokens": 270721298.0, + "step": 7096 + }, + { + "epoch": 0.9028113471568503, + "ewc_loss": 0.03516952693462372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.758476355462335e-05, + "grad_norm": 25.4879093170166, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8658273220062256, + "num_tokens": 270762182.0, + "step": 7097 + }, + { + "epoch": 0.9029385574354408, + "ewc_loss": 0.0350230410695076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7511520127300173e-05, + "grad_norm": 25.358436584472656, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8752021193504333, + "num_tokens": 270799577.0, + "step": 7098 + }, + { + "epoch": 0.9030657677140312, + "ewc_loss": 0.03512357547879219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7561787899467163e-05, + "grad_norm": 25.451190948486328, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8611851930618286, + "num_tokens": 270840218.0, + "step": 7099 + }, + { + "epoch": 0.9031929779926218, + "ewc_loss": 0.03513391688466072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7566959286341444e-05, + "grad_norm": 25.4996337890625, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8804532289505005, + "num_tokens": 270872982.0, + "step": 7100 + }, + { + "epoch": 0.9033201882712123, + "ewc_loss": 0.03511057794094086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7555288650328293e-05, + "grad_norm": 25.440059661865234, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8648264408111572, + "num_tokens": 270908381.0, + "step": 7101 + }, + { + "epoch": 0.9034473985498028, + "ewc_loss": 0.03513481467962265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7567406757734716e-05, + "grad_norm": 25.412696838378906, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8599967360496521, + "num_tokens": 270949217.0, + "step": 7102 + }, + { + "epoch": 0.9035746088283934, + "ewc_loss": 0.035137683153152466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7568841940374114e-05, + "grad_norm": 25.52265167236328, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.843822717666626, + "num_tokens": 270989667.0, + "step": 7103 + }, + { + "epoch": 0.9037018191069839, + "ewc_loss": 0.03507455810904503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7537278836243786e-05, + "grad_norm": 25.22666358947754, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8507567644119263, + "num_tokens": 271030395.0, + "step": 7104 + }, + { + "epoch": 0.9038290293855743, + "ewc_loss": 0.035141050815582275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7570526324561797e-05, + "grad_norm": 25.602527618408203, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.842072069644928, + "num_tokens": 271068239.0, + "step": 7105 + }, + { + "epoch": 0.9039562396641648, + "ewc_loss": 0.03521622344851494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7608112102607265e-05, + "grad_norm": 25.44921875, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8543687462806702, + "num_tokens": 271110294.0, + "step": 7106 + }, + { + "epoch": 0.9040834499427554, + "ewc_loss": 0.03504648059606552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.752323987602722e-05, + "grad_norm": 25.36672592163086, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8784787654876709, + "num_tokens": 271143288.0, + "step": 7107 + }, + { + "epoch": 0.9042106602213459, + "ewc_loss": 0.03524046391248703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.762023202900309e-05, + "grad_norm": 25.497549057006836, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 
0.867190957069397, + "num_tokens": 271181183.0, + "step": 7108 + }, + { + "epoch": 0.9043378704999364, + "ewc_loss": 0.03517579659819603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.758789767336566e-05, + "grad_norm": 25.458663940429688, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8595111966133118, + "num_tokens": 271216201.0, + "step": 7109 + }, + { + "epoch": 0.9044650807785269, + "ewc_loss": 0.03518381714820862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.759190854500048e-05, + "grad_norm": 25.4879150390625, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8672327399253845, + "num_tokens": 271250158.0, + "step": 7110 + }, + { + "epoch": 0.9045922910571175, + "ewc_loss": 0.035096511244773865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7548256437294185e-05, + "grad_norm": 25.382963180541992, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8730548620223999, + "num_tokens": 271289994.0, + "step": 7111 + }, + { + "epoch": 0.9047195013357079, + "ewc_loss": 0.0351153127849102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.755765697453171e-05, + "grad_norm": 25.446496963500977, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8657260537147522, + "num_tokens": 271325782.0, + "step": 7112 + }, + { + "epoch": 0.9048467116142984, + "ewc_loss": 0.035135433077812195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.756771598593332e-05, + "grad_norm": 25.507360458374023, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8485352993011475, + "num_tokens": 271361604.0, + "step": 7113 + }, + { + "epoch": 0.9049739218928889, + "ewc_loss": 0.03519013896584511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.759506994858384e-05, + "grad_norm": 25.468582153320312, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8695651292800903, + "num_tokens": 271398520.0, + "step": 7114 + }, + { + "epoch": 0.9051011321714795, + "ewc_loss": 
0.0351143516600132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.755717494233977e-05, + "grad_norm": 25.50229835510254, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8549560904502869, + "num_tokens": 271445138.0, + "step": 7115 + }, + { + "epoch": 0.90522834245007, + "ewc_loss": 0.03510512784123421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7552563804201782e-05, + "grad_norm": 25.468931198120117, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8658922910690308, + "num_tokens": 271479147.0, + "step": 7116 + }, + { + "epoch": 0.9053555527286605, + "ewc_loss": 0.035101234912872314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7550617485539988e-05, + "grad_norm": 25.389549255371094, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8627126216888428, + "num_tokens": 271512477.0, + "step": 7117 + }, + { + "epoch": 0.905482763007251, + "ewc_loss": 0.03512899950146675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.756450001266785e-05, + "grad_norm": 25.5129451751709, + "learning_rate": 1e-06, + "loss": 0.4928, + "mean_token_accuracy": 0.8472691774368286, + "num_tokens": 271552355.0, + "step": 7118 + }, + { + "epoch": 0.9056099732858415, + "ewc_loss": 0.03518998622894287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7594993551028892e-05, + "grad_norm": 25.4182071685791, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.85653156042099, + "num_tokens": 271595825.0, + "step": 7119 + }, + { + "epoch": 0.905737183564432, + "ewc_loss": 0.03510196506977081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.75509831024101e-05, + "grad_norm": 25.506492614746094, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8495782613754272, + "num_tokens": 271627384.0, + "step": 7120 + }, + { + "epoch": 0.9058643938430225, + "ewc_loss": 0.03521285206079483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7606425899430178e-05, + "grad_norm": 25.510385513305664, + 
"learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8703808784484863, + "num_tokens": 271666711.0, + "step": 7121 + }, + { + "epoch": 0.9059916041216131, + "ewc_loss": 0.03512968868017197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.756484380166512e-05, + "grad_norm": 25.317773818969727, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8673714995384216, + "num_tokens": 271703145.0, + "step": 7122 + }, + { + "epoch": 0.9061188144002036, + "ewc_loss": 0.03515740856528282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.757870450092014e-05, + "grad_norm": 25.45134162902832, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8497225046157837, + "num_tokens": 271743131.0, + "step": 7123 + }, + { + "epoch": 0.906246024678794, + "ewc_loss": 0.035267092287540436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7633547031437047e-05, + "grad_norm": 25.4633846282959, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8657599091529846, + "num_tokens": 271778245.0, + "step": 7124 + }, + { + "epoch": 0.9063732349573845, + "ewc_loss": 0.035155944526195526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.757797144819051e-05, + "grad_norm": 25.379425048828125, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8518429398536682, + "num_tokens": 271817279.0, + "step": 7125 + }, + { + "epoch": 0.9065004452359751, + "ewc_loss": 0.03519200161099434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7596001271158457e-05, + "grad_norm": 25.520835876464844, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8677014112472534, + "num_tokens": 271857136.0, + "step": 7126 + }, + { + "epoch": 0.9066276555145656, + "ewc_loss": 0.035210635513067245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.760531813488342e-05, + "grad_norm": 25.386024475097656, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8694757223129272, + "num_tokens": 271891651.0, + 
"step": 7127 + }, + { + "epoch": 0.9067548657931561, + "ewc_loss": 0.03520960733294487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7604803360882215e-05, + "grad_norm": 25.581249237060547, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8661870956420898, + "num_tokens": 271925805.0, + "step": 7128 + }, + { + "epoch": 0.9068820760717466, + "ewc_loss": 0.03518104553222656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7590522475074977e-05, + "grad_norm": 25.476001739501953, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8652231097221375, + "num_tokens": 271964954.0, + "step": 7129 + }, + { + "epoch": 0.9070092863503371, + "ewc_loss": 0.03514707833528519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.757353857101407e-05, + "grad_norm": 25.54178237915039, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8601388931274414, + "num_tokens": 272000331.0, + "step": 7130 + }, + { + "epoch": 0.9071364966289276, + "ewc_loss": 0.03518548980355263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.759274528012611e-05, + "grad_norm": 25.39149284362793, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8692425489425659, + "num_tokens": 272037853.0, + "step": 7131 + }, + { + "epoch": 0.9072637069075181, + "ewc_loss": 0.03517241030931473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7586206013220362e-05, + "grad_norm": 25.43242073059082, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8713160753250122, + "num_tokens": 272077663.0, + "step": 7132 + }, + { + "epoch": 0.9073909171861086, + "ewc_loss": 0.03528589382767677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7642947568674572e-05, + "grad_norm": 25.40645980834961, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8557167649269104, + "num_tokens": 272116942.0, + "step": 7133 + }, + { + "epoch": 0.9075181274646992, + "ewc_loss": 0.035227589309215546, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.7613794625503942e-05, + "grad_norm": 25.365400314331055, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8547444343566895, + "num_tokens": 272151319.0, + "step": 7134 + }, + { + "epoch": 0.9076453377432897, + "ewc_loss": 0.035284169018268585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.764208536769729e-05, + "grad_norm": 25.483169555664062, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8473037481307983, + "num_tokens": 272190817.0, + "step": 7135 + }, + { + "epoch": 0.9077725480218801, + "ewc_loss": 0.035294827073812485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7647413187660277e-05, + "grad_norm": 25.466398239135742, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8536346554756165, + "num_tokens": 272230461.0, + "step": 7136 + }, + { + "epoch": 0.9078997583004706, + "ewc_loss": 0.03522256389260292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7611282601137646e-05, + "grad_norm": 25.453580856323242, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8544298410415649, + "num_tokens": 272268510.0, + "step": 7137 + }, + { + "epoch": 0.9080269685790612, + "ewc_loss": 0.035279564559459686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7639782527112402e-05, + "grad_norm": 25.469724655151367, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8605929017066956, + "num_tokens": 272312430.0, + "step": 7138 + }, + { + "epoch": 0.9081541788576517, + "ewc_loss": 0.03523871675133705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.761935891408939e-05, + "grad_norm": 25.451519012451172, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8588955998420715, + "num_tokens": 272348437.0, + "step": 7139 + }, + { + "epoch": 0.9082813891362422, + "ewc_loss": 0.03523387759923935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7616939658182673e-05, + "grad_norm": 25.478246688842773, + "learning_rate": 1e-06, + "loss": 
0.4627, + "mean_token_accuracy": 0.8546315431594849, + "num_tokens": 272378902.0, + "step": 7140 + }, + { + "epoch": 0.9084085994148328, + "ewc_loss": 0.03528039529919624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.764019725669641e-05, + "grad_norm": 25.648876190185547, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8741584420204163, + "num_tokens": 272421257.0, + "step": 7141 + }, + { + "epoch": 0.9085358096934232, + "ewc_loss": 0.03521085903048515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7605429093237035e-05, + "grad_norm": 25.49741554260254, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8598300218582153, + "num_tokens": 272459800.0, + "step": 7142 + }, + { + "epoch": 0.9086630199720137, + "ewc_loss": 0.035181399434804916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.759069891704712e-05, + "grad_norm": 25.60438346862793, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8798055648803711, + "num_tokens": 272501245.0, + "step": 7143 + }, + { + "epoch": 0.9087902302506042, + "ewc_loss": 0.03519361838698387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.759680890245363e-05, + "grad_norm": 25.4812068939209, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8622934818267822, + "num_tokens": 272535873.0, + "step": 7144 + }, + { + "epoch": 0.9089174405291948, + "ewc_loss": 0.03513879328966141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7569396732142195e-05, + "grad_norm": 25.556013107299805, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8589003086090088, + "num_tokens": 272575213.0, + "step": 7145 + }, + { + "epoch": 0.9090446508077853, + "ewc_loss": 0.035212330520153046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.760616578394547e-05, + "grad_norm": 25.522676467895508, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8636565208435059, + "num_tokens": 272611960.0, + "step": 7146 + }, + { + "epoch": 
0.9091718610863758, + "ewc_loss": 0.03517134487628937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7585673049325123e-05, + "grad_norm": 25.476791381835938, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8607443571090698, + "num_tokens": 272650895.0, + "step": 7147 + }, + { + "epoch": 0.9092990713649662, + "ewc_loss": 0.035201545804739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7600772480363958e-05, + "grad_norm": 25.502323150634766, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8612925410270691, + "num_tokens": 272690617.0, + "step": 7148 + }, + { + "epoch": 0.9094262816435568, + "ewc_loss": 0.03517638146877289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.758819053065963e-05, + "grad_norm": 25.599475860595703, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8524602651596069, + "num_tokens": 272732526.0, + "step": 7149 + }, + { + "epoch": 0.9095534919221473, + "ewc_loss": 0.035165902227163315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.758295184117742e-05, + "grad_norm": 25.411779403686523, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8594663143157959, + "num_tokens": 272769882.0, + "step": 7150 + }, + { + "epoch": 0.9096807022007378, + "ewc_loss": 0.035155996680259705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7577998733031563e-05, + "grad_norm": 25.497215270996094, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8636317253112793, + "num_tokens": 272809614.0, + "step": 7151 + }, + { + "epoch": 0.9098079124793284, + "ewc_loss": 0.03522426635026932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7612133888178505e-05, + "grad_norm": 25.44373321533203, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.852914035320282, + "num_tokens": 272844819.0, + "step": 7152 + }, + { + "epoch": 0.9099351227579189, + "ewc_loss": 0.03512931242585182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7564656445756555e-05, + "grad_norm": 25.400575637817383, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8671627044677734, + "num_tokens": 272882798.0, + "step": 7153 + }, + { + "epoch": 0.9100623330365093, + "ewc_loss": 0.03520330786705017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7601654690224677e-05, + "grad_norm": 25.481914520263672, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8627976179122925, + "num_tokens": 272917472.0, + "step": 7154 + }, + { + "epoch": 0.9101895433150998, + "ewc_loss": 0.03524213284254074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.762106694513932e-05, + "grad_norm": 25.49275779724121, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8501617908477783, + "num_tokens": 272959947.0, + "step": 7155 + }, + { + "epoch": 0.9103167535936904, + "ewc_loss": 0.03516477346420288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7582386135472916e-05, + "grad_norm": 25.43897819519043, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.844357967376709, + "num_tokens": 272998194.0, + "step": 7156 + }, + { + "epoch": 0.9104439638722809, + "ewc_loss": 0.035174038261175156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7587019101483747e-05, + "grad_norm": 25.46785545349121, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8781944513320923, + "num_tokens": 273034688.0, + "step": 7157 + }, + { + "epoch": 0.9105711741508714, + "ewc_loss": 0.035228431224823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.761421481205616e-05, + "grad_norm": 25.491661071777344, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8720821142196655, + "num_tokens": 273071575.0, + "step": 7158 + }, + { + "epoch": 0.9106983844294619, + "ewc_loss": 0.03524911031126976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.762455576681532e-05, + "grad_norm": 25.440828323364258, + "learning_rate": 1e-06, + "loss": 0.5294, + 
"mean_token_accuracy": 0.835629940032959, + "num_tokens": 273106106.0, + "step": 7159 + }, + { + "epoch": 0.9108255947080524, + "ewc_loss": 0.03521398454904556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.760699160513468e-05, + "grad_norm": 25.402545928955078, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8561514616012573, + "num_tokens": 273142959.0, + "step": 7160 + }, + { + "epoch": 0.9109528049866429, + "ewc_loss": 0.035241104662418365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7620552171138115e-05, + "grad_norm": 25.393653869628906, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8464277386665344, + "num_tokens": 273182781.0, + "step": 7161 + }, + { + "epoch": 0.9110800152652334, + "ewc_loss": 0.035234563052654266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.761728162819054e-05, + "grad_norm": 25.432476043701172, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8667902946472168, + "num_tokens": 273225738.0, + "step": 7162 + }, + { + "epoch": 0.9112072255438239, + "ewc_loss": 0.03524196147918701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7620981452637352e-05, + "grad_norm": 25.484268188476562, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8683682680130005, + "num_tokens": 273269166.0, + "step": 7163 + }, + { + "epoch": 0.9113344358224145, + "ewc_loss": 0.03516574576497078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7582873624633066e-05, + "grad_norm": 25.276521682739258, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8552941083908081, + "num_tokens": 273309093.0, + "step": 7164 + }, + { + "epoch": 0.911461646101005, + "ewc_loss": 0.03524431958794594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.762216015777085e-05, + "grad_norm": 25.56212615966797, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8664090633392334, + "num_tokens": 273346857.0, + "step": 7165 + }, + { + "epoch": 
0.9115888563795955, + "ewc_loss": 0.03527373820543289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.763686850608792e-05, + "grad_norm": 25.306434631347656, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8562648296356201, + "num_tokens": 273385720.0, + "step": 7166 + }, + { + "epoch": 0.9117160666581859, + "ewc_loss": 0.03523925319314003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.761962630553171e-05, + "grad_norm": 25.593456268310547, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8611592054367065, + "num_tokens": 273431079.0, + "step": 7167 + }, + { + "epoch": 0.9118432769367765, + "ewc_loss": 0.03532404825091362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7662023310549557e-05, + "grad_norm": 25.47716522216797, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8619000315666199, + "num_tokens": 273477316.0, + "step": 7168 + }, + { + "epoch": 0.911970487215367, + "ewc_loss": 0.035192713141441345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7596355974092148e-05, + "grad_norm": 25.55137062072754, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8637120723724365, + "num_tokens": 273512831.0, + "step": 7169 + }, + { + "epoch": 0.9120976974939575, + "ewc_loss": 0.03530867025256157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.765433444234077e-05, + "grad_norm": 25.550519943237305, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8546501398086548, + "num_tokens": 273551927.0, + "step": 7170 + }, + { + "epoch": 0.912224907772548, + "ewc_loss": 0.03520636633038521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7603182641323656e-05, + "grad_norm": 25.480215072631836, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8632358312606812, + "num_tokens": 273583053.0, + "step": 7171 + }, + { + "epoch": 0.9123521180511386, + "ewc_loss": 0.03523775562644005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7618878700886853e-05, + "grad_norm": 25.458887100219727, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8511803150177002, + "num_tokens": 273617953.0, + "step": 7172 + }, + { + "epoch": 0.912479328329729, + "ewc_loss": 0.03529864922165871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7649324945523404e-05, + "grad_norm": 25.54511833190918, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.846644401550293, + "num_tokens": 273649959.0, + "step": 7173 + }, + { + "epoch": 0.9126065386083195, + "ewc_loss": 0.03523276373744011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7616381228435785e-05, + "grad_norm": 25.457632064819336, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8508601188659668, + "num_tokens": 273682666.0, + "step": 7174 + }, + { + "epoch": 0.9127337488869101, + "ewc_loss": 0.035305432975292206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7652717360761017e-05, + "grad_norm": 25.495389938354492, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8790372610092163, + "num_tokens": 273718641.0, + "step": 7175 + }, + { + "epoch": 0.9128609591655006, + "ewc_loss": 0.03526647016406059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7633234165259637e-05, + "grad_norm": 25.430091857910156, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8709003329277039, + "num_tokens": 273760502.0, + "step": 7176 + }, + { + "epoch": 0.9129881694440911, + "ewc_loss": 0.03528578206896782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7642891180003062e-05, + "grad_norm": 25.51531219482422, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8551525473594666, + "num_tokens": 273795068.0, + "step": 7177 + }, + { + "epoch": 0.9131153797226816, + "ewc_loss": 0.035286128520965576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.76430639839964e-05, + "grad_norm": 25.516889572143555, + "learning_rate": 1e-06, + "loss": 0.4993, + 
"mean_token_accuracy": 0.8431963920593262, + "num_tokens": 273825099.0, + "step": 7178 + }, + { + "epoch": 0.9132425900012721, + "ewc_loss": 0.03518034890294075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.75901750480989e-05, + "grad_norm": 25.379901885986328, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8667256832122803, + "num_tokens": 273862566.0, + "step": 7179 + }, + { + "epoch": 0.9133698002798626, + "ewc_loss": 0.035343363881111145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7671682144282386e-05, + "grad_norm": 25.65334701538086, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8621215224266052, + "num_tokens": 273896107.0, + "step": 7180 + }, + { + "epoch": 0.9134970105584531, + "ewc_loss": 0.03530636802315712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7653184841037728e-05, + "grad_norm": 25.458377838134766, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8662589192390442, + "num_tokens": 273934066.0, + "step": 7181 + }, + { + "epoch": 0.9136242208370436, + "ewc_loss": 0.03527968376874924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.763984255376272e-05, + "grad_norm": 25.549890518188477, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8600102663040161, + "num_tokens": 273971843.0, + "step": 7182 + }, + { + "epoch": 0.9137514311156342, + "ewc_loss": 0.035351384431123734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7675693015917204e-05, + "grad_norm": 25.57042694091797, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.854114830493927, + "num_tokens": 274001646.0, + "step": 7183 + }, + { + "epoch": 0.9138786413942247, + "ewc_loss": 0.03534439206123352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767219691828359e-05, + "grad_norm": 25.6688289642334, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.8457528352737427, + "num_tokens": 274042656.0, + "step": 7184 + }, + { + "epoch": 
0.9140058516728151, + "ewc_loss": 0.03530916944146156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7654585462878458e-05, + "grad_norm": 25.476505279541016, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8462149500846863, + "num_tokens": 274084609.0, + "step": 7185 + }, + { + "epoch": 0.9141330619514056, + "ewc_loss": 0.03526102378964424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.763051113812253e-05, + "grad_norm": 25.516286849975586, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8631410598754883, + "num_tokens": 274127503.0, + "step": 7186 + }, + { + "epoch": 0.9142602722299962, + "ewc_loss": 0.03535276651382446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7676382412901148e-05, + "grad_norm": 25.449670791625977, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8666439056396484, + "num_tokens": 274164126.0, + "step": 7187 + }, + { + "epoch": 0.9143874825085867, + "ewc_loss": 0.0352851040661335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7642552847974002e-05, + "grad_norm": 25.593955993652344, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8601714968681335, + "num_tokens": 274198104.0, + "step": 7188 + }, + { + "epoch": 0.9145146927871772, + "ewc_loss": 0.035326093435287476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7663047401583754e-05, + "grad_norm": 25.330177307128906, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8496670722961426, + "num_tokens": 274233487.0, + "step": 7189 + }, + { + "epoch": 0.9146419030657678, + "ewc_loss": 0.035319287329912186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.765964407240972e-05, + "grad_norm": 25.61932373046875, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8640890121459961, + "num_tokens": 274267317.0, + "step": 7190 + }, + { + "epoch": 0.9147691133443582, + "ewc_loss": 0.035475220531225204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7737609596224502e-05, + "grad_norm": 25.525541305541992, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8535723090171814, + "num_tokens": 274304169.0, + "step": 7191 + }, + { + "epoch": 0.9148963236229487, + "ewc_loss": 0.03532871603965759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.766435889294371e-05, + "grad_norm": 25.630388259887695, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8617614507675171, + "num_tokens": 274343265.0, + "step": 7192 + }, + { + "epoch": 0.9150235339015392, + "ewc_loss": 0.03540489077568054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7702444893075153e-05, + "grad_norm": 25.558378219604492, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8448894619941711, + "num_tokens": 274380756.0, + "step": 7193 + }, + { + "epoch": 0.9151507441801298, + "ewc_loss": 0.03528086841106415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.764043372531887e-05, + "grad_norm": 25.54109764099121, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8702020645141602, + "num_tokens": 274422035.0, + "step": 7194 + }, + { + "epoch": 0.9152779544587203, + "ewc_loss": 0.035386279225349426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7693138943286613e-05, + "grad_norm": 25.646100997924805, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8504445552825928, + "num_tokens": 274460875.0, + "step": 7195 + }, + { + "epoch": 0.9154051647373108, + "ewc_loss": 0.03528899699449539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7644499166635796e-05, + "grad_norm": 25.501724243164062, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8637832403182983, + "num_tokens": 274495807.0, + "step": 7196 + }, + { + "epoch": 0.9155323750159012, + "ewc_loss": 0.03531358018517494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7656790078035556e-05, + "grad_norm": 25.582775115966797, + "learning_rate": 1e-06, + "loss": 0.4093, + 
"mean_token_accuracy": 0.8677917718887329, + "num_tokens": 274533098.0, + "step": 7197 + }, + { + "epoch": 0.9156595852944918, + "ewc_loss": 0.03530583903193474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.765291926858481e-05, + "grad_norm": 25.554378509521484, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8557370901107788, + "num_tokens": 274565282.0, + "step": 7198 + }, + { + "epoch": 0.9157867955730823, + "ewc_loss": 0.03530073165893555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7650365407462232e-05, + "grad_norm": 25.64769744873047, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.863850474357605, + "num_tokens": 274605401.0, + "step": 7199 + }, + { + "epoch": 0.9159140058516728, + "ewc_loss": 0.03529854491353035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.76492721948307e-05, + "grad_norm": 25.419172286987305, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8542747497558594, + "num_tokens": 274645018.0, + "step": 7200 + }, + { + "epoch": 0.9160412161302633, + "ewc_loss": 0.03528274595737457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.76413723238511e-05, + "grad_norm": 25.596864700317383, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.869541585445404, + "num_tokens": 274681541.0, + "step": 7201 + }, + { + "epoch": 0.9161684264088539, + "ewc_loss": 0.035382844507694244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7691421817289665e-05, + "grad_norm": 25.572967529296875, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8634841442108154, + "num_tokens": 274716883.0, + "step": 7202 + }, + { + "epoch": 0.9162956366874443, + "ewc_loss": 0.035327985882759094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7663993276073597e-05, + "grad_norm": 25.44180679321289, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.854354977607727, + "num_tokens": 274753038.0, + "step": 7203 + }, + { + "epoch": 
0.9164228469660348, + "ewc_loss": 0.035325195640325546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7662598111201078e-05, + "grad_norm": 25.41379165649414, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8660190105438232, + "num_tokens": 274790760.0, + "step": 7204 + }, + { + "epoch": 0.9165500572446253, + "ewc_loss": 0.035397253930568695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769862683431711e-05, + "grad_norm": 25.5821475982666, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8529826402664185, + "num_tokens": 274830872.0, + "step": 7205 + }, + { + "epoch": 0.9166772675232159, + "ewc_loss": 0.035355761647224426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767788126016967e-05, + "grad_norm": 25.390756607055664, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8470418453216553, + "num_tokens": 274869832.0, + "step": 7206 + }, + { + "epoch": 0.9168044778018064, + "ewc_loss": 0.03541058674454689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7705293430481106e-05, + "grad_norm": 25.648527145385742, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8518860340118408, + "num_tokens": 274903853.0, + "step": 7207 + }, + { + "epoch": 0.9169316880803969, + "ewc_loss": 0.03550727292895317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7753636711859144e-05, + "grad_norm": 25.586368560791016, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8593220710754395, + "num_tokens": 274942911.0, + "step": 7208 + }, + { + "epoch": 0.9170588983589874, + "ewc_loss": 0.035386402159929276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7693200788926333e-05, + "grad_norm": 25.543561935424805, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8673654794692993, + "num_tokens": 274986427.0, + "step": 7209 + }, + { + "epoch": 0.9171861086375779, + "ewc_loss": 0.03539137542247772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.769568734744098e-05, + "grad_norm": 25.523887634277344, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.862600564956665, + "num_tokens": 275021557.0, + "step": 7210 + }, + { + "epoch": 0.9173133189161684, + "ewc_loss": 0.03536692261695862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.768346191965975e-05, + "grad_norm": 25.468101501464844, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8488375544548035, + "num_tokens": 275061308.0, + "step": 7211 + }, + { + "epoch": 0.9174405291947589, + "ewc_loss": 0.03537566214799881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7687831132207066e-05, + "grad_norm": 25.501230239868164, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8538710474967957, + "num_tokens": 275101497.0, + "step": 7212 + }, + { + "epoch": 0.9175677394733495, + "ewc_loss": 0.03542774170637131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7713871784508228e-05, + "grad_norm": 25.625465393066406, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8476513624191284, + "num_tokens": 275138317.0, + "step": 7213 + }, + { + "epoch": 0.91769494975194, + "ewc_loss": 0.03539310023188591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769654954841826e-05, + "grad_norm": 25.489797592163086, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8598004579544067, + "num_tokens": 275180119.0, + "step": 7214 + }, + { + "epoch": 0.9178221600305305, + "ewc_loss": 0.035385213792324066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7692607798380777e-05, + "grad_norm": 25.586944580078125, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8585947751998901, + "num_tokens": 275212083.0, + "step": 7215 + }, + { + "epoch": 0.9179493703091209, + "ewc_loss": 0.03543704375624657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.771852112142369e-05, + "grad_norm": 25.54478645324707, + "learning_rate": 1e-06, + "loss": 0.4782, + 
"mean_token_accuracy": 0.849616527557373, + "num_tokens": 275247872.0, + "step": 7216 + }, + { + "epoch": 0.9180765805877115, + "ewc_loss": 0.03533957898616791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7669788576313294e-05, + "grad_norm": 25.427902221679688, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.864632785320282, + "num_tokens": 275287022.0, + "step": 7217 + }, + { + "epoch": 0.918203790866302, + "ewc_loss": 0.03537540137767792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.768770016497001e-05, + "grad_norm": 25.581470489501953, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8628615736961365, + "num_tokens": 275322259.0, + "step": 7218 + }, + { + "epoch": 0.9183310011448925, + "ewc_loss": 0.03542060777544975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7710304746287875e-05, + "grad_norm": 25.391233444213867, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8742319345474243, + "num_tokens": 275357410.0, + "step": 7219 + }, + { + "epoch": 0.918458211423483, + "ewc_loss": 0.03535759449005127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7678798030829057e-05, + "grad_norm": 25.522855758666992, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8708741664886475, + "num_tokens": 275388720.0, + "step": 7220 + }, + { + "epoch": 0.9185854217020736, + "ewc_loss": 0.03544542193412781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7722710254020058e-05, + "grad_norm": 25.459487915039062, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8665865659713745, + "num_tokens": 275427699.0, + "step": 7221 + }, + { + "epoch": 0.918712631980664, + "ewc_loss": 0.03543413430452347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.771706774889026e-05, + "grad_norm": 25.544832229614258, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8627307415008545, + "num_tokens": 275472220.0, + "step": 7222 + }, + { + "epoch": 
0.9188398422592545, + "ewc_loss": 0.03543142229318619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7715710782795213e-05, + "grad_norm": 25.547115325927734, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.848029375076294, + "num_tokens": 275506693.0, + "step": 7223 + }, + { + "epoch": 0.918967052537845, + "ewc_loss": 0.03540092706680298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7700464013614692e-05, + "grad_norm": 25.445138931274414, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8635632395744324, + "num_tokens": 275547295.0, + "step": 7224 + }, + { + "epoch": 0.9190942628164356, + "ewc_loss": 0.0354241207242012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7712060071062297e-05, + "grad_norm": 25.523147583007812, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8570128083229065, + "num_tokens": 275591552.0, + "step": 7225 + }, + { + "epoch": 0.9192214730950261, + "ewc_loss": 0.03543612360954285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7718060917104594e-05, + "grad_norm": 25.561763763427734, + "learning_rate": 1e-06, + "loss": 0.5316, + "mean_token_accuracy": 0.8353127241134644, + "num_tokens": 275632197.0, + "step": 7226 + }, + { + "epoch": 0.9193486833736166, + "ewc_loss": 0.03533383458852768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7666916392045096e-05, + "grad_norm": 25.460777282714844, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8531444072723389, + "num_tokens": 275670680.0, + "step": 7227 + }, + { + "epoch": 0.919475893652207, + "ewc_loss": 0.035391975194215775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7695987480692565e-05, + "grad_norm": 25.478652954101562, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8583624362945557, + "num_tokens": 275709315.0, + "step": 7228 + }, + { + "epoch": 0.9196031039307976, + "ewc_loss": 0.03539803996682167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7699019736028276e-05, + "grad_norm": 25.475460052490234, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8594374656677246, + "num_tokens": 275740405.0, + "step": 7229 + }, + { + "epoch": 0.9197303142093881, + "ewc_loss": 0.035452887415885925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7726444639265537e-05, + "grad_norm": 25.47976303100586, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8778553605079651, + "num_tokens": 275780915.0, + "step": 7230 + }, + { + "epoch": 0.9198575244879786, + "ewc_loss": 0.035425540059804916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.771276947692968e-05, + "grad_norm": 25.474443435668945, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8537962436676025, + "num_tokens": 275817832.0, + "step": 7231 + }, + { + "epoch": 0.9199847347665692, + "ewc_loss": 0.03541101515293121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7705508071230724e-05, + "grad_norm": 25.533540725708008, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8641865253448486, + "num_tokens": 275857621.0, + "step": 7232 + }, + { + "epoch": 0.9201119450451597, + "ewc_loss": 0.03531069681048393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7655347619438544e-05, + "grad_norm": 25.456899642944336, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8566138744354248, + "num_tokens": 275898917.0, + "step": 7233 + }, + { + "epoch": 0.9202391553237501, + "ewc_loss": 0.03537128120660782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7685641068965197e-05, + "grad_norm": 25.507144927978516, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8639816045761108, + "num_tokens": 275930151.0, + "step": 7234 + }, + { + "epoch": 0.9203663656023406, + "ewc_loss": 0.035390984266996384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.76954927155748e-05, + "grad_norm": 25.567724227905273, + "learning_rate": 1e-06, + "loss": 0.4341, + 
"mean_token_accuracy": 0.8674943447113037, + "num_tokens": 275966901.0, + "step": 7235 + }, + { + "epoch": 0.9204935758809312, + "ewc_loss": 0.03539790213108063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769895061443094e-05, + "grad_norm": 25.473827362060547, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8824607729911804, + "num_tokens": 276002845.0, + "step": 7236 + }, + { + "epoch": 0.9206207861595217, + "ewc_loss": 0.035337645560503006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.766882269294001e-05, + "grad_norm": 25.53205108642578, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8578688502311707, + "num_tokens": 276038476.0, + "step": 7237 + }, + { + "epoch": 0.9207479964381122, + "ewc_loss": 0.03538743034005165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7693715562927537e-05, + "grad_norm": 25.589080810546875, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8558231592178345, + "num_tokens": 276077867.0, + "step": 7238 + }, + { + "epoch": 0.9208752067167028, + "ewc_loss": 0.03533902391791344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7669512089923956e-05, + "grad_norm": 25.471044540405273, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8497359752655029, + "num_tokens": 276123289.0, + "step": 7239 + }, + { + "epoch": 0.9210024169952932, + "ewc_loss": 0.035320866852998734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7660433513810858e-05, + "grad_norm": 25.55132293701172, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8473948836326599, + "num_tokens": 276157357.0, + "step": 7240 + }, + { + "epoch": 0.9211296272738837, + "ewc_loss": 0.03539303317666054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7696516806608997e-05, + "grad_norm": 25.521657943725586, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8802649974822998, + "num_tokens": 276195548.0, + "step": 7241 + }, + { + "epoch": 
0.9212568375524742, + "ewc_loss": 0.035353343933820724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7676671632216312e-05, + "grad_norm": 25.536537170410156, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8622208833694458, + "num_tokens": 276233533.0, + "step": 7242 + }, + { + "epoch": 0.9213840478310648, + "ewc_loss": 0.03544210270047188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7721051335684024e-05, + "grad_norm": 25.58843231201172, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8637241721153259, + "num_tokens": 276274310.0, + "step": 7243 + }, + { + "epoch": 0.9215112581096553, + "ewc_loss": 0.03534804284572601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7674021364655346e-05, + "grad_norm": 25.40930938720703, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8567144870758057, + "num_tokens": 276310272.0, + "step": 7244 + }, + { + "epoch": 0.9216384683882458, + "ewc_loss": 0.035396773368120193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7698386727715842e-05, + "grad_norm": 25.53508758544922, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.854434609413147, + "num_tokens": 276350334.0, + "step": 7245 + }, + { + "epoch": 0.9217656786668362, + "ewc_loss": 0.03545030206441879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7725151337799616e-05, + "grad_norm": 25.519865036010742, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8544381260871887, + "num_tokens": 276393913.0, + "step": 7246 + }, + { + "epoch": 0.9218928889454268, + "ewc_loss": 0.03540777042508125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.770388553268276e-05, + "grad_norm": 25.53801727294922, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8569170832633972, + "num_tokens": 276430368.0, + "step": 7247 + }, + { + "epoch": 0.9220200992240173, + "ewc_loss": 0.03541535139083862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.770767630659975e-05, + "grad_norm": 25.483184814453125, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8536865711212158, + "num_tokens": 276470930.0, + "step": 7248 + }, + { + "epoch": 0.9221473095026078, + "ewc_loss": 0.03536021336913109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7680105884210207e-05, + "grad_norm": 25.4903621673584, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.864743709564209, + "num_tokens": 276509359.0, + "step": 7249 + }, + { + "epoch": 0.9222745197811983, + "ewc_loss": 0.03542333096265793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7711665350361727e-05, + "grad_norm": 25.44791030883789, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8522518873214722, + "num_tokens": 276546980.0, + "step": 7250 + }, + { + "epoch": 0.9224017300597889, + "ewc_loss": 0.03541148453950882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7705742720863782e-05, + "grad_norm": 25.48145294189453, + "learning_rate": 1e-06, + "loss": 0.4999, + "mean_token_accuracy": 0.8470511436462402, + "num_tokens": 276586212.0, + "step": 7251 + }, + { + "epoch": 0.9225289403383793, + "ewc_loss": 0.03547689691185951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7738448150339536e-05, + "grad_norm": 25.556800842285156, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.861918568611145, + "num_tokens": 276623946.0, + "step": 7252 + }, + { + "epoch": 0.9226561506169698, + "ewc_loss": 0.03542771562933922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7713857232593e-05, + "grad_norm": 25.53250503540039, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.85380619764328, + "num_tokens": 276667567.0, + "step": 7253 + }, + { + "epoch": 0.9227833608955603, + "ewc_loss": 0.0353759340941906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7687967556412332e-05, + "grad_norm": 25.464685440063477, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 
0.8533358573913574, + "num_tokens": 276708479.0, + "step": 7254 + }, + { + "epoch": 0.9229105711741509, + "ewc_loss": 0.03537312150001526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.768656147760339e-05, + "grad_norm": 25.51678466796875, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8546755909919739, + "num_tokens": 276744627.0, + "step": 7255 + }, + { + "epoch": 0.9230377814527414, + "ewc_loss": 0.03540316969156265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7701584511087276e-05, + "grad_norm": 25.621196746826172, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8601469993591309, + "num_tokens": 276782830.0, + "step": 7256 + }, + { + "epoch": 0.9231649917313319, + "ewc_loss": 0.03537170588970184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7685852071736008e-05, + "grad_norm": 25.49921226501465, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8578311800956726, + "num_tokens": 276825745.0, + "step": 7257 + }, + { + "epoch": 0.9232922020099223, + "ewc_loss": 0.03535836562514305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767918365658261e-05, + "grad_norm": 25.62017059326172, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.865409255027771, + "num_tokens": 276866743.0, + "step": 7258 + }, + { + "epoch": 0.9234194122885129, + "ewc_loss": 0.03544246777892113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.772123323462438e-05, + "grad_norm": 25.6074161529541, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8550839424133301, + "num_tokens": 276903884.0, + "step": 7259 + }, + { + "epoch": 0.9235466225671034, + "ewc_loss": 0.03534746170043945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767373032635078e-05, + "grad_norm": 25.576810836791992, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8648097515106201, + "num_tokens": 276939142.0, + "step": 7260 + }, + { + "epoch": 0.9236738328456939, + "ewc_loss": 
0.035415537655353546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7707769075059332e-05, + "grad_norm": 25.57139015197754, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8608424663543701, + "num_tokens": 276979041.0, + "step": 7261 + }, + { + "epoch": 0.9238010431242845, + "ewc_loss": 0.035420503467321396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7710251995595172e-05, + "grad_norm": 25.531517028808594, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8717316389083862, + "num_tokens": 277015590.0, + "step": 7262 + }, + { + "epoch": 0.923928253402875, + "ewc_loss": 0.03542979434132576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7714897694531828e-05, + "grad_norm": 25.600040435791016, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8522685766220093, + "num_tokens": 277055745.0, + "step": 7263 + }, + { + "epoch": 0.9240554636814655, + "ewc_loss": 0.03542555123567581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.771277493389789e-05, + "grad_norm": 25.56557846069336, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8556308746337891, + "num_tokens": 277098856.0, + "step": 7264 + }, + { + "epoch": 0.9241826739600559, + "ewc_loss": 0.03535208851099014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767604408087209e-05, + "grad_norm": 25.514938354492188, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8702996969223022, + "num_tokens": 277139165.0, + "step": 7265 + }, + { + "epoch": 0.9243098842386465, + "ewc_loss": 0.035360101610422134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.76800513145281e-05, + "grad_norm": 25.574522018432617, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8636791706085205, + "num_tokens": 277171879.0, + "step": 7266 + }, + { + "epoch": 0.924437094517237, + "ewc_loss": 0.03536677360534668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7683387341094203e-05, + "grad_norm": 
25.476776123046875, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8608872294425964, + "num_tokens": 277211719.0, + "step": 7267 + }, + { + "epoch": 0.9245643047958275, + "ewc_loss": 0.035379257053136826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.768962829373777e-05, + "grad_norm": 25.66383934020996, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8511670827865601, + "num_tokens": 277252675.0, + "step": 7268 + }, + { + "epoch": 0.924691515074418, + "ewc_loss": 0.03542545065283775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7712725821183994e-05, + "grad_norm": 25.5869197845459, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8479955196380615, + "num_tokens": 277287529.0, + "step": 7269 + }, + { + "epoch": 0.9248187253530086, + "ewc_loss": 0.035360001027584076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.76800003828248e-05, + "grad_norm": 25.627338409423828, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8593499660491943, + "num_tokens": 277328392.0, + "step": 7270 + }, + { + "epoch": 0.924945935631599, + "ewc_loss": 0.035379767417907715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7689882952254266e-05, + "grad_norm": 25.554410934448242, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8694556951522827, + "num_tokens": 277368412.0, + "step": 7271 + }, + { + "epoch": 0.9250731459101895, + "ewc_loss": 0.03531505540013313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7657528587733395e-05, + "grad_norm": 25.50597381591797, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8646262884140015, + "num_tokens": 277404373.0, + "step": 7272 + }, + { + "epoch": 0.92520035618878, + "ewc_loss": 0.035392407327890396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7696203940431587e-05, + "grad_norm": 25.5989933013916, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8571528196334839, + "num_tokens": 
277444988.0, + "step": 7273 + }, + { + "epoch": 0.9253275664673706, + "ewc_loss": 0.035359106957912445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767955291143153e-05, + "grad_norm": 25.50309181213379, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8691223859786987, + "num_tokens": 277487501.0, + "step": 7274 + }, + { + "epoch": 0.9254547767459611, + "ewc_loss": 0.03537818044424057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.768908987287432e-05, + "grad_norm": 25.64631462097168, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.855736255645752, + "num_tokens": 277529056.0, + "step": 7275 + }, + { + "epoch": 0.9255819870245516, + "ewc_loss": 0.03544219583272934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7721098629408516e-05, + "grad_norm": 25.519302368164062, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8717548847198486, + "num_tokens": 277563022.0, + "step": 7276 + }, + { + "epoch": 0.925709197303142, + "ewc_loss": 0.03536263108253479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7681315512163565e-05, + "grad_norm": 25.60020637512207, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8674931526184082, + "num_tokens": 277603879.0, + "step": 7277 + }, + { + "epoch": 0.9258364075817326, + "ewc_loss": 0.03542168438434601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.771084134816192e-05, + "grad_norm": 25.594327926635742, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8552539348602295, + "num_tokens": 277643711.0, + "step": 7278 + }, + { + "epoch": 0.9259636178603231, + "ewc_loss": 0.03532009571790695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7660047888057306e-05, + "grad_norm": 25.4597110748291, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8592902421951294, + "num_tokens": 277680667.0, + "step": 7279 + }, + { + "epoch": 0.9260908281389136, + "ewc_loss": 0.03536458685994148, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 1.7682294128462672e-05, + "grad_norm": 25.714351654052734, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8572580218315125, + "num_tokens": 277715548.0, + "step": 7280 + }, + { + "epoch": 0.9262180384175042, + "ewc_loss": 0.0354485921561718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.772429641277995e-05, + "grad_norm": 25.60851287841797, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8586715459823608, + "num_tokens": 277745287.0, + "step": 7281 + }, + { + "epoch": 0.9263452486960947, + "ewc_loss": 0.035378992557525635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.768949550751131e-05, + "grad_norm": 25.538877487182617, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8522898554801941, + "num_tokens": 277781484.0, + "step": 7282 + }, + { + "epoch": 0.9264724589746851, + "ewc_loss": 0.03540385887026787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.770193011907395e-05, + "grad_norm": 25.595945358276367, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8619830012321472, + "num_tokens": 277818429.0, + "step": 7283 + }, + { + "epoch": 0.9265996692532756, + "ewc_loss": 0.03543020039796829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7715099602355622e-05, + "grad_norm": 25.57829475402832, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8675013184547424, + "num_tokens": 277853077.0, + "step": 7284 + }, + { + "epoch": 0.9267268795318662, + "ewc_loss": 0.03542974218726158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7714870409690775e-05, + "grad_norm": 25.537561416625977, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8773881793022156, + "num_tokens": 277893997.0, + "step": 7285 + }, + { + "epoch": 0.9268540898104567, + "ewc_loss": 0.035465117543935776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7732558262650855e-05, + "grad_norm": 25.571392059326172, + "learning_rate": 1e-06, + 
"loss": 0.4819, + "mean_token_accuracy": 0.8491382598876953, + "num_tokens": 277934680.0, + "step": 7286 + }, + { + "epoch": 0.9269813000890472, + "ewc_loss": 0.03544320911169052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7721604308462702e-05, + "grad_norm": 25.559823989868164, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8521429300308228, + "num_tokens": 277980370.0, + "step": 7287 + }, + { + "epoch": 0.9271085103676378, + "ewc_loss": 0.03540189564228058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7700947864796035e-05, + "grad_norm": 25.61714744567871, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8719829320907593, + "num_tokens": 278017257.0, + "step": 7288 + }, + { + "epoch": 0.9272357206462282, + "ewc_loss": 0.03545350208878517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7726750229485333e-05, + "grad_norm": 25.49347686767578, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8594365119934082, + "num_tokens": 278054835.0, + "step": 7289 + }, + { + "epoch": 0.9273629309248187, + "ewc_loss": 0.035394784063100815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7697391740512103e-05, + "grad_norm": 25.611635208129883, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8583080172538757, + "num_tokens": 278089510.0, + "step": 7290 + }, + { + "epoch": 0.9274901412034092, + "ewc_loss": 0.035448960959911346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7724480130709708e-05, + "grad_norm": 25.51395606994629, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8612788319587708, + "num_tokens": 278126179.0, + "step": 7291 + }, + { + "epoch": 0.9276173514819998, + "ewc_loss": 0.03542212396860123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.771106144587975e-05, + "grad_norm": 25.557342529296875, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8373851180076599, + "num_tokens": 278170837.0, + "step": 7292 + }, + { + 
"epoch": 0.9277445617605903, + "ewc_loss": 0.03540465980768204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.770233029674273e-05, + "grad_norm": 25.49595069885254, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8649508953094482, + "num_tokens": 278207410.0, + "step": 7293 + }, + { + "epoch": 0.9278717720391808, + "ewc_loss": 0.035456083714962006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.772804171196185e-05, + "grad_norm": 25.544065475463867, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8791240453720093, + "num_tokens": 278240495.0, + "step": 7294 + }, + { + "epoch": 0.9279989823177712, + "ewc_loss": 0.03552338480949402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776169301592745e-05, + "grad_norm": 25.62248992919922, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8510307669639587, + "num_tokens": 278278449.0, + "step": 7295 + }, + { + "epoch": 0.9281261925963618, + "ewc_loss": 0.03548440337181091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7742202544468455e-05, + "grad_norm": 25.524198532104492, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8697823286056519, + "num_tokens": 278321434.0, + "step": 7296 + }, + { + "epoch": 0.9282534028749523, + "ewc_loss": 0.035445958375930786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7722979464451782e-05, + "grad_norm": 25.617956161499023, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8541898727416992, + "num_tokens": 278363730.0, + "step": 7297 + }, + { + "epoch": 0.9283806131535428, + "ewc_loss": 0.03546951711177826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7734759239829145e-05, + "grad_norm": 25.571487426757812, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8612386584281921, + "num_tokens": 278395493.0, + "step": 7298 + }, + { + "epoch": 0.9285078234321333, + "ewc_loss": 0.03547216206789017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7736081645125523e-05, + "grad_norm": 25.576459884643555, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8619365692138672, + "num_tokens": 278433024.0, + "step": 7299 + }, + { + "epoch": 0.9286350337107239, + "ewc_loss": 0.035495657473802567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7747828678693622e-05, + "grad_norm": 25.551986694335938, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8619798421859741, + "num_tokens": 278471311.0, + "step": 7300 + }, + { + "epoch": 0.9287622439893143, + "ewc_loss": 0.035467974841594696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7733987988322042e-05, + "grad_norm": 25.54410743713379, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8796769380569458, + "num_tokens": 278509151.0, + "step": 7301 + }, + { + "epoch": 0.9288894542679048, + "ewc_loss": 0.035465434193611145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7732716514728963e-05, + "grad_norm": 25.674301147460938, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8675490617752075, + "num_tokens": 278548743.0, + "step": 7302 + }, + { + "epoch": 0.9290166645464953, + "ewc_loss": 0.035445231944322586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7722615666571073e-05, + "grad_norm": 25.566476821899414, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8718814849853516, + "num_tokens": 278590722.0, + "step": 7303 + }, + { + "epoch": 0.9291438748250859, + "ewc_loss": 0.03539486601948738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7697433577268384e-05, + "grad_norm": 25.58020782470703, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.846729576587677, + "num_tokens": 278628150.0, + "step": 7304 + }, + { + "epoch": 0.9292710851036764, + "ewc_loss": 0.03547034412622452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.773517215042375e-05, + "grad_norm": 25.55857276916504, + "learning_rate": 1e-06, + "loss": 0.4948, + 
"mean_token_accuracy": 0.8438553810119629, + "num_tokens": 278672543.0, + "step": 7305 + }, + { + "epoch": 0.9293982953822669, + "ewc_loss": 0.035428307950496674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7714153727865778e-05, + "grad_norm": 25.524507522583008, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8691601157188416, + "num_tokens": 278707234.0, + "step": 7306 + }, + { + "epoch": 0.9295255056608573, + "ewc_loss": 0.0353831946849823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769159825926181e-05, + "grad_norm": 25.609407424926758, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8632791042327881, + "num_tokens": 278734678.0, + "step": 7307 + }, + { + "epoch": 0.9296527159394479, + "ewc_loss": 0.03540782257914543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.770391099853441e-05, + "grad_norm": 25.451631546020508, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8695478439331055, + "num_tokens": 278778284.0, + "step": 7308 + }, + { + "epoch": 0.9297799262180384, + "ewc_loss": 0.03538045659661293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769022856024094e-05, + "grad_norm": 25.537723541259766, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8601354956626892, + "num_tokens": 278815363.0, + "step": 7309 + }, + { + "epoch": 0.9299071364966289, + "ewc_loss": 0.03550787642598152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7753938664100133e-05, + "grad_norm": 25.61742401123047, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8485175371170044, + "num_tokens": 278860777.0, + "step": 7310 + }, + { + "epoch": 0.9300343467752195, + "ewc_loss": 0.03537992760539055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.768996298778802e-05, + "grad_norm": 25.508169174194336, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8743560314178467, + "num_tokens": 278890552.0, + "step": 7311 + }, + { + "epoch": 
0.93016155705381, + "ewc_loss": 0.035510096698999405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7755048247636296e-05, + "grad_norm": 25.72640609741211, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8490270376205444, + "num_tokens": 278924741.0, + "step": 7312 + }, + { + "epoch": 0.9302887673324005, + "ewc_loss": 0.03542984277009964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7714921341394074e-05, + "grad_norm": 25.51026153564453, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8685778379440308, + "num_tokens": 278965185.0, + "step": 7313 + }, + { + "epoch": 0.9304159776109909, + "ewc_loss": 0.0354192852973938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7709642634144984e-05, + "grad_norm": 25.662899017333984, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8496453166007996, + "num_tokens": 278998961.0, + "step": 7314 + }, + { + "epoch": 0.9305431878895815, + "ewc_loss": 0.03552785888314247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7763928553904407e-05, + "grad_norm": 25.663772583007812, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.848034143447876, + "num_tokens": 279032689.0, + "step": 7315 + }, + { + "epoch": 0.930670398168172, + "ewc_loss": 0.03539394959807396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769697519193869e-05, + "grad_norm": 25.49281883239746, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.846855103969574, + "num_tokens": 279080305.0, + "step": 7316 + }, + { + "epoch": 0.9307976084467625, + "ewc_loss": 0.03548480570316315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7742402633302845e-05, + "grad_norm": 25.628387451171875, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8612174987792969, + "num_tokens": 279120581.0, + "step": 7317 + }, + { + "epoch": 0.930924818725353, + "ewc_loss": 0.035471361130476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7735681467456743e-05, + 
"grad_norm": 25.468774795532227, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8720124959945679, + "num_tokens": 279165050.0, + "step": 7318 + }, + { + "epoch": 0.9310520290039436, + "ewc_loss": 0.03549090772867203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.774545307853259e-05, + "grad_norm": 25.655685424804688, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8643938302993774, + "num_tokens": 279209393.0, + "step": 7319 + }, + { + "epoch": 0.931179239282534, + "ewc_loss": 0.03548910841345787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.774455449776724e-05, + "grad_norm": 25.4979248046875, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.848422646522522, + "num_tokens": 279250569.0, + "step": 7320 + }, + { + "epoch": 0.9313064495611245, + "ewc_loss": 0.03544604033231735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.772301948221866e-05, + "grad_norm": 25.58412742614746, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8657894730567932, + "num_tokens": 279291154.0, + "step": 7321 + }, + { + "epoch": 0.931433659839715, + "ewc_loss": 0.03549840301275253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.77492020156933e-05, + "grad_norm": 25.61869239807129, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8599879741668701, + "num_tokens": 279326967.0, + "step": 7322 + }, + { + "epoch": 0.9315608701183056, + "ewc_loss": 0.03554272651672363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.77713627635967e-05, + "grad_norm": 25.51000213623047, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.873030424118042, + "num_tokens": 279367129.0, + "step": 7323 + }, + { + "epoch": 0.9316880803968961, + "ewc_loss": 0.035490263253450394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7745131117408164e-05, + "grad_norm": 25.6245059967041, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8659974932670593, + 
"num_tokens": 279403651.0, + "step": 7324 + }, + { + "epoch": 0.9318152906754866, + "ewc_loss": 0.03552214428782463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776107274054084e-05, + "grad_norm": 25.60496711730957, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8528150320053101, + "num_tokens": 279439166.0, + "step": 7325 + }, + { + "epoch": 0.931942500954077, + "ewc_loss": 0.035503167659044266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.775158307282254e-05, + "grad_norm": 25.616830825805664, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8580045700073242, + "num_tokens": 279473489.0, + "step": 7326 + }, + { + "epoch": 0.9320697112326676, + "ewc_loss": 0.03552168235182762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776084172888659e-05, + "grad_norm": 25.628250122070312, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8638159036636353, + "num_tokens": 279511709.0, + "step": 7327 + }, + { + "epoch": 0.9321969215112581, + "ewc_loss": 0.03543303161859512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7716516595100984e-05, + "grad_norm": 25.462709426879883, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8487870097160339, + "num_tokens": 279551638.0, + "step": 7328 + }, + { + "epoch": 0.9323241317898486, + "ewc_loss": 0.035463638603687286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7731819752953015e-05, + "grad_norm": 25.709415435791016, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8716976642608643, + "num_tokens": 279587304.0, + "step": 7329 + }, + { + "epoch": 0.9324513420684392, + "ewc_loss": 0.03551940247416496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7759701222530566e-05, + "grad_norm": 25.50275993347168, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8516502976417542, + "num_tokens": 279628138.0, + "step": 7330 + }, + { + "epoch": 0.9325785523470297, + "ewc_loss": 0.03542604669928551, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7713024135446176e-05, + "grad_norm": 25.686777114868164, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8648453950881958, + "num_tokens": 279663064.0, + "step": 7331 + }, + { + "epoch": 0.9327057626256201, + "ewc_loss": 0.035479847341775894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7739923350745812e-05, + "grad_norm": 25.55974769592285, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8516647815704346, + "num_tokens": 279704937.0, + "step": 7332 + }, + { + "epoch": 0.9328329729042106, + "ewc_loss": 0.03542377054691315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7711885448079556e-05, + "grad_norm": 25.809640884399414, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8633560538291931, + "num_tokens": 279738215.0, + "step": 7333 + }, + { + "epoch": 0.9329601831828012, + "ewc_loss": 0.035508908331394196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7754453438101336e-05, + "grad_norm": 25.614795684814453, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8447213172912598, + "num_tokens": 279771996.0, + "step": 7334 + }, + { + "epoch": 0.9330873934613917, + "ewc_loss": 0.035399727523326874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769986374711152e-05, + "grad_norm": 25.772565841674805, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8745731115341187, + "num_tokens": 279810037.0, + "step": 7335 + }, + { + "epoch": 0.9332146037399822, + "ewc_loss": 0.03544357419013977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7721786207403056e-05, + "grad_norm": 25.649852752685547, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8535072207450867, + "num_tokens": 279847394.0, + "step": 7336 + }, + { + "epoch": 0.9333418140185727, + "ewc_loss": 0.03540409356355667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7702046534395777e-05, + "grad_norm": 25.756622314453125, + 
"learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8461855053901672, + "num_tokens": 279879057.0, + "step": 7337 + }, + { + "epoch": 0.9334690242971632, + "ewc_loss": 0.03543199971318245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7716000002110377e-05, + "grad_norm": 25.6379451751709, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.853512704372406, + "num_tokens": 279911775.0, + "step": 7338 + }, + { + "epoch": 0.9335962345757537, + "ewc_loss": 0.03542642667889595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7713213310344145e-05, + "grad_norm": 25.86357879638672, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8626692295074463, + "num_tokens": 279947661.0, + "step": 7339 + }, + { + "epoch": 0.9337234448543442, + "ewc_loss": 0.03543020784854889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.771510324033443e-05, + "grad_norm": 25.792261123657227, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.861143946647644, + "num_tokens": 279977507.0, + "step": 7340 + }, + { + "epoch": 0.9338506551329347, + "ewc_loss": 0.03530091047286987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7650454537943006e-05, + "grad_norm": 25.625080108642578, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8569256663322449, + "num_tokens": 280016749.0, + "step": 7341 + }, + { + "epoch": 0.9339778654115253, + "ewc_loss": 0.035416364669799805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7708181985653937e-05, + "grad_norm": 25.750652313232422, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8623936772346497, + "num_tokens": 280056092.0, + "step": 7342 + }, + { + "epoch": 0.9341050756901158, + "ewc_loss": 0.03546954318881035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.773477197275497e-05, + "grad_norm": 25.78571128845215, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8721640110015869, + "num_tokens": 280089803.0, + 
"step": 7343 + }, + { + "epoch": 0.9342322859687062, + "ewc_loss": 0.03535780310630798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.767890171322506e-05, + "grad_norm": 25.616060256958008, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.847434401512146, + "num_tokens": 280126190.0, + "step": 7344 + }, + { + "epoch": 0.9343594962472968, + "ewc_loss": 0.035398490726947784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7699245290714316e-05, + "grad_norm": 25.705827713012695, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8723926544189453, + "num_tokens": 280162743.0, + "step": 7345 + }, + { + "epoch": 0.9344867065258873, + "ewc_loss": 0.03544272482395172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.772136238287203e-05, + "grad_norm": 25.60915756225586, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8602908849716187, + "num_tokens": 280203676.0, + "step": 7346 + }, + { + "epoch": 0.9346139168044778, + "ewc_loss": 0.03543687239289284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7718435628921725e-05, + "grad_norm": 25.785009384155273, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8773044943809509, + "num_tokens": 280241765.0, + "step": 7347 + }, + { + "epoch": 0.9347411270830683, + "ewc_loss": 0.035445015877485275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7722508346196264e-05, + "grad_norm": 25.566940307617188, + "learning_rate": 1e-06, + "loss": 0.546, + "mean_token_accuracy": 0.839599609375, + "num_tokens": 280280103.0, + "step": 7348 + }, + { + "epoch": 0.9348683373616589, + "ewc_loss": 0.03537747636437416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7688738807919435e-05, + "grad_norm": 25.733362197875977, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8435064554214478, + "num_tokens": 280314674.0, + "step": 7349 + }, + { + "epoch": 0.9349955476402493, + "ewc_loss": 0.035518448799848557, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.7759224647306837e-05, + "grad_norm": 25.548654556274414, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8617262840270996, + "num_tokens": 280353890.0, + "step": 7350 + }, + { + "epoch": 0.9351227579188398, + "ewc_loss": 0.03551260754466057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7756303350324742e-05, + "grad_norm": 25.644886016845703, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8522500991821289, + "num_tokens": 280396403.0, + "step": 7351 + }, + { + "epoch": 0.9352499681974303, + "ewc_loss": 0.035561420023441315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.778071055014152e-05, + "grad_norm": 25.598073959350586, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.861355185508728, + "num_tokens": 280436154.0, + "step": 7352 + }, + { + "epoch": 0.9353771784760209, + "ewc_loss": 0.03558250516653061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7791253412724473e-05, + "grad_norm": 25.601289749145508, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8680371046066284, + "num_tokens": 280476480.0, + "step": 7353 + }, + { + "epoch": 0.9355043887546114, + "ewc_loss": 0.035470619797706604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.773531039361842e-05, + "grad_norm": 25.608888626098633, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8644511699676514, + "num_tokens": 280505976.0, + "step": 7354 + }, + { + "epoch": 0.9356315990332019, + "ewc_loss": 0.03555018827319145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.777509351086337e-05, + "grad_norm": 25.585845947265625, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8569798469543457, + "num_tokens": 280548365.0, + "step": 7355 + }, + { + "epoch": 0.9357588093117923, + "ewc_loss": 0.03552732616662979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776366298145149e-05, + "grad_norm": 25.6391658782959, + "learning_rate": 1e-06, + "loss": 
0.4009, + "mean_token_accuracy": 0.8783226013183594, + "num_tokens": 280588074.0, + "step": 7356 + }, + { + "epoch": 0.9358860195903829, + "ewc_loss": 0.035588063299655914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7794031009543687e-05, + "grad_norm": 25.684171676635742, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8706192970275879, + "num_tokens": 280626915.0, + "step": 7357 + }, + { + "epoch": 0.9360132298689734, + "ewc_loss": 0.035599883645772934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7799940906115808e-05, + "grad_norm": 25.70817756652832, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8546469211578369, + "num_tokens": 280660088.0, + "step": 7358 + }, + { + "epoch": 0.9361404401475639, + "ewc_loss": 0.03554821386933327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.777410761860665e-05, + "grad_norm": 25.584257125854492, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8593710660934448, + "num_tokens": 280701326.0, + "step": 7359 + }, + { + "epoch": 0.9362676504261545, + "ewc_loss": 0.03554414585232735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.777207216946408e-05, + "grad_norm": 25.701871871948242, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8702639937400818, + "num_tokens": 280740256.0, + "step": 7360 + }, + { + "epoch": 0.936394860704745, + "ewc_loss": 0.03554082661867142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7770413251128048e-05, + "grad_norm": 25.618934631347656, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8748331069946289, + "num_tokens": 280773515.0, + "step": 7361 + }, + { + "epoch": 0.9365220709833355, + "ewc_loss": 0.035541582852602005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7770791600923985e-05, + "grad_norm": 25.6298885345459, + "learning_rate": 1e-06, + "loss": 0.5225, + "mean_token_accuracy": 0.8351131677627563, + "num_tokens": 280817207.0, + "step": 7362 + }, + { + "epoch": 
0.9366492812619259, + "ewc_loss": 0.03551207855343819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.775603959686123e-05, + "grad_norm": 25.608001708984375, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.8477445840835571, + "num_tokens": 280860253.0, + "step": 7363 + }, + { + "epoch": 0.9367764915405165, + "ewc_loss": 0.03554548695683479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.777274337655399e-05, + "grad_norm": 25.665006637573242, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8571845293045044, + "num_tokens": 280892634.0, + "step": 7364 + }, + { + "epoch": 0.936903701819107, + "ewc_loss": 0.03548601642251015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7743008356774226e-05, + "grad_norm": 25.567535400390625, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8691133856773376, + "num_tokens": 280930912.0, + "step": 7365 + }, + { + "epoch": 0.9370309120976975, + "ewc_loss": 0.035541437566280365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7770718841347843e-05, + "grad_norm": 25.65090560913086, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8639991879463196, + "num_tokens": 280971588.0, + "step": 7366 + }, + { + "epoch": 0.937158122376288, + "ewc_loss": 0.03563389554619789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.781694845703896e-05, + "grad_norm": 25.646839141845703, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8692570328712463, + "num_tokens": 281009898.0, + "step": 7367 + }, + { + "epoch": 0.9372853326548786, + "ewc_loss": 0.03552110493183136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7760552509571426e-05, + "grad_norm": 25.76173210144043, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8532132506370544, + "num_tokens": 281046726.0, + "step": 7368 + }, + { + "epoch": 0.937412542933469, + "ewc_loss": 0.03557130694389343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7785652744350955e-05, + "grad_norm": 25.61397933959961, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8744242787361145, + "num_tokens": 281085163.0, + "step": 7369 + }, + { + "epoch": 0.9375397532120595, + "ewc_loss": 0.035536982119083405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.77684905793285e-05, + "grad_norm": 25.679996490478516, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8663421869277954, + "num_tokens": 281128660.0, + "step": 7370 + }, + { + "epoch": 0.93766696349065, + "ewc_loss": 0.035539206117391586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776960380084347e-05, + "grad_norm": 25.70037078857422, + "learning_rate": 1e-06, + "loss": 0.5196, + "mean_token_accuracy": 0.8416454792022705, + "num_tokens": 281163780.0, + "step": 7371 + }, + { + "epoch": 0.9377941737692406, + "ewc_loss": 0.03551146388053894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.775573218765203e-05, + "grad_norm": 25.692522048950195, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8686134219169617, + "num_tokens": 281204247.0, + "step": 7372 + }, + { + "epoch": 0.9379213840478311, + "ewc_loss": 0.035518549382686615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7759273760020733e-05, + "grad_norm": 25.608564376831055, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8848884105682373, + "num_tokens": 281239518.0, + "step": 7373 + }, + { + "epoch": 0.9380485943264216, + "ewc_loss": 0.03551439940929413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.775720011210069e-05, + "grad_norm": 25.68734359741211, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.869458019733429, + "num_tokens": 281275916.0, + "step": 7374 + }, + { + "epoch": 0.938175804605012, + "ewc_loss": 0.035512879490852356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.775643977453001e-05, + "grad_norm": 25.711219787597656, + "learning_rate": 1e-06, + "loss": 0.4016, + 
"mean_token_accuracy": 0.8715319633483887, + "num_tokens": 281307615.0, + "step": 7375 + }, + { + "epoch": 0.9383030148836026, + "ewc_loss": 0.03548417240381241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7742086129146628e-05, + "grad_norm": 25.65777015686035, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8653209209442139, + "num_tokens": 281342677.0, + "step": 7376 + }, + { + "epoch": 0.9384302251621931, + "ewc_loss": 0.03554510697722435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.777255420165602e-05, + "grad_norm": 25.709331512451172, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.856706976890564, + "num_tokens": 281379545.0, + "step": 7377 + }, + { + "epoch": 0.9385574354407836, + "ewc_loss": 0.03555908426642418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7779542758944444e-05, + "grad_norm": 25.655353546142578, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8418295979499817, + "num_tokens": 281424169.0, + "step": 7378 + }, + { + "epoch": 0.9386846457193742, + "ewc_loss": 0.035493191331624985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7746595403878018e-05, + "grad_norm": 25.67971420288086, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8810268044471741, + "num_tokens": 281461306.0, + "step": 7379 + }, + { + "epoch": 0.9388118559979647, + "ewc_loss": 0.03554267808794975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7771339116734453e-05, + "grad_norm": 25.636850357055664, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8730392456054688, + "num_tokens": 281498753.0, + "step": 7380 + }, + { + "epoch": 0.9389390662765551, + "ewc_loss": 0.03551296889781952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7756485249265097e-05, + "grad_norm": 25.718830108642578, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8699741363525391, + "num_tokens": 281537163.0, + "step": 7381 + }, + { + "epoch": 
0.9390662765551456, + "ewc_loss": 0.03551773354411125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.775886630639434e-05, + "grad_norm": 25.705385208129883, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8524147272109985, + "num_tokens": 281577926.0, + "step": 7382 + }, + { + "epoch": 0.9391934868337362, + "ewc_loss": 0.035483527928590775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.77417641680222e-05, + "grad_norm": 25.599517822265625, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8796648383140564, + "num_tokens": 281621283.0, + "step": 7383 + }, + { + "epoch": 0.9393206971123267, + "ewc_loss": 0.035533174872398376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776658791641239e-05, + "grad_norm": 25.79828643798828, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8639839887619019, + "num_tokens": 281658336.0, + "step": 7384 + }, + { + "epoch": 0.9394479073909172, + "ewc_loss": 0.035515010356903076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7757505702320486e-05, + "grad_norm": 25.547550201416016, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8642375469207764, + "num_tokens": 281695137.0, + "step": 7385 + }, + { + "epoch": 0.9395751176695077, + "ewc_loss": 0.0355091318488121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7754566215444356e-05, + "grad_norm": 25.776453018188477, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8751049637794495, + "num_tokens": 281733570.0, + "step": 7386 + }, + { + "epoch": 0.9397023279480982, + "ewc_loss": 0.035538386553525925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776919270923827e-05, + "grad_norm": 25.57353401184082, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8633089065551758, + "num_tokens": 281771163.0, + "step": 7387 + }, + { + "epoch": 0.9398295382266887, + "ewc_loss": 0.035540059208869934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.77700294443639e-05, + "grad_norm": 25.75537109375, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8638226985931396, + "num_tokens": 281813715.0, + "step": 7388 + }, + { + "epoch": 0.9399567485052792, + "ewc_loss": 0.03555530682206154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.777765282895416e-05, + "grad_norm": 25.663278579711914, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8597434759140015, + "num_tokens": 281856921.0, + "step": 7389 + }, + { + "epoch": 0.9400839587838697, + "ewc_loss": 0.03541097790002823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.770548988133669e-05, + "grad_norm": 25.687105178833008, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8530991077423096, + "num_tokens": 281893273.0, + "step": 7390 + }, + { + "epoch": 0.9402111690624603, + "ewc_loss": 0.03549332544207573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.774666270648595e-05, + "grad_norm": 25.63401222229004, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8659929633140564, + "num_tokens": 281927452.0, + "step": 7391 + }, + { + "epoch": 0.9403383793410508, + "ewc_loss": 0.035469669848680496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7734835637384094e-05, + "grad_norm": 25.652841567993164, + "learning_rate": 1e-06, + "loss": 0.36, + "mean_token_accuracy": 0.8868574500083923, + "num_tokens": 281965359.0, + "step": 7392 + }, + { + "epoch": 0.9404655896196412, + "ewc_loss": 0.03548053652048111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7740268958732486e-05, + "grad_norm": 25.777725219726562, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8586850166320801, + "num_tokens": 281999209.0, + "step": 7393 + }, + { + "epoch": 0.9405927998982317, + "ewc_loss": 0.03548389673233032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.774194788595196e-05, + "grad_norm": 25.576032638549805, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 
0.850856363773346, + "num_tokens": 282042129.0, + "step": 7394 + }, + { + "epoch": 0.9407200101768223, + "ewc_loss": 0.03553599491715431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776799763320014e-05, + "grad_norm": 25.867298126220703, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8530443906784058, + "num_tokens": 282087941.0, + "step": 7395 + }, + { + "epoch": 0.9408472204554128, + "ewc_loss": 0.03549088165163994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7745440345606767e-05, + "grad_norm": 25.618711471557617, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8680161833763123, + "num_tokens": 282123130.0, + "step": 7396 + }, + { + "epoch": 0.9409744307340033, + "ewc_loss": 0.03538500517606735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.769250229699537e-05, + "grad_norm": 25.640899658203125, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8570029139518738, + "num_tokens": 282161142.0, + "step": 7397 + }, + { + "epoch": 0.9411016410125939, + "ewc_loss": 0.03556478023529053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7782389477360994e-05, + "grad_norm": 25.67841148376465, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8853264451026917, + "num_tokens": 282196363.0, + "step": 7398 + }, + { + "epoch": 0.9412288512911843, + "ewc_loss": 0.03548359498381615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7741796909831464e-05, + "grad_norm": 25.662696838378906, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8419579267501831, + "num_tokens": 282233202.0, + "step": 7399 + }, + { + "epoch": 0.9413560615697748, + "ewc_loss": 0.03551977127790451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7759884940460324e-05, + "grad_norm": 25.594436645507812, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8622066974639893, + "num_tokens": 282270976.0, + "step": 7400 + }, + { + "epoch": 0.9414832718483653, + "ewc_loss": 
0.03550061210989952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7750306142261252e-05, + "grad_norm": 25.567331314086914, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8718318343162537, + "num_tokens": 282312223.0, + "step": 7401 + }, + { + "epoch": 0.9416104821269559, + "ewc_loss": 0.03557989001274109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7789945559343323e-05, + "grad_norm": 25.590320587158203, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8448159098625183, + "num_tokens": 282350418.0, + "step": 7402 + }, + { + "epoch": 0.9417376924055464, + "ewc_loss": 0.035561997443437576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.778099795046728e-05, + "grad_norm": 25.731998443603516, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8619968891143799, + "num_tokens": 282380751.0, + "step": 7403 + }, + { + "epoch": 0.9418649026841369, + "ewc_loss": 0.035590119659900665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7795060557546094e-05, + "grad_norm": 25.6711483001709, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8725149035453796, + "num_tokens": 282420248.0, + "step": 7404 + }, + { + "epoch": 0.9419921129627273, + "ewc_loss": 0.035610418766736984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7805208699428476e-05, + "grad_norm": 25.636425018310547, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8665060997009277, + "num_tokens": 282457951.0, + "step": 7405 + }, + { + "epoch": 0.9421193232413179, + "ewc_loss": 0.03551186993718147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7755934095475823e-05, + "grad_norm": 25.736160278320312, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.860710084438324, + "num_tokens": 282507379.0, + "step": 7406 + }, + { + "epoch": 0.9422465335199084, + "ewc_loss": 0.035597994923591614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7798996850615367e-05, + "grad_norm": 
25.548555374145508, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8500797748565674, + "num_tokens": 282546905.0, + "step": 7407 + }, + { + "epoch": 0.9423737437984989, + "ewc_loss": 0.03554961830377579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7774809748516418e-05, + "grad_norm": 25.673845291137695, + "learning_rate": 1e-06, + "loss": 0.5411, + "mean_token_accuracy": 0.8331607580184937, + "num_tokens": 282591368.0, + "step": 7408 + }, + { + "epoch": 0.9425009540770894, + "ewc_loss": 0.03563836216926575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7819180357037112e-05, + "grad_norm": 25.6939754486084, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8590360879898071, + "num_tokens": 282617331.0, + "step": 7409 + }, + { + "epoch": 0.94262816435568, + "ewc_loss": 0.03553481772542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.776740828063339e-05, + "grad_norm": 25.648881912231445, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8742542862892151, + "num_tokens": 282652346.0, + "step": 7410 + }, + { + "epoch": 0.9427553746342705, + "ewc_loss": 0.035550110042095184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7775055312085897e-05, + "grad_norm": 25.558103561401367, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8651741147041321, + "num_tokens": 282691564.0, + "step": 7411 + }, + { + "epoch": 0.9428825849128609, + "ewc_loss": 0.03558262065052986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7791309801395983e-05, + "grad_norm": 25.593814849853516, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8711158037185669, + "num_tokens": 282728186.0, + "step": 7412 + }, + { + "epoch": 0.9430097951914514, + "ewc_loss": 0.03564386069774628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7821930669015273e-05, + "grad_norm": 25.693342208862305, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.854352593421936, + 
"num_tokens": 282770491.0, + "step": 7413 + }, + { + "epoch": 0.943137005470042, + "ewc_loss": 0.035617854446172714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7808926713769324e-05, + "grad_norm": 25.554813385009766, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8541408777236938, + "num_tokens": 282809824.0, + "step": 7414 + }, + { + "epoch": 0.9432642157486325, + "ewc_loss": 0.03561171144247055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.780585625965614e-05, + "grad_norm": 25.569820404052734, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.866452157497406, + "num_tokens": 282846735.0, + "step": 7415 + }, + { + "epoch": 0.943391426027223, + "ewc_loss": 0.03569157421588898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7845786715042777e-05, + "grad_norm": 25.70503807067871, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8806480765342712, + "num_tokens": 282882242.0, + "step": 7416 + }, + { + "epoch": 0.9435186363058136, + "ewc_loss": 0.035673778504133224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7836890037870035e-05, + "grad_norm": 25.659622192382812, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8476676940917969, + "num_tokens": 282918774.0, + "step": 7417 + }, + { + "epoch": 0.943645846584404, + "ewc_loss": 0.03561574965715408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7807875337894075e-05, + "grad_norm": 25.632112503051758, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.853121280670166, + "num_tokens": 282957766.0, + "step": 7418 + }, + { + "epoch": 0.9437730568629945, + "ewc_loss": 0.035624537616968155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7812268197303638e-05, + "grad_norm": 25.660951614379883, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8495583534240723, + "num_tokens": 282995980.0, + "step": 7419 + }, + { + "epoch": 0.943900267141585, + "ewc_loss": 0.03560272604227066, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.780136335582938e-05, + "grad_norm": 25.72106170654297, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8626339435577393, + "num_tokens": 283033148.0, + "step": 7420 + }, + { + "epoch": 0.9440274774201756, + "ewc_loss": 0.035614773631095886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.780738602974452e-05, + "grad_norm": 25.669090270996094, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8664014339447021, + "num_tokens": 283069781.0, + "step": 7421 + }, + { + "epoch": 0.9441546876987661, + "ewc_loss": 0.03558859974145889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7794300219975412e-05, + "grad_norm": 25.643796920776367, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8535603880882263, + "num_tokens": 283107270.0, + "step": 7422 + }, + { + "epoch": 0.9442818979773566, + "ewc_loss": 0.035668883472681046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7834441678132862e-05, + "grad_norm": 25.697574615478516, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8731218576431274, + "num_tokens": 283143571.0, + "step": 7423 + }, + { + "epoch": 0.944409108255947, + "ewc_loss": 0.03566955775022507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7834778191172518e-05, + "grad_norm": 25.784343719482422, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.865792453289032, + "num_tokens": 283179995.0, + "step": 7424 + }, + { + "epoch": 0.9445363185345376, + "ewc_loss": 0.03566743806004524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7833719539339654e-05, + "grad_norm": 25.630647659301758, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8612885475158691, + "num_tokens": 283218370.0, + "step": 7425 + }, + { + "epoch": 0.9446635288131281, + "ewc_loss": 0.03565657511353493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7828288036980666e-05, + "grad_norm": 25.810890197753906, + 
"learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8550256490707397, + "num_tokens": 283247879.0, + "step": 7426 + }, + { + "epoch": 0.9447907390917186, + "ewc_loss": 0.03564342111349106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7821710571297444e-05, + "grad_norm": 25.516536712646484, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8772167563438416, + "num_tokens": 283285027.0, + "step": 7427 + }, + { + "epoch": 0.9449179493703092, + "ewc_loss": 0.035557668656110764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7778835172066465e-05, + "grad_norm": 25.658283233642578, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8651834726333618, + "num_tokens": 283324934.0, + "step": 7428 + }, + { + "epoch": 0.9450451596488997, + "ewc_loss": 0.03575053811073303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.787526889529545e-05, + "grad_norm": 25.680835723876953, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8692188858985901, + "num_tokens": 283357540.0, + "step": 7429 + }, + { + "epoch": 0.9451723699274901, + "ewc_loss": 0.03566139191389084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.783069637895096e-05, + "grad_norm": 25.699188232421875, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8590266704559326, + "num_tokens": 283393470.0, + "step": 7430 + }, + { + "epoch": 0.9452995802060806, + "ewc_loss": 0.03570491075515747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7852455130196176e-05, + "grad_norm": 25.69926643371582, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8621469140052795, + "num_tokens": 283429916.0, + "step": 7431 + }, + { + "epoch": 0.9454267904846712, + "ewc_loss": 0.03574162721633911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7870814190246165e-05, + "grad_norm": 25.781150817871094, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8499822616577148, + "num_tokens": 283467997.0, + 
"step": 7432 + }, + { + "epoch": 0.9455540007632617, + "ewc_loss": 0.03574405983090401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7872029275167733e-05, + "grad_norm": 25.73748016357422, + "learning_rate": 1e-06, + "loss": 0.5478, + "mean_token_accuracy": 0.8267989754676819, + "num_tokens": 283504741.0, + "step": 7433 + }, + { + "epoch": 0.9456812110418522, + "ewc_loss": 0.03577123582363129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788561712601222e-05, + "grad_norm": 25.739065170288086, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8710612058639526, + "num_tokens": 283534535.0, + "step": 7434 + }, + { + "epoch": 0.9458084213204427, + "ewc_loss": 0.03569677099585533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7848386050900444e-05, + "grad_norm": 25.729215621948242, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8738851547241211, + "num_tokens": 283571600.0, + "step": 7435 + }, + { + "epoch": 0.9459356315990332, + "ewc_loss": 0.03575430065393448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7877149730338715e-05, + "grad_norm": 25.69668197631836, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8475667238235474, + "num_tokens": 283605662.0, + "step": 7436 + }, + { + "epoch": 0.9460628418776237, + "ewc_loss": 0.035697802901268005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7848900824901648e-05, + "grad_norm": 25.769634246826172, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8614108562469482, + "num_tokens": 283642868.0, + "step": 7437 + }, + { + "epoch": 0.9461900521562142, + "ewc_loss": 0.03573925420641899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.786962639016565e-05, + "grad_norm": 25.70201301574707, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8518246412277222, + "num_tokens": 283675604.0, + "step": 7438 + }, + { + "epoch": 0.9463172624348047, + "ewc_loss": 0.03571624681353569, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.7858123101177625e-05, + "grad_norm": 25.83159065246582, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8820946216583252, + "num_tokens": 283709459.0, + "step": 7439 + }, + { + "epoch": 0.9464444727133953, + "ewc_loss": 0.03571292757987976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.785646418284159e-05, + "grad_norm": 25.578922271728516, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.860748827457428, + "num_tokens": 283748020.0, + "step": 7440 + }, + { + "epoch": 0.9465716829919858, + "ewc_loss": 0.03568728640675545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.78436439455254e-05, + "grad_norm": 25.917634963989258, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8563879132270813, + "num_tokens": 283778514.0, + "step": 7441 + }, + { + "epoch": 0.9466988932705762, + "ewc_loss": 0.03577534109354019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7887670765048824e-05, + "grad_norm": 25.61418342590332, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8652623295783997, + "num_tokens": 283819764.0, + "step": 7442 + }, + { + "epoch": 0.9468261035491667, + "ewc_loss": 0.035676103085279465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.78380523720989e-05, + "grad_norm": 25.669477462768555, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8458755016326904, + "num_tokens": 283853897.0, + "step": 7443 + }, + { + "epoch": 0.9469533138277573, + "ewc_loss": 0.0358210913836956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7910546375787817e-05, + "grad_norm": 25.73486328125, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8741324543952942, + "num_tokens": 283891826.0, + "step": 7444 + }, + { + "epoch": 0.9470805241063478, + "ewc_loss": 0.03568620979785919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.784310552466195e-05, + "grad_norm": 25.65326690673828, + "learning_rate": 1e-06, + "loss": 0.4327, + 
"mean_token_accuracy": 0.8609073758125305, + "num_tokens": 283931690.0, + "step": 7445 + }, + { + "epoch": 0.9472077343849383, + "ewc_loss": 0.035667840391397476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7833919628174044e-05, + "grad_norm": 25.607385635375977, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8615358471870422, + "num_tokens": 283971107.0, + "step": 7446 + }, + { + "epoch": 0.9473349446635289, + "ewc_loss": 0.03574160486459732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7870803276309744e-05, + "grad_norm": 25.80616569519043, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8692644238471985, + "num_tokens": 284005286.0, + "step": 7447 + }, + { + "epoch": 0.9474621549421193, + "ewc_loss": 0.03574473038315773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.787236578820739e-05, + "grad_norm": 25.63526725769043, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.87553870677948, + "num_tokens": 284044089.0, + "step": 7448 + }, + { + "epoch": 0.9475893652207098, + "ewc_loss": 0.03567292168736458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7836460756370798e-05, + "grad_norm": 25.68764877319336, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8560799360275269, + "num_tokens": 284081172.0, + "step": 7449 + }, + { + "epoch": 0.9477165754993003, + "ewc_loss": 0.03579641506075859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7898208170663565e-05, + "grad_norm": 25.726722717285156, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.86883145570755, + "num_tokens": 284113362.0, + "step": 7450 + }, + { + "epoch": 0.9478437857778909, + "ewc_loss": 0.035732872784137726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.786643588275183e-05, + "grad_norm": 25.542993545532227, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8717529773712158, + "num_tokens": 284155272.0, + "step": 7451 + }, + { + "epoch": 
0.9479709960564814, + "ewc_loss": 0.03572997450828552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7864987967186607e-05, + "grad_norm": 25.579179763793945, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8694852590560913, + "num_tokens": 284188218.0, + "step": 7452 + }, + { + "epoch": 0.9480982063350719, + "ewc_loss": 0.035771965980529785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7885982742882334e-05, + "grad_norm": 25.615459442138672, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8648068308830261, + "num_tokens": 284228838.0, + "step": 7453 + }, + { + "epoch": 0.9482254166136623, + "ewc_loss": 0.03584803268313408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7924015992321074e-05, + "grad_norm": 25.650354385375977, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8598366975784302, + "num_tokens": 284265570.0, + "step": 7454 + }, + { + "epoch": 0.9483526268922529, + "ewc_loss": 0.03578992933034897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7894964912557043e-05, + "grad_norm": 25.566661834716797, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8524752259254456, + "num_tokens": 284307062.0, + "step": 7455 + }, + { + "epoch": 0.9484798371708434, + "ewc_loss": 0.03579083830118179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.789541965990793e-05, + "grad_norm": 25.625642776489258, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8598692417144775, + "num_tokens": 284340488.0, + "step": 7456 + }, + { + "epoch": 0.9486070474494339, + "ewc_loss": 0.03586345538496971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.793172850739211e-05, + "grad_norm": 25.759395599365234, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.860630989074707, + "num_tokens": 284382399.0, + "step": 7457 + }, + { + "epoch": 0.9487342577280244, + "ewc_loss": 0.03582536801695824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7912683688336983e-05, + "grad_norm": 25.714529037475586, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8488456010818481, + "num_tokens": 284416391.0, + "step": 7458 + }, + { + "epoch": 0.948861468006615, + "ewc_loss": 0.035780325531959534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7890162780531682e-05, + "grad_norm": 25.609712600708008, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8614364862442017, + "num_tokens": 284457049.0, + "step": 7459 + }, + { + "epoch": 0.9489886782852054, + "ewc_loss": 0.03577294200658798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7886470232042484e-05, + "grad_norm": 25.616548538208008, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8596293926239014, + "num_tokens": 284494426.0, + "step": 7460 + }, + { + "epoch": 0.9491158885637959, + "ewc_loss": 0.03581143915653229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7905718777910806e-05, + "grad_norm": 25.670148849487305, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8798892498016357, + "num_tokens": 284533349.0, + "step": 7461 + }, + { + "epoch": 0.9492430988423864, + "ewc_loss": 0.03585810214281082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.792905095499009e-05, + "grad_norm": 25.705013275146484, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8680135607719421, + "num_tokens": 284567777.0, + "step": 7462 + }, + { + "epoch": 0.949370309120977, + "ewc_loss": 0.03576570004224777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7882850443129428e-05, + "grad_norm": 25.70197105407715, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8552007675170898, + "num_tokens": 284609120.0, + "step": 7463 + }, + { + "epoch": 0.9494975193995675, + "ewc_loss": 0.03578707203269005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7893535186885856e-05, + "grad_norm": 25.61810874938965, + "learning_rate": 1e-06, + "loss": 0.4717, + 
"mean_token_accuracy": 0.8541214466094971, + "num_tokens": 284649221.0, + "step": 7464 + }, + { + "epoch": 0.949624729678158, + "ewc_loss": 0.03576316311955452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7881580788525753e-05, + "grad_norm": 25.705123901367188, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8661922216415405, + "num_tokens": 284682595.0, + "step": 7465 + }, + { + "epoch": 0.9497519399567486, + "ewc_loss": 0.0358128696680069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7906435459735803e-05, + "grad_norm": 25.656274795532227, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8579994440078735, + "num_tokens": 284722903.0, + "step": 7466 + }, + { + "epoch": 0.949879150235339, + "ewc_loss": 0.03585481643676758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7927408407558687e-05, + "grad_norm": 25.724992752075195, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.867382287979126, + "num_tokens": 284759166.0, + "step": 7467 + }, + { + "epoch": 0.9500063605139295, + "ewc_loss": 0.03582080453634262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7910402675624937e-05, + "grad_norm": 25.678739547729492, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8574622869491577, + "num_tokens": 284803192.0, + "step": 7468 + }, + { + "epoch": 0.95013357079252, + "ewc_loss": 0.03575412184000015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.787706059985794e-05, + "grad_norm": 25.666706085205078, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8639835119247437, + "num_tokens": 284837088.0, + "step": 7469 + }, + { + "epoch": 0.9502607810711106, + "ewc_loss": 0.035787902772426605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7893951735459268e-05, + "grad_norm": 25.72960662841797, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8423076868057251, + "num_tokens": 284879641.0, + "step": 7470 + }, + { + "epoch": 
0.9503879913497011, + "ewc_loss": 0.03573577478528023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.786788743629586e-05, + "grad_norm": 25.646398544311523, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8501596450805664, + "num_tokens": 284917519.0, + "step": 7471 + }, + { + "epoch": 0.9505152016282916, + "ewc_loss": 0.035682354122400284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7841177395894192e-05, + "grad_norm": 25.679670333862305, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8521610498428345, + "num_tokens": 284950422.0, + "step": 7472 + }, + { + "epoch": 0.950642411906882, + "ewc_loss": 0.03575485572218895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7877428035717458e-05, + "grad_norm": 25.60444450378418, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8687011003494263, + "num_tokens": 284986392.0, + "step": 7473 + }, + { + "epoch": 0.9507696221854726, + "ewc_loss": 0.03577975928783417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788987901818473e-05, + "grad_norm": 25.716482162475586, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8639917969703674, + "num_tokens": 285027684.0, + "step": 7474 + }, + { + "epoch": 0.9508968324640631, + "ewc_loss": 0.035792239010334015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7896119970828295e-05, + "grad_norm": 25.614713668823242, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8523306250572205, + "num_tokens": 285059551.0, + "step": 7475 + }, + { + "epoch": 0.9510240427426536, + "ewc_loss": 0.03572636470198631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7863181710708886e-05, + "grad_norm": 25.65791130065918, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8633365631103516, + "num_tokens": 285100197.0, + "step": 7476 + }, + { + "epoch": 0.9511512530212441, + "ewc_loss": 0.035792071372270584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.789603629731573e-05, + "grad_norm": 25.67916488647461, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8620299696922302, + "num_tokens": 285130115.0, + "step": 7477 + }, + { + "epoch": 0.9512784632998347, + "ewc_loss": 0.035810455679893494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.790522765077185e-05, + "grad_norm": 25.672407150268555, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8627115488052368, + "num_tokens": 285169069.0, + "step": 7478 + }, + { + "epoch": 0.9514056735784251, + "ewc_loss": 0.035736583173274994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7868291251943447e-05, + "grad_norm": 25.6871337890625, + "learning_rate": 1e-06, + "loss": 0.5237, + "mean_token_accuracy": 0.8413621783256531, + "num_tokens": 285209971.0, + "step": 7479 + }, + { + "epoch": 0.9515328838570156, + "ewc_loss": 0.035860575735569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.79302878677845e-05, + "grad_norm": 25.710941314697266, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8574733734130859, + "num_tokens": 285250024.0, + "step": 7480 + }, + { + "epoch": 0.9516600941356061, + "ewc_loss": 0.03578728064894676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7893640688271262e-05, + "grad_norm": 25.682498931884766, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8574192523956299, + "num_tokens": 285284900.0, + "step": 7481 + }, + { + "epoch": 0.9517873044141967, + "ewc_loss": 0.03578909486532211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7894546544994228e-05, + "grad_norm": 25.717004776000977, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.866620659828186, + "num_tokens": 285325657.0, + "step": 7482 + }, + { + "epoch": 0.9519145146927872, + "ewc_loss": 0.03576291725039482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7881458916235715e-05, + "grad_norm": 25.655197143554688, + "learning_rate": 1e-06, + "loss": 0.4536, + 
"mean_token_accuracy": 0.8618797063827515, + "num_tokens": 285356362.0, + "step": 7483 + }, + { + "epoch": 0.9520417249713777, + "ewc_loss": 0.035845816135406494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7922908227774315e-05, + "grad_norm": 25.826696395874023, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8616673946380615, + "num_tokens": 285394321.0, + "step": 7484 + }, + { + "epoch": 0.9521689352499682, + "ewc_loss": 0.03578914701938629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.789457382983528e-05, + "grad_norm": 25.55938148498535, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8804080486297607, + "num_tokens": 285435260.0, + "step": 7485 + }, + { + "epoch": 0.9522961455285587, + "ewc_loss": 0.03579897806048393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.789948873920366e-05, + "grad_norm": 25.791231155395508, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8703412413597107, + "num_tokens": 285479449.0, + "step": 7486 + }, + { + "epoch": 0.9524233558071492, + "ewc_loss": 0.03579900041222572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7899499653140083e-05, + "grad_norm": 25.58237075805664, + "learning_rate": 1e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8407273292541504, + "num_tokens": 285520492.0, + "step": 7487 + }, + { + "epoch": 0.9525505660857397, + "ewc_loss": 0.03573279455304146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7866397683974355e-05, + "grad_norm": 25.78792953491211, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8569413423538208, + "num_tokens": 285558410.0, + "step": 7488 + }, + { + "epoch": 0.9526777763643303, + "ewc_loss": 0.03583163395524025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7915817807079293e-05, + "grad_norm": 25.577245712280273, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8410897850990295, + "num_tokens": 285598871.0, + "step": 7489 + }, + { + "epoch": 
0.9528049866429208, + "ewc_loss": 0.03568394109606743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7841970475274138e-05, + "grad_norm": 25.632877349853516, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8541392087936401, + "num_tokens": 285641374.0, + "step": 7490 + }, + { + "epoch": 0.9529321969215112, + "ewc_loss": 0.035798463970422745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7899232261697762e-05, + "grad_norm": 25.575729370117188, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8721761703491211, + "num_tokens": 285675416.0, + "step": 7491 + }, + { + "epoch": 0.9530594072001017, + "ewc_loss": 0.0357881560921669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7894077245728113e-05, + "grad_norm": 25.797576904296875, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8673784732818604, + "num_tokens": 285707491.0, + "step": 7492 + }, + { + "epoch": 0.9531866174786923, + "ewc_loss": 0.03583172336220741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7915861462824978e-05, + "grad_norm": 25.588499069213867, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8502811193466187, + "num_tokens": 285747571.0, + "step": 7493 + }, + { + "epoch": 0.9533138277572828, + "ewc_loss": 0.035751573741436005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.787578730727546e-05, + "grad_norm": 25.81391716003418, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8838086724281311, + "num_tokens": 285786362.0, + "step": 7494 + }, + { + "epoch": 0.9534410380358733, + "ewc_loss": 0.03581977263092995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.790988608263433e-05, + "grad_norm": 25.640623092651367, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8691806793212891, + "num_tokens": 285829401.0, + "step": 7495 + }, + { + "epoch": 0.9535682483144639, + "ewc_loss": 0.035727523267269135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7863761968328618e-05, + "grad_norm": 25.72425079345703, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8596351742744446, + "num_tokens": 285867853.0, + "step": 7496 + }, + { + "epoch": 0.9536954585930543, + "ewc_loss": 0.03588066250085831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7940330508281477e-05, + "grad_norm": 25.87572479248047, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8718688488006592, + "num_tokens": 285908497.0, + "step": 7497 + }, + { + "epoch": 0.9538226688716448, + "ewc_loss": 0.03573393076658249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7866965208668262e-05, + "grad_norm": 25.67122459411621, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8492659330368042, + "num_tokens": 285940528.0, + "step": 7498 + }, + { + "epoch": 0.9539498791502353, + "ewc_loss": 0.03575662523508072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.787831206456758e-05, + "grad_norm": 25.84005355834961, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8628029227256775, + "num_tokens": 285980206.0, + "step": 7499 + }, + { + "epoch": 0.9540770894288259, + "ewc_loss": 0.03573555126786232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7867776477942243e-05, + "grad_norm": 25.694377899169922, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8562616109848022, + "num_tokens": 286019409.0, + "step": 7500 + }, + { + "epoch": 0.9542042997074164, + "ewc_loss": 0.035677846521139145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.78389236680232e-05, + "grad_norm": 25.710147857666016, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8701610565185547, + "num_tokens": 286050955.0, + "step": 7501 + }, + { + "epoch": 0.9543315099860069, + "ewc_loss": 0.035713229328393936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7856615158962086e-05, + "grad_norm": 25.65161895751953, + "learning_rate": 1e-06, + "loss": 0.4347, + 
"mean_token_accuracy": 0.8664437532424927, + "num_tokens": 286087567.0, + "step": 7502 + }, + { + "epoch": 0.9544587202645973, + "ewc_loss": 0.03571299836039543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.785649874364026e-05, + "grad_norm": 25.71000099182129, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8416943550109863, + "num_tokens": 286126893.0, + "step": 7503 + }, + { + "epoch": 0.9545859305431879, + "ewc_loss": 0.03568454086780548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7842270608525723e-05, + "grad_norm": 25.619401931762695, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8555620908737183, + "num_tokens": 286165818.0, + "step": 7504 + }, + { + "epoch": 0.9547131408217784, + "ewc_loss": 0.03575912117958069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7879559891298413e-05, + "grad_norm": 25.65508460998535, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8634757995605469, + "num_tokens": 286208026.0, + "step": 7505 + }, + { + "epoch": 0.9548403511003689, + "ewc_loss": 0.03574482351541519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7872411262942478e-05, + "grad_norm": 25.747278213500977, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.861000657081604, + "num_tokens": 286245468.0, + "step": 7506 + }, + { + "epoch": 0.9549675613789594, + "ewc_loss": 0.035781312733888626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7890655726660043e-05, + "grad_norm": 25.59490394592285, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8654013276100159, + "num_tokens": 286283955.0, + "step": 7507 + }, + { + "epoch": 0.95509477165755, + "ewc_loss": 0.03571535646915436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7857677448773757e-05, + "grad_norm": 25.688270568847656, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8682367205619812, + "num_tokens": 286326560.0, + "step": 7508 + }, + { + "epoch": 
0.9552219819361404, + "ewc_loss": 0.03583918884396553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7919594029081054e-05, + "grad_norm": 25.640296936035156, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8608964681625366, + "num_tokens": 286365791.0, + "step": 7509 + }, + { + "epoch": 0.9553491922147309, + "ewc_loss": 0.035740822553634644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7870410374598578e-05, + "grad_norm": 25.694032669067383, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8722313642501831, + "num_tokens": 286404132.0, + "step": 7510 + }, + { + "epoch": 0.9554764024933214, + "ewc_loss": 0.03579133003950119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7895665223477408e-05, + "grad_norm": 25.59119987487793, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8515207767486572, + "num_tokens": 286444828.0, + "step": 7511 + }, + { + "epoch": 0.955603612771912, + "ewc_loss": 0.03575967997312546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.787984001566656e-05, + "grad_norm": 25.68221664428711, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8667480945587158, + "num_tokens": 286481725.0, + "step": 7512 + }, + { + "epoch": 0.9557308230505025, + "ewc_loss": 0.035829149186611176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.791457543731667e-05, + "grad_norm": 25.68402862548828, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.861916184425354, + "num_tokens": 286520354.0, + "step": 7513 + }, + { + "epoch": 0.955858033329093, + "ewc_loss": 0.035755567252635956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7877784557640553e-05, + "grad_norm": 25.608957290649414, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8590983152389526, + "num_tokens": 286564314.0, + "step": 7514 + }, + { + "epoch": 0.9559852436076836, + "ewc_loss": 0.03580699488520622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7903497791849077e-05, + "grad_norm": 25.636890411376953, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8602666854858398, + "num_tokens": 286606955.0, + "step": 7515 + }, + { + "epoch": 0.956112453886274, + "ewc_loss": 0.03578461706638336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7892309188027866e-05, + "grad_norm": 25.62931251525879, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.866969108581543, + "num_tokens": 286640091.0, + "step": 7516 + }, + { + "epoch": 0.9562396641648645, + "ewc_loss": 0.035775862634181976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788793088053353e-05, + "grad_norm": 25.60747528076172, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8608156442642212, + "num_tokens": 286679090.0, + "step": 7517 + }, + { + "epoch": 0.956366874443455, + "ewc_loss": 0.03580315411090851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7901576939038932e-05, + "grad_norm": 25.571815490722656, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8650063872337341, + "num_tokens": 286720260.0, + "step": 7518 + }, + { + "epoch": 0.9564940847220456, + "ewc_loss": 0.035814739763736725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7907370420289226e-05, + "grad_norm": 25.627695083618164, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8676615953445435, + "num_tokens": 286763992.0, + "step": 7519 + }, + { + "epoch": 0.9566212950006361, + "ewc_loss": 0.035867124795913696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7933562048710883e-05, + "grad_norm": 25.71364974975586, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8421896696090698, + "num_tokens": 286807068.0, + "step": 7520 + }, + { + "epoch": 0.9567485052792266, + "ewc_loss": 0.03573726862668991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7868635040940717e-05, + "grad_norm": 25.601943969726562, + "learning_rate": 1e-06, + "loss": 0.4257, + 
"mean_token_accuracy": 0.8684009313583374, + "num_tokens": 286853020.0, + "step": 7521 + }, + { + "epoch": 0.956875715557817, + "ewc_loss": 0.03577488660812378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788744339137338e-05, + "grad_norm": 25.650541305541992, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8517439365386963, + "num_tokens": 286896098.0, + "step": 7522 + }, + { + "epoch": 0.9570029258364076, + "ewc_loss": 0.035862259566783905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7931130059878342e-05, + "grad_norm": 25.843303680419922, + "learning_rate": 1e-06, + "loss": 0.4921, + "mean_token_accuracy": 0.8457841277122498, + "num_tokens": 286938873.0, + "step": 7523 + }, + { + "epoch": 0.9571301361149981, + "ewc_loss": 0.03576567396521568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7882837710203603e-05, + "grad_norm": 25.594524383544922, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8653962016105652, + "num_tokens": 286972941.0, + "step": 7524 + }, + { + "epoch": 0.9572573463935886, + "ewc_loss": 0.035760972648859024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7880485756904818e-05, + "grad_norm": 25.757369995117188, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.85053551197052, + "num_tokens": 287010818.0, + "step": 7525 + }, + { + "epoch": 0.9573845566721791, + "ewc_loss": 0.035792041569948196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.78960199264111e-05, + "grad_norm": 25.648366928100586, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8658875226974487, + "num_tokens": 287053841.0, + "step": 7526 + }, + { + "epoch": 0.9575117669507697, + "ewc_loss": 0.03570375591516495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7851878510555252e-05, + "grad_norm": 25.707056045532227, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8678462505340576, + "num_tokens": 287090900.0, + "step": 7527 + }, + { + "epoch": 
0.9576389772293601, + "ewc_loss": 0.03580540046095848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7902701074490324e-05, + "grad_norm": 25.646116256713867, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8595322370529175, + "num_tokens": 287131203.0, + "step": 7528 + }, + { + "epoch": 0.9577661875079506, + "ewc_loss": 0.03570608049631119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7853040844784118e-05, + "grad_norm": 25.673778533935547, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8715665340423584, + "num_tokens": 287165949.0, + "step": 7529 + }, + { + "epoch": 0.9578933977865411, + "ewc_loss": 0.035802289843559265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7901145838550292e-05, + "grad_norm": 25.6680908203125, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8618719577789307, + "num_tokens": 287203376.0, + "step": 7530 + }, + { + "epoch": 0.9580206080651317, + "ewc_loss": 0.035734523087739944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.786726170394104e-05, + "grad_norm": 25.66991424560547, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8519322872161865, + "num_tokens": 287240391.0, + "step": 7531 + }, + { + "epoch": 0.9581478183437222, + "ewc_loss": 0.035746488720178604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7873244360089302e-05, + "grad_norm": 25.710039138793945, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8583352565765381, + "num_tokens": 287278485.0, + "step": 7532 + }, + { + "epoch": 0.9582750286223127, + "ewc_loss": 0.035778045654296875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788902227417566e-05, + "grad_norm": 25.77834701538086, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8562746047973633, + "num_tokens": 287313626.0, + "step": 7533 + }, + { + "epoch": 0.9584022389009031, + "ewc_loss": 0.035713836550712585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7856918930192478e-05, + "grad_norm": 25.683544158935547, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8765577077865601, + "num_tokens": 287344532.0, + "step": 7534 + }, + { + "epoch": 0.9585294491794937, + "ewc_loss": 0.0357075110077858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7853755707619712e-05, + "grad_norm": 25.653554916381836, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8429242372512817, + "num_tokens": 287383069.0, + "step": 7535 + }, + { + "epoch": 0.9586566594580842, + "ewc_loss": 0.03572411835193634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.78620593942469e-05, + "grad_norm": 25.678165435791016, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8603833913803101, + "num_tokens": 287418554.0, + "step": 7536 + }, + { + "epoch": 0.9587838697366747, + "ewc_loss": 0.03583928197622299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7919641322805546e-05, + "grad_norm": 25.628551483154297, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8608138561248779, + "num_tokens": 287452449.0, + "step": 7537 + }, + { + "epoch": 0.9589110800152653, + "ewc_loss": 0.03578478842973709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7892394680529833e-05, + "grad_norm": 25.654285430908203, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8466311097145081, + "num_tokens": 287492247.0, + "step": 7538 + }, + { + "epoch": 0.9590382902938558, + "ewc_loss": 0.03583192080259323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.791595968825277e-05, + "grad_norm": 25.636524200439453, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.867423951625824, + "num_tokens": 287528651.0, + "step": 7539 + }, + { + "epoch": 0.9591655005724462, + "ewc_loss": 0.03586791083216667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.793395495042205e-05, + "grad_norm": 25.681184768676758, + "learning_rate": 1e-06, + "loss": 0.4848, + 
"mean_token_accuracy": 0.8521385192871094, + "num_tokens": 287565837.0, + "step": 7540 + }, + { + "epoch": 0.9592927108510367, + "ewc_loss": 0.035818688571453094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7909344023792073e-05, + "grad_norm": 25.793272018432617, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8592137098312378, + "num_tokens": 287599469.0, + "step": 7541 + }, + { + "epoch": 0.9594199211296273, + "ewc_loss": 0.03582853823900223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7914269847096875e-05, + "grad_norm": 25.681440353393555, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8593887686729431, + "num_tokens": 287637104.0, + "step": 7542 + }, + { + "epoch": 0.9595471314082178, + "ewc_loss": 0.035840630531311035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7920316167874262e-05, + "grad_norm": 25.792146682739258, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8626707792282104, + "num_tokens": 287679134.0, + "step": 7543 + }, + { + "epoch": 0.9596743416868083, + "ewc_loss": 0.03586022928357124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7930115063791163e-05, + "grad_norm": 25.682950973510742, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8682584762573242, + "num_tokens": 287720340.0, + "step": 7544 + }, + { + "epoch": 0.9598015519653988, + "ewc_loss": 0.03585008531808853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7925041902344674e-05, + "grad_norm": 25.71894073486328, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8655308485031128, + "num_tokens": 287760931.0, + "step": 7545 + }, + { + "epoch": 0.9599287622439893, + "ewc_loss": 0.035862039774656296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7931019101524726e-05, + "grad_norm": 25.65530014038086, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8471038341522217, + "num_tokens": 287801723.0, + "step": 7546 + }, + { + "epoch": 
0.9600559725225798, + "ewc_loss": 0.03581606596708298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7908032532432117e-05, + "grad_norm": 25.659128189086914, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8588329553604126, + "num_tokens": 287837010.0, + "step": 7547 + }, + { + "epoch": 0.9601831828011703, + "ewc_loss": 0.035865090787410736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.79325452336343e-05, + "grad_norm": 25.698612213134766, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8631253242492676, + "num_tokens": 287874381.0, + "step": 7548 + }, + { + "epoch": 0.9603103930797608, + "ewc_loss": 0.03581181541085243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7905907952808775e-05, + "grad_norm": 25.675167083740234, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8538243174552917, + "num_tokens": 287916673.0, + "step": 7549 + }, + { + "epoch": 0.9604376033583514, + "ewc_loss": 0.035845473408699036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.792273724277038e-05, + "grad_norm": 25.67458152770996, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8569915294647217, + "num_tokens": 287956764.0, + "step": 7550 + }, + { + "epoch": 0.9605648136369419, + "ewc_loss": 0.03585223853588104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7926118744071573e-05, + "grad_norm": 25.68142318725586, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8691269755363464, + "num_tokens": 287996373.0, + "step": 7551 + }, + { + "epoch": 0.9606920239155323, + "ewc_loss": 0.035876885056495667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7938442397280596e-05, + "grad_norm": 25.702545166015625, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8571574687957764, + "num_tokens": 288033671.0, + "step": 7552 + }, + { + "epoch": 0.9608192341941229, + "ewc_loss": 0.03587010130286217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7935049982042983e-05, + "grad_norm": 25.755794525146484, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8701313734054565, + "num_tokens": 288068254.0, + "step": 7553 + }, + { + "epoch": 0.9609464444727134, + "ewc_loss": 0.035889655351638794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7944827050087042e-05, + "grad_norm": 25.82374382019043, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.847015380859375, + "num_tokens": 288102833.0, + "step": 7554 + }, + { + "epoch": 0.9610736547513039, + "ewc_loss": 0.03579411283135414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.789705675037112e-05, + "grad_norm": 25.803857803344727, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8743339776992798, + "num_tokens": 288135927.0, + "step": 7555 + }, + { + "epoch": 0.9612008650298944, + "ewc_loss": 0.035830795764923096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7915397620527074e-05, + "grad_norm": 25.834909439086914, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8808785676956177, + "num_tokens": 288173842.0, + "step": 7556 + }, + { + "epoch": 0.961328075308485, + "ewc_loss": 0.03581175208091736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.790587521099951e-05, + "grad_norm": 25.609596252441406, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8669065833091736, + "num_tokens": 288215007.0, + "step": 7557 + }, + { + "epoch": 0.9614552855870754, + "ewc_loss": 0.03579854592680931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.789927227946464e-05, + "grad_norm": 25.895774841308594, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8506636619567871, + "num_tokens": 288251598.0, + "step": 7558 + }, + { + "epoch": 0.9615824958656659, + "ewc_loss": 0.03582756221294403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.791378053894732e-05, + "grad_norm": 25.620885848999023, + "learning_rate": 1e-06, + "loss": 0.4421, + 
"mean_token_accuracy": 0.8629438877105713, + "num_tokens": 288292811.0, + "step": 7559 + }, + { + "epoch": 0.9617097061442564, + "ewc_loss": 0.03578020632266998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7890102753881365e-05, + "grad_norm": 25.808399200439453, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8664998412132263, + "num_tokens": 288337790.0, + "step": 7560 + }, + { + "epoch": 0.961836916422847, + "ewc_loss": 0.03587229177355766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7936145013663918e-05, + "grad_norm": 25.83837890625, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8582314252853394, + "num_tokens": 288373347.0, + "step": 7561 + }, + { + "epoch": 0.9619641267014375, + "ewc_loss": 0.035748694092035294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.787434666766785e-05, + "grad_norm": 25.742576599121094, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.881193995475769, + "num_tokens": 288405650.0, + "step": 7562 + }, + { + "epoch": 0.962091336980028, + "ewc_loss": 0.035805363208055496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7902681065606885e-05, + "grad_norm": 25.722436904907227, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8668991327285767, + "num_tokens": 288443849.0, + "step": 7563 + }, + { + "epoch": 0.9622185472586186, + "ewc_loss": 0.03584456816315651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.79222843144089e-05, + "grad_norm": 25.868200302124023, + "learning_rate": 1e-06, + "loss": 0.5115, + "mean_token_accuracy": 0.839977502822876, + "num_tokens": 288487557.0, + "step": 7564 + }, + { + "epoch": 0.962345757537209, + "ewc_loss": 0.03574720770120621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7873604519991204e-05, + "grad_norm": 25.58013916015625, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8514685034751892, + "num_tokens": 288529932.0, + "step": 7565 + }, + { + "epoch": 0.9624729678157995, 
+ "ewc_loss": 0.035774972289800644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7887485228129663e-05, + "grad_norm": 25.758975982666016, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8567915558815002, + "num_tokens": 288568276.0, + "step": 7566 + }, + { + "epoch": 0.96260017809439, + "ewc_loss": 0.035865988582372665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7932994524016976e-05, + "grad_norm": 25.748348236083984, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8634984493255615, + "num_tokens": 288602387.0, + "step": 7567 + }, + { + "epoch": 0.9627273883729806, + "ewc_loss": 0.035762205719947815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788110239431262e-05, + "grad_norm": 25.76663589477539, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8697618246078491, + "num_tokens": 288638157.0, + "step": 7568 + }, + { + "epoch": 0.9628545986515711, + "ewc_loss": 0.035862457007169724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7931228285306133e-05, + "grad_norm": 25.83439064025879, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8580628633499146, + "num_tokens": 288675054.0, + "step": 7569 + }, + { + "epoch": 0.9629818089301616, + "ewc_loss": 0.03577418252825737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7887090507429093e-05, + "grad_norm": 25.755577087402344, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8672059774398804, + "num_tokens": 288714222.0, + "step": 7570 + }, + { + "epoch": 0.963109019208752, + "ewc_loss": 0.03577988222241402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788994086382445e-05, + "grad_norm": 25.91182518005371, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8520712852478027, + "num_tokens": 288752772.0, + "step": 7571 + }, + { + "epoch": 0.9632362294873426, + "ewc_loss": 0.03583214059472084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7916070646606386e-05, + "grad_norm": 
25.725780487060547, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.845738410949707, + "num_tokens": 288792067.0, + "step": 7572 + }, + { + "epoch": 0.9633634397659331, + "ewc_loss": 0.03575320169329643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7876600395538844e-05, + "grad_norm": 25.690898895263672, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8737058043479919, + "num_tokens": 288826172.0, + "step": 7573 + }, + { + "epoch": 0.9634906500445236, + "ewc_loss": 0.035851992666721344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7925996871781535e-05, + "grad_norm": 25.761974334716797, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8721914291381836, + "num_tokens": 288863774.0, + "step": 7574 + }, + { + "epoch": 0.9636178603231141, + "ewc_loss": 0.03584970906376839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.792485454643611e-05, + "grad_norm": 25.724674224853516, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8552854657173157, + "num_tokens": 288901892.0, + "step": 7575 + }, + { + "epoch": 0.9637450706017047, + "ewc_loss": 0.035776928067207336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.788846384442877e-05, + "grad_norm": 25.683490753173828, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8507762551307678, + "num_tokens": 288941470.0, + "step": 7576 + }, + { + "epoch": 0.9638722808802951, + "ewc_loss": 0.035830188542604446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.791509384929668e-05, + "grad_norm": 25.65884780883789, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8478716611862183, + "num_tokens": 288988616.0, + "step": 7577 + }, + { + "epoch": 0.9639994911588856, + "ewc_loss": 0.035891737788915634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7945869331015274e-05, + "grad_norm": 25.67168617248535, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.878645658493042, + 
"num_tokens": 289024589.0, + "step": 7578 + }, + { + "epoch": 0.9641267014374761, + "ewc_loss": 0.03586871922016144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7934358766069636e-05, + "grad_norm": 25.76935386657715, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8633259534835815, + "num_tokens": 289064334.0, + "step": 7579 + }, + { + "epoch": 0.9642539117160667, + "ewc_loss": 0.035908471792936325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7954236682271585e-05, + "grad_norm": 25.833797454833984, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8501061201095581, + "num_tokens": 289099385.0, + "step": 7580 + }, + { + "epoch": 0.9643811219946572, + "ewc_loss": 0.035889849066734314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7944925275514834e-05, + "grad_norm": 25.754446029663086, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8649215698242188, + "num_tokens": 289139143.0, + "step": 7581 + }, + { + "epoch": 0.9645083322732477, + "ewc_loss": 0.035815827548503876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7907914298120886e-05, + "grad_norm": 25.70233726501465, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8561649322509766, + "num_tokens": 289179303.0, + "step": 7582 + }, + { + "epoch": 0.9646355425518381, + "ewc_loss": 0.035864830017089844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7932414266397245e-05, + "grad_norm": 25.806522369384766, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8537835478782654, + "num_tokens": 289214218.0, + "step": 7583 + }, + { + "epoch": 0.9647627528304287, + "ewc_loss": 0.03585211932659149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.792606053641066e-05, + "grad_norm": 25.693241119384766, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8584896922111511, + "num_tokens": 289240991.0, + "step": 7584 + }, + { + "epoch": 0.9648899631090192, + "ewc_loss": 
0.03582907095551491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7914535419549793e-05, + "grad_norm": 25.79994773864746, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8556188941001892, + "num_tokens": 289273374.0, + "step": 7585 + }, + { + "epoch": 0.9650171733876097, + "ewc_loss": 0.03588062897324562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7940314137376845e-05, + "grad_norm": 25.788381576538086, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8476545810699463, + "num_tokens": 289314904.0, + "step": 7586 + }, + { + "epoch": 0.9651443836662003, + "ewc_loss": 0.03586828336119652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7934142306330614e-05, + "grad_norm": 25.713882446289062, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8537992835044861, + "num_tokens": 289358110.0, + "step": 7587 + }, + { + "epoch": 0.9652715939447908, + "ewc_loss": 0.03591681644320488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7958407624973916e-05, + "grad_norm": 25.902511596679688, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8520859479904175, + "num_tokens": 289391581.0, + "step": 7588 + }, + { + "epoch": 0.9653988042233812, + "ewc_loss": 0.03587489202618599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7937445591087453e-05, + "grad_norm": 25.68942642211914, + "learning_rate": 1e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8350903987884521, + "num_tokens": 289432280.0, + "step": 7589 + }, + { + "epoch": 0.9655260145019717, + "ewc_loss": 0.035855282098054886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.792764123820234e-05, + "grad_norm": 25.681921005249023, + "learning_rate": 1e-06, + "loss": 0.5019, + "mean_token_accuracy": 0.8453957438468933, + "num_tokens": 289473222.0, + "step": 7590 + }, + { + "epoch": 0.9656532247805623, + "ewc_loss": 0.0360056534409523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8002827346208505e-05, + "grad_norm": 
25.75055694580078, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8569867610931396, + "num_tokens": 289512898.0, + "step": 7591 + }, + { + "epoch": 0.9657804350591528, + "ewc_loss": 0.0359337218105793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7966860468732193e-05, + "grad_norm": 25.64385223388672, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8663225769996643, + "num_tokens": 289554394.0, + "step": 7592 + }, + { + "epoch": 0.9659076453377433, + "ewc_loss": 0.035953596234321594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7976797607843764e-05, + "grad_norm": 25.713735580444336, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8580934405326843, + "num_tokens": 289594871.0, + "step": 7593 + }, + { + "epoch": 0.9660348556163338, + "ewc_loss": 0.03594301640987396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7971507986658253e-05, + "grad_norm": 25.704450607299805, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8496317863464355, + "num_tokens": 289629165.0, + "step": 7594 + }, + { + "epoch": 0.9661620658949243, + "ewc_loss": 0.03596091642975807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.79804574145237e-05, + "grad_norm": 25.673416137695312, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.859919011592865, + "num_tokens": 289670131.0, + "step": 7595 + }, + { + "epoch": 0.9662892761735148, + "ewc_loss": 0.03598451614379883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.79922571987845e-05, + "grad_norm": 25.739524841308594, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8633527755737305, + "num_tokens": 289705238.0, + "step": 7596 + }, + { + "epoch": 0.9664164864521053, + "ewc_loss": 0.03595279902219772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7976399249164388e-05, + "grad_norm": 25.677413940429688, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8635715246200562, + 
"num_tokens": 289740146.0, + "step": 7597 + }, + { + "epoch": 0.9665436967306958, + "ewc_loss": 0.036014754325151443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8007376638706774e-05, + "grad_norm": 25.807096481323242, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8647993206977844, + "num_tokens": 289781145.0, + "step": 7598 + }, + { + "epoch": 0.9666709070092864, + "ewc_loss": 0.035964515060186386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.798225821403321e-05, + "grad_norm": 25.67406463623047, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8443707227706909, + "num_tokens": 289816347.0, + "step": 7599 + }, + { + "epoch": 0.9667981172878769, + "ewc_loss": 0.03600844740867615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.800422433007043e-05, + "grad_norm": 25.79559898376465, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8467473983764648, + "num_tokens": 289852974.0, + "step": 7600 + }, + { + "epoch": 0.9669253275664673, + "ewc_loss": 0.03600437939167023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.800218888092786e-05, + "grad_norm": 25.702260971069336, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8495017290115356, + "num_tokens": 289894553.0, + "step": 7601 + }, + { + "epoch": 0.9670525378450578, + "ewc_loss": 0.036019276827573776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.800963764253538e-05, + "grad_norm": 25.881771087646484, + "learning_rate": 1e-06, + "loss": 0.5123, + "mean_token_accuracy": 0.8422775268554688, + "num_tokens": 289930641.0, + "step": 7602 + }, + { + "epoch": 0.9671797481236484, + "ewc_loss": 0.03605642169713974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8028211343334988e-05, + "grad_norm": 25.816389083862305, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8537911772727966, + "num_tokens": 289972690.0, + "step": 7603 + }, + { + "epoch": 0.9673069584022389, + "ewc_loss": 0.035905152559280396, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7952575944946148e-05, + "grad_norm": 25.817739486694336, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8542987108230591, + "num_tokens": 290006518.0, + "step": 7604 + }, + { + "epoch": 0.9674341686808294, + "ewc_loss": 0.03598453104496002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7992266293731518e-05, + "grad_norm": 25.70311164855957, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.846616268157959, + "num_tokens": 290045206.0, + "step": 7605 + }, + { + "epoch": 0.96756137895942, + "ewc_loss": 0.03596527874469757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7982640201807953e-05, + "grad_norm": 25.89951515197754, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8652890920639038, + "num_tokens": 290086976.0, + "step": 7606 + }, + { + "epoch": 0.9676885892380104, + "ewc_loss": 0.03599405661225319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799702840799e-05, + "grad_norm": 25.77756118774414, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8736022710800171, + "num_tokens": 290126894.0, + "step": 7607 + }, + { + "epoch": 0.9678157995166009, + "ewc_loss": 0.035913482308387756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7956741430680268e-05, + "grad_norm": 25.862274169921875, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8743695020675659, + "num_tokens": 290161684.0, + "step": 7608 + }, + { + "epoch": 0.9679430097951914, + "ewc_loss": 0.03596148267388344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.798074117687065e-05, + "grad_norm": 25.75775146484375, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8624220490455627, + "num_tokens": 290203407.0, + "step": 7609 + }, + { + "epoch": 0.968070220073782, + "ewc_loss": 0.03598114103078842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7990570995607413e-05, + "grad_norm": 25.793434143066406, + "learning_rate": 
1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.867176353931427, + "num_tokens": 290238645.0, + "step": 7610 + }, + { + "epoch": 0.9681974303523725, + "ewc_loss": 0.03602057695388794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.801028884074185e-05, + "grad_norm": 25.819616317749023, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8611276149749756, + "num_tokens": 290280279.0, + "step": 7611 + }, + { + "epoch": 0.968324640630963, + "ewc_loss": 0.035931993275880814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.796599644876551e-05, + "grad_norm": 25.71575164794922, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8588658571243286, + "num_tokens": 290317739.0, + "step": 7612 + }, + { + "epoch": 0.9684518509095535, + "ewc_loss": 0.03604022413492203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8020111383520998e-05, + "grad_norm": 25.830720901489258, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.858656644821167, + "num_tokens": 290360384.0, + "step": 7613 + }, + { + "epoch": 0.968579061188144, + "ewc_loss": 0.03600386902689934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8001934222411364e-05, + "grad_norm": 25.82714080810547, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8673577308654785, + "num_tokens": 290398706.0, + "step": 7614 + }, + { + "epoch": 0.9687062714667345, + "ewc_loss": 0.03592146188020706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7960730474442244e-05, + "grad_norm": 25.86305809020996, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8749988079071045, + "num_tokens": 290439524.0, + "step": 7615 + }, + { + "epoch": 0.968833481745325, + "ewc_loss": 0.03592592850327492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7962964193429798e-05, + "grad_norm": 25.839792251586914, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.855711817741394, + "num_tokens": 290472125.0, + "step": 7616 + }, + { + 
"epoch": 0.9689606920239155, + "ewc_loss": 0.035928141325712204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7964070138987154e-05, + "grad_norm": 25.81601333618164, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8438690304756165, + "num_tokens": 290514348.0, + "step": 7617 + }, + { + "epoch": 0.9690879023025061, + "ewc_loss": 0.03586360439658165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7931803085957654e-05, + "grad_norm": 25.800525665283203, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8554705381393433, + "num_tokens": 290551699.0, + "step": 7618 + }, + { + "epoch": 0.9692151125810966, + "ewc_loss": 0.035935480147600174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.796774085960351e-05, + "grad_norm": 25.854156494140625, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8619449734687805, + "num_tokens": 290588644.0, + "step": 7619 + }, + { + "epoch": 0.969342322859687, + "ewc_loss": 0.03586161509156227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7930808098753914e-05, + "grad_norm": 25.758865356445312, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8627636432647705, + "num_tokens": 290629546.0, + "step": 7620 + }, + { + "epoch": 0.9694695331382776, + "ewc_loss": 0.03592371568083763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7961858247872442e-05, + "grad_norm": 25.849817276000977, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8686861991882324, + "num_tokens": 290665527.0, + "step": 7621 + }, + { + "epoch": 0.9695967434168681, + "ewc_loss": 0.035893168300390244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7946584193850867e-05, + "grad_norm": 25.834556579589844, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8517840504646301, + "num_tokens": 290706667.0, + "step": 7622 + }, + { + "epoch": 0.9697239536954586, + "ewc_loss": 0.03590410202741623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7952050257008523e-05, + "grad_norm": 25.81224250793457, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8557238578796387, + "num_tokens": 290745698.0, + "step": 7623 + }, + { + "epoch": 0.9698511639740491, + "ewc_loss": 0.03586983680725098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7934919014805928e-05, + "grad_norm": 25.736244201660156, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8661360740661621, + "num_tokens": 290777983.0, + "step": 7624 + }, + { + "epoch": 0.9699783742526397, + "ewc_loss": 0.03589794784784317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7948974345927127e-05, + "grad_norm": 25.7924861907959, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8542609214782715, + "num_tokens": 290814184.0, + "step": 7625 + }, + { + "epoch": 0.9701055845312301, + "ewc_loss": 0.035968340933322906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7984169971896335e-05, + "grad_norm": 25.791715621948242, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8524570465087891, + "num_tokens": 290854054.0, + "step": 7626 + }, + { + "epoch": 0.9702327948098206, + "ewc_loss": 0.03587150201201439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.793575029296335e-05, + "grad_norm": 25.710790634155273, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8586510419845581, + "num_tokens": 290891325.0, + "step": 7627 + }, + { + "epoch": 0.9703600050884111, + "ewc_loss": 0.03596418350934982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7982092685997486e-05, + "grad_norm": 25.856340408325195, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8593829870223999, + "num_tokens": 290930101.0, + "step": 7628 + }, + { + "epoch": 0.9704872153670017, + "ewc_loss": 0.035922687500715256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.796134347387124e-05, + "grad_norm": 25.801786422729492, + "learning_rate": 1e-06, + "loss": 0.4348, + 
"mean_token_accuracy": 0.8607407808303833, + "num_tokens": 290964691.0, + "step": 7629 + }, + { + "epoch": 0.9706144256455922, + "ewc_loss": 0.035863690078258514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7931844922713935e-05, + "grad_norm": 25.767478942871094, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8572980761528015, + "num_tokens": 291008140.0, + "step": 7630 + }, + { + "epoch": 0.9707416359241827, + "ewc_loss": 0.035930998623371124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.796549986465834e-05, + "grad_norm": 25.788835525512695, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8551124930381775, + "num_tokens": 291041484.0, + "step": 7631 + }, + { + "epoch": 0.9708688462027731, + "ewc_loss": 0.03594263270497322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.797131699277088e-05, + "grad_norm": 25.74380111694336, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.842099666595459, + "num_tokens": 291082337.0, + "step": 7632 + }, + { + "epoch": 0.9709960564813637, + "ewc_loss": 0.03592546284198761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7962731362786144e-05, + "grad_norm": 25.859817504882812, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8595093488693237, + "num_tokens": 291115921.0, + "step": 7633 + }, + { + "epoch": 0.9711232667599542, + "ewc_loss": 0.036010727286338806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8005363017437048e-05, + "grad_norm": 25.82503890991211, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.86805260181427, + "num_tokens": 291154367.0, + "step": 7634 + }, + { + "epoch": 0.9712504770385447, + "ewc_loss": 0.035980913788080215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799045639927499e-05, + "grad_norm": 25.75183868408203, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8610883951187134, + "num_tokens": 291198919.0, + "step": 7635 + }, + { + "epoch": 
0.9713776873171353, + "ewc_loss": 0.03601991757750511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.800995960365981e-05, + "grad_norm": 25.872303009033203, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8490080833435059, + "num_tokens": 291237062.0, + "step": 7636 + }, + { + "epoch": 0.9715048975957258, + "ewc_loss": 0.03598461300134659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7992306311498396e-05, + "grad_norm": 25.893341064453125, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8724156618118286, + "num_tokens": 291275676.0, + "step": 7637 + }, + { + "epoch": 0.9716321078743162, + "ewc_loss": 0.03591049462556839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7955248040379956e-05, + "grad_norm": 25.761140823364258, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8531949520111084, + "num_tokens": 291316785.0, + "step": 7638 + }, + { + "epoch": 0.9717593181529067, + "ewc_loss": 0.03592899441719055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7964497601496987e-05, + "grad_norm": 25.825523376464844, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8581300377845764, + "num_tokens": 291360448.0, + "step": 7639 + }, + { + "epoch": 0.9718865284314973, + "ewc_loss": 0.035924281924963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.796214019122999e-05, + "grad_norm": 25.79051971435547, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8569459915161133, + "num_tokens": 291401650.0, + "step": 7640 + }, + { + "epoch": 0.9720137387100878, + "ewc_loss": 0.03594360500574112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7971802662941627e-05, + "grad_norm": 25.777387619018555, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8612184524536133, + "num_tokens": 291439736.0, + "step": 7641 + }, + { + "epoch": 0.9721409489886783, + "ewc_loss": 0.03593900427222252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7969501641346142e-05, + "grad_norm": 25.84069061279297, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8655235767364502, + "num_tokens": 291479077.0, + "step": 7642 + }, + { + "epoch": 0.9722681592672688, + "ewc_loss": 0.035903200507164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7951600966625847e-05, + "grad_norm": 25.806468963623047, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8697940707206726, + "num_tokens": 291512470.0, + "step": 7643 + }, + { + "epoch": 0.9723953695458593, + "ewc_loss": 0.03591485321521759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7957427189685404e-05, + "grad_norm": 25.842388153076172, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8712153434753418, + "num_tokens": 291552797.0, + "step": 7644 + }, + { + "epoch": 0.9725225798244498, + "ewc_loss": 0.035890351980924606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7945176296052523e-05, + "grad_norm": 25.755870819091797, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.864467978477478, + "num_tokens": 291589885.0, + "step": 7645 + }, + { + "epoch": 0.9726497901030403, + "ewc_loss": 0.035868000239133835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7934000425157137e-05, + "grad_norm": 25.746557235717773, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8597270250320435, + "num_tokens": 291633005.0, + "step": 7646 + }, + { + "epoch": 0.9727770003816308, + "ewc_loss": 0.03591813147068024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7959066099138e-05, + "grad_norm": 25.780824661254883, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8562239408493042, + "num_tokens": 291666083.0, + "step": 7647 + }, + { + "epoch": 0.9729042106602214, + "ewc_loss": 0.03597065806388855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7985328668146394e-05, + "grad_norm": 25.78900909423828, + "learning_rate": 1e-06, + "loss": 0.4238, + 
"mean_token_accuracy": 0.8666567802429199, + "num_tokens": 291706421.0, + "step": 7648 + }, + { + "epoch": 0.9730314209388119, + "ewc_loss": 0.03597137704491615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7985688828048296e-05, + "grad_norm": 25.80340003967285, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8656972646713257, + "num_tokens": 291744641.0, + "step": 7649 + }, + { + "epoch": 0.9731586312174023, + "ewc_loss": 0.03599216789007187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799608435248956e-05, + "grad_norm": 25.834735870361328, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8865785598754883, + "num_tokens": 291780586.0, + "step": 7650 + }, + { + "epoch": 0.9732858414959928, + "ewc_loss": 0.035890161991119385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.794508170860354e-05, + "grad_norm": 25.741130828857422, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8569188117980957, + "num_tokens": 291820482.0, + "step": 7651 + }, + { + "epoch": 0.9734130517745834, + "ewc_loss": 0.035913608968257904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.795680509530939e-05, + "grad_norm": 25.869550704956055, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8697652816772461, + "num_tokens": 291858568.0, + "step": 7652 + }, + { + "epoch": 0.9735402620531739, + "ewc_loss": 0.035980962216854095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799048186512664e-05, + "grad_norm": 25.86867904663086, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8537258505821228, + "num_tokens": 291889383.0, + "step": 7653 + }, + { + "epoch": 0.9736674723317644, + "ewc_loss": 0.03593243286013603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7966216546483338e-05, + "grad_norm": 25.745628356933594, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8701426982879639, + "num_tokens": 291927541.0, + "step": 7654 + }, + { + "epoch": 
0.973794682610355, + "ewc_loss": 0.035938747227191925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7969374312087893e-05, + "grad_norm": 25.874683380126953, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8454943895339966, + "num_tokens": 291969074.0, + "step": 7655 + }, + { + "epoch": 0.9739218928889454, + "ewc_loss": 0.0358906090259552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.794530362531077e-05, + "grad_norm": 25.704761505126953, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8588755130767822, + "num_tokens": 292008311.0, + "step": 7656 + }, + { + "epoch": 0.9740491031675359, + "ewc_loss": 0.035889286547899246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7944643332157284e-05, + "grad_norm": 25.74652671813965, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.851921558380127, + "num_tokens": 292051810.0, + "step": 7657 + }, + { + "epoch": 0.9741763134461264, + "ewc_loss": 0.035957857966423035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.797892946342472e-05, + "grad_norm": 25.74282455444336, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8669605255126953, + "num_tokens": 292090411.0, + "step": 7658 + }, + { + "epoch": 0.974303523724717, + "ewc_loss": 0.035913098603487015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.795654861780349e-05, + "grad_norm": 25.836505889892578, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8528709411621094, + "num_tokens": 292124411.0, + "step": 7659 + }, + { + "epoch": 0.9744307340033075, + "ewc_loss": 0.03600456565618515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8002283468376845e-05, + "grad_norm": 25.72127914428711, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8601678609848022, + "num_tokens": 292166428.0, + "step": 7660 + }, + { + "epoch": 0.974557944281898, + "ewc_loss": 0.03595314919948578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.7976573872147128e-05, + "grad_norm": 25.843799591064453, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8627620935440063, + "num_tokens": 292202687.0, + "step": 7661 + }, + { + "epoch": 0.9746851545604885, + "ewc_loss": 0.0360109806060791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8005490346695296e-05, + "grad_norm": 25.694543838500977, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8689864873886108, + "num_tokens": 292246364.0, + "step": 7662 + }, + { + "epoch": 0.974812364839079, + "ewc_loss": 0.035910073667764664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7955037037609145e-05, + "grad_norm": 25.82278060913086, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8618646860122681, + "num_tokens": 292285789.0, + "step": 7663 + }, + { + "epoch": 0.9749395751176695, + "ewc_loss": 0.03599575534462929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7997877876041457e-05, + "grad_norm": 25.72109031677246, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8554801940917969, + "num_tokens": 292320755.0, + "step": 7664 + }, + { + "epoch": 0.97506678539626, + "ewc_loss": 0.03598126024007797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7990629203268327e-05, + "grad_norm": 25.836597442626953, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8700395226478577, + "num_tokens": 292362818.0, + "step": 7665 + }, + { + "epoch": 0.9751939956748505, + "ewc_loss": 0.03601226955652237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8006134268944152e-05, + "grad_norm": 25.786766052246094, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8527820110321045, + "num_tokens": 292404016.0, + "step": 7666 + }, + { + "epoch": 0.9753212059534411, + "ewc_loss": 0.035942938178777695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.797146978788078e-05, + "grad_norm": 25.809289932250977, + "learning_rate": 1e-06, + "loss": 0.4813, + 
"mean_token_accuracy": 0.8465142250061035, + "num_tokens": 292442117.0, + "step": 7667 + }, + { + "epoch": 0.9754484162320316, + "ewc_loss": 0.036032382398843765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8016191461356357e-05, + "grad_norm": 25.812725067138672, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8681396842002869, + "num_tokens": 292472523.0, + "step": 7668 + }, + { + "epoch": 0.975575626510622, + "ewc_loss": 0.035920653492212296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7960326658794656e-05, + "grad_norm": 25.8020076751709, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8646061420440674, + "num_tokens": 292508571.0, + "step": 7669 + }, + { + "epoch": 0.9757028367892125, + "ewc_loss": 0.03597601503133774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7988008039537817e-05, + "grad_norm": 25.928359985351562, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8574655055999756, + "num_tokens": 292545358.0, + "step": 7670 + }, + { + "epoch": 0.9758300470678031, + "ewc_loss": 0.03591153770685196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.795576827134937e-05, + "grad_norm": 25.86335563659668, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8458700180053711, + "num_tokens": 292583962.0, + "step": 7671 + }, + { + "epoch": 0.9759572573463936, + "ewc_loss": 0.035936735570430756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7968368410947733e-05, + "grad_norm": 25.776607513427734, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8595730066299438, + "num_tokens": 292616186.0, + "step": 7672 + }, + { + "epoch": 0.9760844676249841, + "ewc_loss": 0.036020755767822266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8010377971222624e-05, + "grad_norm": 25.863014221191406, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8560978770256042, + "num_tokens": 292657378.0, + "step": 7673 + }, + { + "epoch": 
0.9762116779035747, + "ewc_loss": 0.035916831344366074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.795841490093153e-05, + "grad_norm": 25.749711990356445, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.874456524848938, + "num_tokens": 292692041.0, + "step": 7674 + }, + { + "epoch": 0.9763388881821651, + "ewc_loss": 0.03599347174167633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799673555069603e-05, + "grad_norm": 25.84963035583496, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8578435778617859, + "num_tokens": 292725062.0, + "step": 7675 + }, + { + "epoch": 0.9764660984607556, + "ewc_loss": 0.03600732982158661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8003664081334136e-05, + "grad_norm": 25.732603073120117, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8671231269836426, + "num_tokens": 292766522.0, + "step": 7676 + }, + { + "epoch": 0.9765933087393461, + "ewc_loss": 0.03604907542467117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8024536984739825e-05, + "grad_norm": 25.904720306396484, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.859687328338623, + "num_tokens": 292811371.0, + "step": 7677 + }, + { + "epoch": 0.9767205190179367, + "ewc_loss": 0.036036934703588486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8018467017100193e-05, + "grad_norm": 25.665800094604492, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8607240915298462, + "num_tokens": 292850237.0, + "step": 7678 + }, + { + "epoch": 0.9768477292965272, + "ewc_loss": 0.0359671413898468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.798357152438257e-05, + "grad_norm": 25.944799423217773, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8647997379302979, + "num_tokens": 292887291.0, + "step": 7679 + }, + { + "epoch": 0.9769749395751177, + "ewc_loss": 0.036065295338630676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8032647858490236e-05, + "grad_norm": 25.79838752746582, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8736320734024048, + "num_tokens": 292928396.0, + "step": 7680 + }, + { + "epoch": 0.9771021498537081, + "ewc_loss": 0.03598008677363396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7990043488680385e-05, + "grad_norm": 25.77626609802246, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8644005060195923, + "num_tokens": 292962748.0, + "step": 7681 + }, + { + "epoch": 0.9772293601322987, + "ewc_loss": 0.0360424667596817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8021233699982986e-05, + "grad_norm": 25.88484001159668, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8720652461051941, + "num_tokens": 293002330.0, + "step": 7682 + }, + { + "epoch": 0.9773565704108892, + "ewc_loss": 0.03606720268726349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8033601008937694e-05, + "grad_norm": 25.897470474243164, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8604483008384705, + "num_tokens": 293038288.0, + "step": 7683 + }, + { + "epoch": 0.9774837806894797, + "ewc_loss": 0.03603193163871765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8015965906670317e-05, + "grad_norm": 25.842710494995117, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8623519539833069, + "num_tokens": 293078121.0, + "step": 7684 + }, + { + "epoch": 0.9776109909680702, + "ewc_loss": 0.03600558638572693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8002792785409838e-05, + "grad_norm": 25.95738410949707, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8579742312431335, + "num_tokens": 293116846.0, + "step": 7685 + }, + { + "epoch": 0.9777382012466608, + "ewc_loss": 0.03599852696061134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799926394596696e-05, + "grad_norm": 25.892385482788086, + "learning_rate": 1e-06, + "loss": 0.4615, + 
"mean_token_accuracy": 0.8568825721740723, + "num_tokens": 293162237.0, + "step": 7686 + }, + { + "epoch": 0.9778654115252512, + "ewc_loss": 0.035915736109018326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7957867385121062e-05, + "grad_norm": 25.940275192260742, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8576454520225525, + "num_tokens": 293196631.0, + "step": 7687 + }, + { + "epoch": 0.9779926218038417, + "ewc_loss": 0.03595515340566635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7977576135308482e-05, + "grad_norm": 25.75537872314453, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8718137145042419, + "num_tokens": 293238883.0, + "step": 7688 + }, + { + "epoch": 0.9781198320824323, + "ewc_loss": 0.03590182587504387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7950913388631307e-05, + "grad_norm": 25.892005920410156, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8505651950836182, + "num_tokens": 293270802.0, + "step": 7689 + }, + { + "epoch": 0.9782470423610228, + "ewc_loss": 0.03594638407230377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7973192370845936e-05, + "grad_norm": 25.77413558959961, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8475338220596313, + "num_tokens": 293314910.0, + "step": 7690 + }, + { + "epoch": 0.9783742526396133, + "ewc_loss": 0.035945404320955276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7972703062696382e-05, + "grad_norm": 25.792436599731445, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8738502860069275, + "num_tokens": 293353083.0, + "step": 7691 + }, + { + "epoch": 0.9785014629182038, + "ewc_loss": 0.035966042429208755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7983020370593295e-05, + "grad_norm": 25.79636001586914, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8638392686843872, + "num_tokens": 293389473.0, + "step": 7692 + }, + { + "epoch": 
0.9786286731967943, + "ewc_loss": 0.03593645617365837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.796822834876366e-05, + "grad_norm": 25.83915901184082, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8753049969673157, + "num_tokens": 293426893.0, + "step": 7693 + }, + { + "epoch": 0.9787558834753848, + "ewc_loss": 0.03600919991731644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8004600860876963e-05, + "grad_norm": 25.78229331970215, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8659942150115967, + "num_tokens": 293468116.0, + "step": 7694 + }, + { + "epoch": 0.9788830937539753, + "ewc_loss": 0.03595228120684624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7976140952669084e-05, + "grad_norm": 25.73779296875, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8632506132125854, + "num_tokens": 293503050.0, + "step": 7695 + }, + { + "epoch": 0.9790103040325658, + "ewc_loss": 0.035944338887929916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7972170098801143e-05, + "grad_norm": 25.75554656982422, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8506830930709839, + "num_tokens": 293541673.0, + "step": 7696 + }, + { + "epoch": 0.9791375143111564, + "ewc_loss": 0.03596532344818115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7982662029680796e-05, + "grad_norm": 25.769380569458008, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8587594032287598, + "num_tokens": 293578472.0, + "step": 7697 + }, + { + "epoch": 0.9792647245897469, + "ewc_loss": 0.03601091727614403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8005459423875436e-05, + "grad_norm": 25.713850021362305, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8623255491256714, + "num_tokens": 293611204.0, + "step": 7698 + }, + { + "epoch": 0.9793919348683373, + "ewc_loss": 0.03597056865692139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.798528501240071e-05, 
+ "grad_norm": 25.638891220092773, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8510090708732605, + "num_tokens": 293651352.0, + "step": 7699 + }, + { + "epoch": 0.9795191451469278, + "ewc_loss": 0.03605769947171211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8028849808615632e-05, + "grad_norm": 25.83836555480957, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8720619082450867, + "num_tokens": 293687396.0, + "step": 7700 + }, + { + "epoch": 0.9796463554255184, + "ewc_loss": 0.03613008186221123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.806504042178858e-05, + "grad_norm": 25.82598114013672, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8574608564376831, + "num_tokens": 293725083.0, + "step": 7701 + }, + { + "epoch": 0.9797735657041089, + "ewc_loss": 0.03597391024231911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.7986954844673164e-05, + "grad_norm": 25.667348861694336, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8620621562004089, + "num_tokens": 293766136.0, + "step": 7702 + }, + { + "epoch": 0.9799007759826994, + "ewc_loss": 0.03610479459166527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8052396626444533e-05, + "grad_norm": 25.85017204284668, + "learning_rate": 1e-06, + "loss": 0.5032, + "mean_token_accuracy": 0.8448752164840698, + "num_tokens": 293805810.0, + "step": 7703 + }, + { + "epoch": 0.98002798626129, + "ewc_loss": 0.0361008495092392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.805042484193109e-05, + "grad_norm": 25.703676223754883, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8528692126274109, + "num_tokens": 293839041.0, + "step": 7704 + }, + { + "epoch": 0.9801551965398804, + "ewc_loss": 0.03604787215590477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8023936718236655e-05, + "grad_norm": 25.785642623901367, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8762421607971191, 
+ "num_tokens": 293876541.0, + "step": 7705 + }, + { + "epoch": 0.9802824068184709, + "ewc_loss": 0.03615136444568634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8075681509799324e-05, + "grad_norm": 25.763668060302734, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8750039339065552, + "num_tokens": 293918141.0, + "step": 7706 + }, + { + "epoch": 0.9804096170970614, + "ewc_loss": 0.03617506101727486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808753040677402e-05, + "grad_norm": 25.867528915405273, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.849859893321991, + "num_tokens": 293958078.0, + "step": 7707 + }, + { + "epoch": 0.980536827375652, + "ewc_loss": 0.03613945469260216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.806972795748152e-05, + "grad_norm": 25.766822814941406, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8707524538040161, + "num_tokens": 293990347.0, + "step": 7708 + }, + { + "epoch": 0.9806640376542425, + "ewc_loss": 0.036155883222818375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.807794251362793e-05, + "grad_norm": 25.918249130249023, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.866693913936615, + "num_tokens": 294032136.0, + "step": 7709 + }, + { + "epoch": 0.980791247932833, + "ewc_loss": 0.036179542541503906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808977140171919e-05, + "grad_norm": 25.780363082885742, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8490034341812134, + "num_tokens": 294065136.0, + "step": 7710 + }, + { + "epoch": 0.9809184582114235, + "ewc_loss": 0.03613976389169693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.806988257158082e-05, + "grad_norm": 25.842060089111328, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8752573132514954, + "num_tokens": 294104539.0, + "step": 7711 + }, + { + "epoch": 0.981045668490014, + "ewc_loss": 0.03613777831196785, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8068889403366484e-05, + "grad_norm": 25.783000946044922, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8584500551223755, + "num_tokens": 294148962.0, + "step": 7712 + }, + { + "epoch": 0.9811728787686045, + "ewc_loss": 0.03617138788104057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808569322747644e-05, + "grad_norm": 25.95037078857422, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8649865388870239, + "num_tokens": 294181866.0, + "step": 7713 + }, + { + "epoch": 0.981300089047195, + "ewc_loss": 0.03613590449094772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8067952623823658e-05, + "grad_norm": 25.797401428222656, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8696684837341309, + "num_tokens": 294223558.0, + "step": 7714 + }, + { + "epoch": 0.9814272993257855, + "ewc_loss": 0.036140408366918564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8070204532705247e-05, + "grad_norm": 25.940887451171875, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.865894079208374, + "num_tokens": 294260487.0, + "step": 7715 + }, + { + "epoch": 0.9815545096043761, + "ewc_loss": 0.03614956513047218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8074782929033972e-05, + "grad_norm": 25.846702575683594, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8620322942733765, + "num_tokens": 294302642.0, + "step": 7716 + }, + { + "epoch": 0.9816817198829666, + "ewc_loss": 0.036044035106897354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8022017684415914e-05, + "grad_norm": 25.87590789794922, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8527882099151611, + "num_tokens": 294338376.0, + "step": 7717 + }, + { + "epoch": 0.981808930161557, + "ewc_loss": 0.036067020148038864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8033510059467517e-05, + "grad_norm": 25.86618423461914, + 
"learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8555768728256226, + "num_tokens": 294372089.0, + "step": 7718 + }, + { + "epoch": 0.9819361404401475, + "ewc_loss": 0.0360645093023777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.803225495677907e-05, + "grad_norm": 25.87722396850586, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8658981919288635, + "num_tokens": 294403801.0, + "step": 7719 + }, + { + "epoch": 0.9820633507187381, + "ewc_loss": 0.036047980189323425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8023989468929358e-05, + "grad_norm": 25.887510299682617, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.864240288734436, + "num_tokens": 294440301.0, + "step": 7720 + }, + { + "epoch": 0.9821905609973286, + "ewc_loss": 0.03602847456932068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.801423786673695e-05, + "grad_norm": 25.78177261352539, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8619674444198608, + "num_tokens": 294472967.0, + "step": 7721 + }, + { + "epoch": 0.9823177712759191, + "ewc_loss": 0.036074474453926086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8037237168755382e-05, + "grad_norm": 25.8338623046875, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8512714505195618, + "num_tokens": 294518248.0, + "step": 7722 + }, + { + "epoch": 0.9824449815545097, + "ewc_loss": 0.03606578707695007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8032893422059715e-05, + "grad_norm": 25.84532928466797, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.860019326210022, + "num_tokens": 294553899.0, + "step": 7723 + }, + { + "epoch": 0.9825721918331001, + "ewc_loss": 0.0361262783408165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8063139577861875e-05, + "grad_norm": 25.842758178710938, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.8390059471130371, + "num_tokens": 294591382.0, + 
"step": 7724 + }, + { + "epoch": 0.9826994021116906, + "ewc_loss": 0.0361209511756897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.806047475838568e-05, + "grad_norm": 25.923194885253906, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8614463210105896, + "num_tokens": 294629606.0, + "step": 7725 + }, + { + "epoch": 0.9828266123902811, + "ewc_loss": 0.036102015525102615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8051006918540224e-05, + "grad_norm": 25.820392608642578, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8643487691879272, + "num_tokens": 294669417.0, + "step": 7726 + }, + { + "epoch": 0.9829538226688717, + "ewc_loss": 0.036109648644924164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8054824977298267e-05, + "grad_norm": 25.828369140625, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8705388307571411, + "num_tokens": 294709876.0, + "step": 7727 + }, + { + "epoch": 0.9830810329474622, + "ewc_loss": 0.03606931120157242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.803465602279175e-05, + "grad_norm": 25.76053810119629, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8515838980674744, + "num_tokens": 294750565.0, + "step": 7728 + }, + { + "epoch": 0.9832082432260527, + "ewc_loss": 0.036128751933574677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8064376490656286e-05, + "grad_norm": 25.83207893371582, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8619707822799683, + "num_tokens": 294790586.0, + "step": 7729 + }, + { + "epoch": 0.9833354535046431, + "ewc_loss": 0.03611431643366814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8057158740703017e-05, + "grad_norm": 25.71811866760254, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8595211505889893, + "num_tokens": 294830176.0, + "step": 7730 + }, + { + "epoch": 0.9834626637832337, + "ewc_loss": 0.036126937717199326, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.8063468814943917e-05, + "grad_norm": 25.856592178344727, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8486285209655762, + "num_tokens": 294874199.0, + "step": 7731 + }, + { + "epoch": 0.9835898740618242, + "ewc_loss": 0.036216143518686295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8108072254108265e-05, + "grad_norm": 25.820537567138672, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.868550717830658, + "num_tokens": 294914219.0, + "step": 7732 + }, + { + "epoch": 0.9837170843404147, + "ewc_loss": 0.036100152879953384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.805007559596561e-05, + "grad_norm": 25.90835189819336, + "learning_rate": 1e-06, + "loss": 0.5104, + "mean_token_accuracy": 0.8384958505630493, + "num_tokens": 294950726.0, + "step": 7733 + }, + { + "epoch": 0.9838442946190052, + "ewc_loss": 0.03609742596745491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8048713172902353e-05, + "grad_norm": 25.72907066345215, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8607152700424194, + "num_tokens": 294993815.0, + "step": 7734 + }, + { + "epoch": 0.9839715048975958, + "ewc_loss": 0.03609549626708031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.804774728952907e-05, + "grad_norm": 25.853883743286133, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8687721490859985, + "num_tokens": 295029275.0, + "step": 7735 + }, + { + "epoch": 0.9840987151761862, + "ewc_loss": 0.03609585016965866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8047925550490618e-05, + "grad_norm": 25.828325271606445, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8609338998794556, + "num_tokens": 295066166.0, + "step": 7736 + }, + { + "epoch": 0.9842259254547767, + "ewc_loss": 0.036173418164253235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808670822356362e-05, + "grad_norm": 25.970626831054688, + "learning_rate": 1e-06, + "loss": 
0.5263, + "mean_token_accuracy": 0.8393267393112183, + "num_tokens": 295100436.0, + "step": 7737 + }, + { + "epoch": 0.9843531357333672, + "ewc_loss": 0.03614431619644165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8072158127324656e-05, + "grad_norm": 25.758333206176758, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8623745441436768, + "num_tokens": 295141756.0, + "step": 7738 + }, + { + "epoch": 0.9844803460119578, + "ewc_loss": 0.03607996925711632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8039983842754737e-05, + "grad_norm": 25.859516143798828, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.878307044506073, + "num_tokens": 295178894.0, + "step": 7739 + }, + { + "epoch": 0.9846075562905483, + "ewc_loss": 0.03608620539307594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8043103409581818e-05, + "grad_norm": 25.847700119018555, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8509699106216431, + "num_tokens": 295219133.0, + "step": 7740 + }, + { + "epoch": 0.9847347665691388, + "ewc_loss": 0.03608119487762451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.804059684218373e-05, + "grad_norm": 25.737123489379883, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8626514673233032, + "num_tokens": 295260322.0, + "step": 7741 + }, + { + "epoch": 0.9848619768477292, + "ewc_loss": 0.036105163395404816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8052582163363695e-05, + "grad_norm": 25.933486938476562, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8550223112106323, + "num_tokens": 295299675.0, + "step": 7742 + }, + { + "epoch": 0.9849891871263198, + "ewc_loss": 0.0361696258187294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8084812836605124e-05, + "grad_norm": 25.827157974243164, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8669462203979492, + "num_tokens": 295336281.0, + "step": 7743 + }, + { + "epoch": 
0.9851163974049103, + "ewc_loss": 0.03611154481768608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8055772670777515e-05, + "grad_norm": 25.793298721313477, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8598374128341675, + "num_tokens": 295376932.0, + "step": 7744 + }, + { + "epoch": 0.9852436076835008, + "ewc_loss": 0.03611773997545242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8058870409731753e-05, + "grad_norm": 25.90955924987793, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8581076860427856, + "num_tokens": 295421904.0, + "step": 7745 + }, + { + "epoch": 0.9853708179620914, + "ewc_loss": 0.036099839955568314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8049919162876904e-05, + "grad_norm": 25.85782814025879, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8637795448303223, + "num_tokens": 295456688.0, + "step": 7746 + }, + { + "epoch": 0.9854980282406819, + "ewc_loss": 0.036067843437194824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8033921151072718e-05, + "grad_norm": 25.960491180419922, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8779154419898987, + "num_tokens": 295500022.0, + "step": 7747 + }, + { + "epoch": 0.9856252385192723, + "ewc_loss": 0.036099888384342194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8049944628728554e-05, + "grad_norm": 25.870988845825195, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8808897733688354, + "num_tokens": 295538106.0, + "step": 7748 + }, + { + "epoch": 0.9857524487978628, + "ewc_loss": 0.03601031005382538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8005155652645044e-05, + "grad_norm": 25.8840389251709, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8574334383010864, + "num_tokens": 295572465.0, + "step": 7749 + }, + { + "epoch": 0.9858796590764534, + "ewc_loss": 0.03600082919001579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8000415366259404e-05, + "grad_norm": 25.77450942993164, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8670444488525391, + "num_tokens": 295614577.0, + "step": 7750 + }, + { + "epoch": 0.9860068693550439, + "ewc_loss": 0.036052148789167404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.802607403078582e-05, + "grad_norm": 25.83274269104004, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8658509850502014, + "num_tokens": 295654780.0, + "step": 7751 + }, + { + "epoch": 0.9861340796336344, + "ewc_loss": 0.035990308970212936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.799515484890435e-05, + "grad_norm": 25.665956497192383, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8602718710899353, + "num_tokens": 295695880.0, + "step": 7752 + }, + { + "epoch": 0.986261289912225, + "ewc_loss": 0.03603072464466095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.801536200218834e-05, + "grad_norm": 25.746902465820312, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8612488508224487, + "num_tokens": 295738176.0, + "step": 7753 + }, + { + "epoch": 0.9863885001908154, + "ewc_loss": 0.03613802045583725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8069009456667118e-05, + "grad_norm": 25.866586685180664, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8505141139030457, + "num_tokens": 295777562.0, + "step": 7754 + }, + { + "epoch": 0.9865157104694059, + "ewc_loss": 0.036078453063964844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.803922714316286e-05, + "grad_norm": 25.818603515625, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8766728043556213, + "num_tokens": 295815013.0, + "step": 7755 + }, + { + "epoch": 0.9866429207479964, + "ewc_loss": 0.03611426800489426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8057133274851367e-05, + "grad_norm": 25.90538787841797, + "learning_rate": 1e-06, + "loss": 0.4765, + 
"mean_token_accuracy": 0.8498201370239258, + "num_tokens": 295857751.0, + "step": 7756 + }, + { + "epoch": 0.986770131026587, + "ewc_loss": 0.03601330146193504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.800665086193476e-05, + "grad_norm": 25.81911849975586, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8660666942596436, + "num_tokens": 295896033.0, + "step": 7757 + }, + { + "epoch": 0.9868973413051775, + "ewc_loss": 0.036153536289930344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.807676744647324e-05, + "grad_norm": 25.93989372253418, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8635914921760559, + "num_tokens": 295936838.0, + "step": 7758 + }, + { + "epoch": 0.987024551583768, + "ewc_loss": 0.036044519394636154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8022259610006586e-05, + "grad_norm": 25.79317855834961, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.866059422492981, + "num_tokens": 295972307.0, + "step": 7759 + }, + { + "epoch": 0.9871517618623584, + "ewc_loss": 0.03609127551317215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8045637261820957e-05, + "grad_norm": 25.81631851196289, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8591723442077637, + "num_tokens": 296009966.0, + "step": 7760 + }, + { + "epoch": 0.987278972140949, + "ewc_loss": 0.03605033829808235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.802516999305226e-05, + "grad_norm": 25.817279815673828, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8579666018486023, + "num_tokens": 296047874.0, + "step": 7761 + }, + { + "epoch": 0.9874061824195395, + "ewc_loss": 0.03607020154595375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.803510167519562e-05, + "grad_norm": 25.72283172607422, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8682108521461487, + "num_tokens": 296090591.0, + "step": 7762 + }, + { + "epoch": 0.98753339269813, + 
"ewc_loss": 0.03609554097056389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8047770936391316e-05, + "grad_norm": 25.85269546508789, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8553478717803955, + "num_tokens": 296133024.0, + "step": 7763 + }, + { + "epoch": 0.9876606029767205, + "ewc_loss": 0.036099329590797424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8049664504360408e-05, + "grad_norm": 25.81844711303711, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8502875566482544, + "num_tokens": 296177347.0, + "step": 7764 + }, + { + "epoch": 0.9877878132553111, + "ewc_loss": 0.03609162196516991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8045811884803697e-05, + "grad_norm": 25.86086654663086, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8519799709320068, + "num_tokens": 296215532.0, + "step": 7765 + }, + { + "epoch": 0.9879150235339016, + "ewc_loss": 0.03610165789723396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8050828657578677e-05, + "grad_norm": 25.813079833984375, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8491318225860596, + "num_tokens": 296249494.0, + "step": 7766 + }, + { + "epoch": 0.988042233812492, + "ewc_loss": 0.03609849140048027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8049246136797592e-05, + "grad_norm": 25.818899154663086, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8527268767356873, + "num_tokens": 296289370.0, + "step": 7767 + }, + { + "epoch": 0.9881694440910825, + "ewc_loss": 0.03610755875706673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.805377905839123e-05, + "grad_norm": 25.73311424255371, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8633151054382324, + "num_tokens": 296329376.0, + "step": 7768 + }, + { + "epoch": 0.9882966543696731, + "ewc_loss": 0.036154963076114655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8077482309308834e-05, + "grad_norm": 
25.821693420410156, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8677660226821899, + "num_tokens": 296368875.0, + "step": 7769 + }, + { + "epoch": 0.9884238646482636, + "ewc_loss": 0.03617248311638832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8086240743286908e-05, + "grad_norm": 25.859031677246094, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8612666726112366, + "num_tokens": 296404766.0, + "step": 7770 + }, + { + "epoch": 0.9885510749268541, + "ewc_loss": 0.03612741827964783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8063708921545185e-05, + "grad_norm": 25.78082847595215, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8600285053253174, + "num_tokens": 296441705.0, + "step": 7771 + }, + { + "epoch": 0.9886782852054447, + "ewc_loss": 0.03618389368057251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.809194691304583e-05, + "grad_norm": 25.796302795410156, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8679175972938538, + "num_tokens": 296481075.0, + "step": 7772 + }, + { + "epoch": 0.9888054954840351, + "ewc_loss": 0.0361395962536335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8069798898068257e-05, + "grad_norm": 25.84454917907715, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8574475049972534, + "num_tokens": 296511404.0, + "step": 7773 + }, + { + "epoch": 0.9889327057626256, + "ewc_loss": 0.036174118518829346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8087059288518503e-05, + "grad_norm": 25.781414031982422, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8526334762573242, + "num_tokens": 296546080.0, + "step": 7774 + }, + { + "epoch": 0.9890599160412161, + "ewc_loss": 0.03614287078380585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8071435988531448e-05, + "grad_norm": 25.773195266723633, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8627414703369141, + 
"num_tokens": 296586119.0, + "step": 7775 + }, + { + "epoch": 0.9891871263198067, + "ewc_loss": 0.03621562942862511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.810781395761296e-05, + "grad_norm": 25.848052978515625, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8587425947189331, + "num_tokens": 296629602.0, + "step": 7776 + }, + { + "epoch": 0.9893143365983972, + "ewc_loss": 0.036161262542009354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8080630979966372e-05, + "grad_norm": 25.868337631225586, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8543999791145325, + "num_tokens": 296666709.0, + "step": 7777 + }, + { + "epoch": 0.9894415468769877, + "ewc_loss": 0.03619437292218208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8097185602528043e-05, + "grad_norm": 25.897178649902344, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8617189526557922, + "num_tokens": 296702611.0, + "step": 7778 + }, + { + "epoch": 0.9895687571555781, + "ewc_loss": 0.036163486540317535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8081744201481342e-05, + "grad_norm": 25.790922164916992, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8654522895812988, + "num_tokens": 296737649.0, + "step": 7779 + }, + { + "epoch": 0.9896959674341687, + "ewc_loss": 0.036173827946186066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808691376936622e-05, + "grad_norm": 25.846433639526367, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8642398715019226, + "num_tokens": 296776915.0, + "step": 7780 + }, + { + "epoch": 0.9898231777127592, + "ewc_loss": 0.03619766980409622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.809883542591706e-05, + "grad_norm": 25.877151489257812, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8669275045394897, + "num_tokens": 296811491.0, + "step": 7781 + }, + { + "epoch": 0.9899503879913497, + "ewc_loss": 
0.03620373085141182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8101865862263367e-05, + "grad_norm": 25.877838134765625, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8652500510215759, + "num_tokens": 296847602.0, + "step": 7782 + }, + { + "epoch": 0.9900775982699402, + "ewc_loss": 0.036200374364852905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8100186935043894e-05, + "grad_norm": 25.864887237548828, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8444172143936157, + "num_tokens": 296884737.0, + "step": 7783 + }, + { + "epoch": 0.9902048085485308, + "ewc_loss": 0.036217041313648224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.810852154449094e-05, + "grad_norm": 25.86026382446289, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8489438891410828, + "num_tokens": 296927780.0, + "step": 7784 + }, + { + "epoch": 0.9903320188271212, + "ewc_loss": 0.03622257336974144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.811128640838433e-05, + "grad_norm": 25.879695892333984, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8748669624328613, + "num_tokens": 296964200.0, + "step": 7785 + }, + { + "epoch": 0.9904592291057117, + "ewc_loss": 0.03621796891093254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.810898356779944e-05, + "grad_norm": 25.85454750061035, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.852814793586731, + "num_tokens": 297009927.0, + "step": 7786 + }, + { + "epoch": 0.9905864393843022, + "ewc_loss": 0.036209315061569214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.810465801099781e-05, + "grad_norm": 25.90707015991211, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8486045598983765, + "num_tokens": 297050788.0, + "step": 7787 + }, + { + "epoch": 0.9907136496628928, + "ewc_loss": 0.03618337959051132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.809169043553993e-05, + "grad_norm": 
25.910140991210938, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8666238784790039, + "num_tokens": 297092398.0, + "step": 7788 + }, + { + "epoch": 0.9908408599414833, + "ewc_loss": 0.03622538968920708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8112694306182675e-05, + "grad_norm": 25.869604110717773, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8588032722473145, + "num_tokens": 297130728.0, + "step": 7789 + }, + { + "epoch": 0.9909680702200738, + "ewc_loss": 0.03613027557730675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8065138647216372e-05, + "grad_norm": 25.880525588989258, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8845367431640625, + "num_tokens": 297169124.0, + "step": 7790 + }, + { + "epoch": 0.9910952804986642, + "ewc_loss": 0.03619445487856865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8097227439284325e-05, + "grad_norm": 25.86630630493164, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.870336651802063, + "num_tokens": 297204914.0, + "step": 7791 + }, + { + "epoch": 0.9912224907772548, + "ewc_loss": 0.03615550696849823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8077753338729963e-05, + "grad_norm": 25.882795333862305, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.85877925157547, + "num_tokens": 297240551.0, + "step": 7792 + }, + { + "epoch": 0.9913497010558453, + "ewc_loss": 0.03617607057094574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8088036085828207e-05, + "grad_norm": 25.73890495300293, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8533111810684204, + "num_tokens": 297287160.0, + "step": 7793 + }, + { + "epoch": 0.9914769113344358, + "ewc_loss": 0.0362003818154335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.81001905730227e-05, + "grad_norm": 25.950542449951172, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8627105951309204, + 
"num_tokens": 297323394.0, + "step": 7794 + }, + { + "epoch": 0.9916041216130264, + "ewc_loss": 0.03620978817343712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.810489447962027e-05, + "grad_norm": 25.76325035095215, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8393417000770569, + "num_tokens": 297357808.0, + "step": 7795 + }, + { + "epoch": 0.9917313318916169, + "ewc_loss": 0.03608635067939758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.804317616915796e-05, + "grad_norm": 25.878679275512695, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8561604022979736, + "num_tokens": 297397300.0, + "step": 7796 + }, + { + "epoch": 0.9918585421702073, + "ewc_loss": 0.036205656826496124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8102828107657842e-05, + "grad_norm": 25.802011489868164, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8696774244308472, + "num_tokens": 297434699.0, + "step": 7797 + }, + { + "epoch": 0.9919857524487978, + "ewc_loss": 0.036173753440380096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808687738957815e-05, + "grad_norm": 25.950119018554688, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8605124950408936, + "num_tokens": 297473939.0, + "step": 7798 + }, + { + "epoch": 0.9921129627273884, + "ewc_loss": 0.03619743883609772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8098719010595232e-05, + "grad_norm": 25.87860679626465, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8529443740844727, + "num_tokens": 297511154.0, + "step": 7799 + }, + { + "epoch": 0.9922401730059789, + "ewc_loss": 0.03605630621314049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8028153135674074e-05, + "grad_norm": 25.789276123046875, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8521701693534851, + "num_tokens": 297556092.0, + "step": 7800 + }, + { + "epoch": 0.9923673832845694, + "ewc_loss": 0.036185335367918015, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8092667232849635e-05, + "grad_norm": 25.888206481933594, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8656653165817261, + "num_tokens": 297598160.0, + "step": 7801 + }, + { + "epoch": 0.9924945935631599, + "ewc_loss": 0.03611097112298012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8055485270451754e-05, + "grad_norm": 25.7955379486084, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8768832683563232, + "num_tokens": 297638706.0, + "step": 7802 + }, + { + "epoch": 0.9926218038417504, + "ewc_loss": 0.03613300994038582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8066504708258435e-05, + "grad_norm": 25.88848304748535, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8656715154647827, + "num_tokens": 297675955.0, + "step": 7803 + }, + { + "epoch": 0.9927490141203409, + "ewc_loss": 0.03618735820055008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8093678590958007e-05, + "grad_norm": 25.863296508789062, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8561472296714783, + "num_tokens": 297715362.0, + "step": 7804 + }, + { + "epoch": 0.9928762243989314, + "ewc_loss": 0.036140840500593185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.807042099244427e-05, + "grad_norm": 25.906307220458984, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8462647795677185, + "num_tokens": 297750931.0, + "step": 7805 + }, + { + "epoch": 0.993003434677522, + "ewc_loss": 0.0361831858754158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.809159221011214e-05, + "grad_norm": 25.806575775146484, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8527587652206421, + "num_tokens": 297786107.0, + "step": 7806 + }, + { + "epoch": 0.9931306449561125, + "ewc_loss": 0.03615017235279083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8075086700264364e-05, + "grad_norm": 25.917510986328125, + "learning_rate": 
1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8603916168212891, + "num_tokens": 297824782.0, + "step": 7807 + }, + { + "epoch": 0.993257855234703, + "ewc_loss": 0.03618649020791054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8093245671479963e-05, + "grad_norm": 25.93766975402832, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8565691709518433, + "num_tokens": 297861748.0, + "step": 7808 + }, + { + "epoch": 0.9933850655132934, + "ewc_loss": 0.03614190220832825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.80709503183607e-05, + "grad_norm": 25.809616088867188, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8606566190719604, + "num_tokens": 297902722.0, + "step": 7809 + }, + { + "epoch": 0.993512275791884, + "ewc_loss": 0.03614896163344383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8074480976792984e-05, + "grad_norm": 25.80095100402832, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8620691895484924, + "num_tokens": 297948709.0, + "step": 7810 + }, + { + "epoch": 0.9936394860704745, + "ewc_loss": 0.03622773289680481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.811386573535856e-05, + "grad_norm": 25.93644142150879, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8627298474311829, + "num_tokens": 297986446.0, + "step": 7811 + }, + { + "epoch": 0.993766696349065, + "ewc_loss": 0.03617284074425697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808642082323786e-05, + "grad_norm": 25.787687301635742, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8532471656799316, + "num_tokens": 298024004.0, + "step": 7812 + }, + { + "epoch": 0.9938939066276555, + "ewc_loss": 0.036152034997940063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.807601802283898e-05, + "grad_norm": 25.80896759033203, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8510524034500122, + "num_tokens": 298065518.0, + "step": 7813 + }, + { + 
"epoch": 0.9940211169062461, + "ewc_loss": 0.03622505068778992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8112525140168145e-05, + "grad_norm": 25.777984619140625, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8535333871841431, + "num_tokens": 298108138.0, + "step": 7814 + }, + { + "epoch": 0.9941483271848366, + "ewc_loss": 0.03629323095083237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8146614820579998e-05, + "grad_norm": 26.10177993774414, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8599652051925659, + "num_tokens": 298149348.0, + "step": 7815 + }, + { + "epoch": 0.994275537463427, + "ewc_loss": 0.036250751465559006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8125376300304197e-05, + "grad_norm": 25.82917022705078, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8586120009422302, + "num_tokens": 298186347.0, + "step": 7816 + }, + { + "epoch": 0.9944027477420175, + "ewc_loss": 0.03616119176149368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8080596419167705e-05, + "grad_norm": 25.997703552246094, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8402965068817139, + "num_tokens": 298223322.0, + "step": 7817 + }, + { + "epoch": 0.9945299580206081, + "ewc_loss": 0.036278434097766876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8139216990675777e-05, + "grad_norm": 25.918643951416016, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8552470207214355, + "num_tokens": 298258794.0, + "step": 7818 + }, + { + "epoch": 0.9946571682991986, + "ewc_loss": 0.03612861409783363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8064307369058952e-05, + "grad_norm": 25.862693786621094, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8547389507293701, + "num_tokens": 298297510.0, + "step": 7819 + }, + { + "epoch": 0.9947843785777891, + "ewc_loss": 0.03617675229907036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.808837623684667e-05, + "grad_norm": 25.790422439575195, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.864346444606781, + "num_tokens": 298333784.0, + "step": 7820 + }, + { + "epoch": 0.9949115888563796, + "ewc_loss": 0.03622213751077652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8111068129655905e-05, + "grad_norm": 25.97820281982422, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8666695356369019, + "num_tokens": 298371241.0, + "step": 7821 + }, + { + "epoch": 0.9950387991349701, + "ewc_loss": 0.03627156838774681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8135784557671286e-05, + "grad_norm": 25.838138580322266, + "learning_rate": 1e-06, + "loss": 0.4982, + "mean_token_accuracy": 0.8463003635406494, + "num_tokens": 298407902.0, + "step": 7822 + }, + { + "epoch": 0.9951660094135606, + "ewc_loss": 0.03621215000748634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.810607500374317e-05, + "grad_norm": 25.95576286315918, + "learning_rate": 1e-06, + "loss": 0.5262, + "mean_token_accuracy": 0.835628867149353, + "num_tokens": 298452339.0, + "step": 7823 + }, + { + "epoch": 0.9952932196921511, + "ewc_loss": 0.03621289134025574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8106446077581495e-05, + "grad_norm": 25.79978370666504, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8509287238121033, + "num_tokens": 298495854.0, + "step": 7824 + }, + { + "epoch": 0.9954204299707416, + "ewc_loss": 0.036239899694919586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.811995025491342e-05, + "grad_norm": 25.92619514465332, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8544467091560364, + "num_tokens": 298529642.0, + "step": 7825 + }, + { + "epoch": 0.9955476402493322, + "ewc_loss": 0.036302484571933746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.815124232962262e-05, + "grad_norm": 25.892385482788086, + "learning_rate": 1e-06, + "loss": 0.4491, + 
"mean_token_accuracy": 0.8623858094215393, + "num_tokens": 298574396.0, + "step": 7826 + }, + { + "epoch": 0.9956748505279227, + "ewc_loss": 0.03622058406472206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.811029142118059e-05, + "grad_norm": 25.848655700683594, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8466572761535645, + "num_tokens": 298612457.0, + "step": 7827 + }, + { + "epoch": 0.9958020608065131, + "ewc_loss": 0.03616941347718239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.808470733521972e-05, + "grad_norm": 25.89252471923828, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8605458736419678, + "num_tokens": 298645061.0, + "step": 7828 + }, + { + "epoch": 0.9959292710851037, + "ewc_loss": 0.03624001145362854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8120004824595526e-05, + "grad_norm": 25.853364944458008, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.858370304107666, + "num_tokens": 298688014.0, + "step": 7829 + }, + { + "epoch": 0.9960564813636942, + "ewc_loss": 0.03623638674616814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8118193111149594e-05, + "grad_norm": 25.94328498840332, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8635324239730835, + "num_tokens": 298727815.0, + "step": 7830 + }, + { + "epoch": 0.9961836916422847, + "ewc_loss": 0.03619357943534851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.809678906283807e-05, + "grad_norm": 25.834579467773438, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8808261156082153, + "num_tokens": 298765663.0, + "step": 7831 + }, + { + "epoch": 0.9963109019208752, + "ewc_loss": 0.03624453768134117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.812226946640294e-05, + "grad_norm": 25.849353790283203, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8638882637023926, + "num_tokens": 298799360.0, + "step": 7832 + }, + { + "epoch": 
0.9964381121994658, + "ewc_loss": 0.036245208233594894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8122604160453193e-05, + "grad_norm": 25.924238204956055, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.859898030757904, + "num_tokens": 298835669.0, + "step": 7833 + }, + { + "epoch": 0.9965653224780562, + "ewc_loss": 0.03619732707738876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8098664440913126e-05, + "grad_norm": 25.759517669677734, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8581784963607788, + "num_tokens": 298866871.0, + "step": 7834 + }, + { + "epoch": 0.9966925327566467, + "ewc_loss": 0.0362425372004509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.812126902223099e-05, + "grad_norm": 25.975793838500977, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8502377867698669, + "num_tokens": 298904136.0, + "step": 7835 + }, + { + "epoch": 0.9968197430352372, + "ewc_loss": 0.03631005808711052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.81550294655608e-05, + "grad_norm": 25.829601287841797, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8721356987953186, + "num_tokens": 298937069.0, + "step": 7836 + }, + { + "epoch": 0.9969469533138278, + "ewc_loss": 0.03624274954199791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8121374523616396e-05, + "grad_norm": 25.805055618286133, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8521980047225952, + "num_tokens": 298976778.0, + "step": 7837 + }, + { + "epoch": 0.9970741635924183, + "ewc_loss": 0.036320917308330536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8160459148930386e-05, + "grad_norm": 25.907371520996094, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8708780407905579, + "num_tokens": 299010808.0, + "step": 7838 + }, + { + "epoch": 0.9972013738710088, + "ewc_loss": 0.036310262978076935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.81551313289674e-05, + "grad_norm": 26.008323669433594, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8495222330093384, + "num_tokens": 299047611.0, + "step": 7839 + }, + { + "epoch": 0.9973285841495992, + "ewc_loss": 0.03629060834646225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.814530332922004e-05, + "grad_norm": 25.73075294494629, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8654391765594482, + "num_tokens": 299088668.0, + "step": 7840 + }, + { + "epoch": 0.9974557944281898, + "ewc_loss": 0.0363025963306427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.815129871829413e-05, + "grad_norm": 26.056724548339844, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8401715755462646, + "num_tokens": 299126572.0, + "step": 7841 + }, + { + "epoch": 0.9975830047067803, + "ewc_loss": 0.03638264909386635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8191323761129752e-05, + "grad_norm": 25.83150291442871, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8717881441116333, + "num_tokens": 299164806.0, + "step": 7842 + }, + { + "epoch": 0.9977102149853708, + "ewc_loss": 0.03626827895641327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.813414019125048e-05, + "grad_norm": 25.880828857421875, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8560464978218079, + "num_tokens": 299207039.0, + "step": 7843 + }, + { + "epoch": 0.9978374252639614, + "ewc_loss": 0.036357562988996506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8178781829192303e-05, + "grad_norm": 25.850339889526367, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8699188232421875, + "num_tokens": 299243242.0, + "step": 7844 + }, + { + "epoch": 0.9979646355425519, + "ewc_loss": 0.0362992100417614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8149605239159428e-05, + "grad_norm": 25.93631362915039, + "learning_rate": 1e-06, + "loss": 0.4355, + 
"mean_token_accuracy": 0.8660807609558105, + "num_tokens": 299282326.0, + "step": 7845 + }, + { + "epoch": 0.9980918458211423, + "ewc_loss": 0.036353886127471924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8176942830905318e-05, + "grad_norm": 25.86734390258789, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8885951042175293, + "num_tokens": 299316645.0, + "step": 7846 + }, + { + "epoch": 0.9982190560997328, + "ewc_loss": 0.03628765419125557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8143826309824362e-05, + "grad_norm": 25.913297653198242, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8620458841323853, + "num_tokens": 299349932.0, + "step": 7847 + }, + { + "epoch": 0.9983462663783234, + "ewc_loss": 0.03638344630599022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8191723938798532e-05, + "grad_norm": 25.86835289001465, + "learning_rate": 1e-06, + "loss": 0.5052, + "mean_token_accuracy": 0.839126467704773, + "num_tokens": 299392452.0, + "step": 7848 + }, + { + "epoch": 0.9984734766569139, + "ewc_loss": 0.036290667951107025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8145334252039902e-05, + "grad_norm": 25.805986404418945, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8653131723403931, + "num_tokens": 299431479.0, + "step": 7849 + }, + { + "epoch": 0.9986006869355044, + "ewc_loss": 0.03635086864233017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.817543488868978e-05, + "grad_norm": 25.831172943115234, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8578943014144897, + "num_tokens": 299475585.0, + "step": 7850 + }, + { + "epoch": 0.9987278972140949, + "ewc_loss": 0.036317214369773865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8158607417717576e-05, + "grad_norm": 25.803272247314453, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.875617265701294, + "num_tokens": 299514026.0, + "step": 7851 + }, + { + "epoch": 
0.9988551074926854, + "ewc_loss": 0.03632406145334244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.816203075577505e-05, + "grad_norm": 25.703630447387695, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8498239517211914, + "num_tokens": 299550858.0, + "step": 7852 + }, + { + "epoch": 0.9989823177712759, + "ewc_loss": 0.03634290024638176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.817144948290661e-05, + "grad_norm": 25.822185516357422, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8530186414718628, + "num_tokens": 299586831.0, + "step": 7853 + }, + { + "epoch": 0.9991095280498664, + "ewc_loss": 0.03640655428171158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.820327634050045e-05, + "grad_norm": 25.85651969909668, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8706862926483154, + "num_tokens": 299623116.0, + "step": 7854 + }, + { + "epoch": 0.9992367383284569, + "ewc_loss": 0.03640948235988617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8204740626970306e-05, + "grad_norm": 25.913789749145508, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8630008101463318, + "num_tokens": 299662165.0, + "step": 7855 + }, + { + "epoch": 0.9993639486070475, + "ewc_loss": 0.03638123720884323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8190617993241176e-05, + "grad_norm": 25.856311798095703, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8572869896888733, + "num_tokens": 299702581.0, + "step": 7856 + }, + { + "epoch": 0.999491158885638, + "ewc_loss": 0.03630942851305008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8154714780393988e-05, + "grad_norm": 25.842777252197266, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8461699485778809, + "num_tokens": 299742559.0, + "step": 7857 + }, + { + "epoch": 0.9996183691642284, + "ewc_loss": 0.03632114455103874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8160571926273406e-05, + "grad_norm": 25.895090103149414, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8766567707061768, + "num_tokens": 299778637.0, + "step": 7858 + }, + { + "epoch": 0.9997455794428189, + "ewc_loss": 0.03638872504234314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8194363292423077e-05, + "grad_norm": 25.90788459777832, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8701120615005493, + "num_tokens": 299812808.0, + "step": 7859 + }, + { + "epoch": 0.9998727897214095, + "ewc_loss": 0.036321498453617096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.816074836824555e-05, + "grad_norm": 25.829757690429688, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8568214178085327, + "num_tokens": 299848987.0, + "step": 7860 + }, + { + "epoch": 1.0, + "ewc_loss": 0.036318663507699966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8159331375500187e-05, + "grad_norm": 25.88335418701172, + "learning_rate": 1e-06, + "loss": 0.5212, + "mean_token_accuracy": 0.8421339988708496, + "num_tokens": 299886286.0, + "step": 7861 + }, + { + "epoch": 1.0001272102785905, + "ewc_loss": 0.03637489676475525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.81874493137002e-05, + "grad_norm": 25.745737075805664, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8542734384536743, + "num_tokens": 299925456.0, + "step": 7862 + }, + { + "epoch": 1.000254420557181, + "ewc_loss": 0.03629593923687935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.814796996768564e-05, + "grad_norm": 25.930360794067383, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8579927682876587, + "num_tokens": 299965936.0, + "step": 7863 + }, + { + "epoch": 1.0003816308357716, + "ewc_loss": 0.03646111488342285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8230557543574832e-05, + "grad_norm": 25.893251419067383, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 
0.8735625743865967, + "num_tokens": 300003181.0, + "step": 7864 + }, + { + "epoch": 1.0005088411143621, + "ewc_loss": 0.03630710020661354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.815355062717572e-05, + "grad_norm": 25.897960662841797, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.859700620174408, + "num_tokens": 300040502.0, + "step": 7865 + }, + { + "epoch": 1.0006360513929526, + "ewc_loss": 0.03635621443390846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8178106984123588e-05, + "grad_norm": 25.765901565551758, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8760826587677002, + "num_tokens": 300078795.0, + "step": 7866 + }, + { + "epoch": 1.0007632616715432, + "ewc_loss": 0.03627479448914528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.813739800127223e-05, + "grad_norm": 25.92633819580078, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8713443875312805, + "num_tokens": 300115188.0, + "step": 7867 + }, + { + "epoch": 1.0008904719501335, + "ewc_loss": 0.03636348992586136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.818174496293068e-05, + "grad_norm": 25.81600570678711, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8629633188247681, + "num_tokens": 300158583.0, + "step": 7868 + }, + { + "epoch": 1.001017682228724, + "ewc_loss": 0.03634696453809738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8173483113059774e-05, + "grad_norm": 25.889341354370117, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8668054938316345, + "num_tokens": 300194415.0, + "step": 7869 + }, + { + "epoch": 1.0011448925073145, + "ewc_loss": 0.0363926962018013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8196347809862345e-05, + "grad_norm": 25.93131446838379, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8574007153511047, + "num_tokens": 300235066.0, + "step": 7870 + }, + { + "epoch": 1.001272102785905, + "ewc_loss": 
0.03631182760000229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8155913494410925e-05, + "grad_norm": 25.910886764526367, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8647710084915161, + "num_tokens": 300272167.0, + "step": 7871 + }, + { + "epoch": 1.0013993130644956, + "ewc_loss": 0.036321528255939484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.816076473915018e-05, + "grad_norm": 25.89879608154297, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8526330590248108, + "num_tokens": 300306032.0, + "step": 7872 + }, + { + "epoch": 1.0015265233430861, + "ewc_loss": 0.03630722314119339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.815361247281544e-05, + "grad_norm": 25.9156436920166, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8577727675437927, + "num_tokens": 300338433.0, + "step": 7873 + }, + { + "epoch": 1.0016537336216766, + "ewc_loss": 0.036316026002168655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8158012608182617e-05, + "grad_norm": 25.857818603515625, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.851330578327179, + "num_tokens": 300376527.0, + "step": 7874 + }, + { + "epoch": 1.0017809439002672, + "ewc_loss": 0.0363224558532238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8161228581448086e-05, + "grad_norm": 25.83868980407715, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8778629302978516, + "num_tokens": 300417309.0, + "step": 7875 + }, + { + "epoch": 1.0019081541788577, + "ewc_loss": 0.03632638230919838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.816319127101451e-05, + "grad_norm": 25.875898361206055, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8630077838897705, + "num_tokens": 300459438.0, + "step": 7876 + }, + { + "epoch": 1.0020353644574482, + "ewc_loss": 0.03629976138472557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8149879906559363e-05, + "grad_norm": 
25.89872169494629, + "learning_rate": 1e-06, + "loss": 0.5178, + "mean_token_accuracy": 0.8384045362472534, + "num_tokens": 300501443.0, + "step": 7877 + }, + { + "epoch": 1.0021625747360388, + "ewc_loss": 0.03628629073500633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8143146007787436e-05, + "grad_norm": 25.803619384765625, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8691234588623047, + "num_tokens": 300531918.0, + "step": 7878 + }, + { + "epoch": 1.0022897850146293, + "ewc_loss": 0.03627429157495499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.813714516174514e-05, + "grad_norm": 25.99742889404297, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8551648855209351, + "num_tokens": 300575084.0, + "step": 7879 + }, + { + "epoch": 1.0024169952932196, + "ewc_loss": 0.036374326795339584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8187163732363842e-05, + "grad_norm": 25.87607192993164, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8614866733551025, + "num_tokens": 300608737.0, + "step": 7880 + }, + { + "epoch": 1.0025442055718101, + "ewc_loss": 0.03629699721932411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8148499293602072e-05, + "grad_norm": 25.915191650390625, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.849888265132904, + "num_tokens": 300647855.0, + "step": 7881 + }, + { + "epoch": 1.0026714158504006, + "ewc_loss": 0.03640118986368179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8200595150119625e-05, + "grad_norm": 25.916162490844727, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8616310358047485, + "num_tokens": 300689763.0, + "step": 7882 + }, + { + "epoch": 1.0027986261289912, + "ewc_loss": 0.0363258458673954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8162922060582787e-05, + "grad_norm": 25.95965003967285, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8724672198295593, + 
"num_tokens": 300725838.0, + "step": 7883 + }, + { + "epoch": 1.0029258364075817, + "ewc_loss": 0.036292847245931625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8146423826692626e-05, + "grad_norm": 25.763776779174805, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8676178455352783, + "num_tokens": 300760423.0, + "step": 7884 + }, + { + "epoch": 1.0030530466861722, + "ewc_loss": 0.0363524854183197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8176242519984953e-05, + "grad_norm": 25.885066986083984, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8684861660003662, + "num_tokens": 300796236.0, + "step": 7885 + }, + { + "epoch": 1.0031802569647628, + "ewc_loss": 0.036370690912008286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8185344742960297e-05, + "grad_norm": 25.870716094970703, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8665608167648315, + "num_tokens": 300835213.0, + "step": 7886 + }, + { + "epoch": 1.0033074672433533, + "ewc_loss": 0.036360692232847214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8180346160079353e-05, + "grad_norm": 25.8997745513916, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8693331480026245, + "num_tokens": 300873363.0, + "step": 7887 + }, + { + "epoch": 1.0034346775219438, + "ewc_loss": 0.03639664873480797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8198325051344e-05, + "grad_norm": 25.820390701293945, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8573194742202759, + "num_tokens": 300910233.0, + "step": 7888 + }, + { + "epoch": 1.0035618878005343, + "ewc_loss": 0.036344464868307114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8172231648350134e-05, + "grad_norm": 25.941450119018555, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8698005080223083, + "num_tokens": 300946986.0, + "step": 7889 + }, + { + "epoch": 1.0036890980791249, + "ewc_loss": 0.036447037011384964, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.822351805458311e-05, + "grad_norm": 25.959394454956055, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8632800579071045, + "num_tokens": 300985750.0, + "step": 7890 + }, + { + "epoch": 1.0038163083577154, + "ewc_loss": 0.03632355108857155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8161776097258553e-05, + "grad_norm": 25.913007736206055, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8616295456886292, + "num_tokens": 301032820.0, + "step": 7891 + }, + { + "epoch": 1.0039435186363057, + "ewc_loss": 0.03629497438669205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.81474879354937e-05, + "grad_norm": 25.96868324279785, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8523648977279663, + "num_tokens": 301070825.0, + "step": 7892 + }, + { + "epoch": 1.0040707289148962, + "ewc_loss": 0.0363776758313179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8188837202615105e-05, + "grad_norm": 25.98920440673828, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8581403493881226, + "num_tokens": 301110764.0, + "step": 7893 + }, + { + "epoch": 1.0041979391934868, + "ewc_loss": 0.036272820085287094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8136410290026106e-05, + "grad_norm": 25.870315551757812, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8653776049613953, + "num_tokens": 301152421.0, + "step": 7894 + }, + { + "epoch": 1.0043251494720773, + "ewc_loss": 0.03629734367132187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.814867209759541e-05, + "grad_norm": 25.965530395507812, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8627718687057495, + "num_tokens": 301192780.0, + "step": 7895 + }, + { + "epoch": 1.0044523597506678, + "ewc_loss": 0.03631240501999855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.815620271372609e-05, + "grad_norm": 25.903165817260742, + 
"learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.859602153301239, + "num_tokens": 301228147.0, + "step": 7896 + }, + { + "epoch": 1.0045795700292584, + "ewc_loss": 0.03625568747520447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8127842849935405e-05, + "grad_norm": 25.906429290771484, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8719472885131836, + "num_tokens": 301259568.0, + "step": 7897 + }, + { + "epoch": 1.0047067803078489, + "ewc_loss": 0.03637279197573662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8186396118835546e-05, + "grad_norm": 26.113298416137695, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8628315329551697, + "num_tokens": 301296483.0, + "step": 7898 + }, + { + "epoch": 1.0048339905864394, + "ewc_loss": 0.03634447604417801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.817223892430775e-05, + "grad_norm": 25.865400314331055, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8647080659866333, + "num_tokens": 301337437.0, + "step": 7899 + }, + { + "epoch": 1.00496120086503, + "ewc_loss": 0.036250513046979904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8125256247003563e-05, + "grad_norm": 26.101097106933594, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8610812425613403, + "num_tokens": 301368777.0, + "step": 7900 + }, + { + "epoch": 1.0050884111436205, + "ewc_loss": 0.03634416311979294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8172082491219044e-05, + "grad_norm": 26.010311126708984, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8593626022338867, + "num_tokens": 301404885.0, + "step": 7901 + }, + { + "epoch": 1.005215621422211, + "ewc_loss": 0.03626853972673416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8134269339498132e-05, + "grad_norm": 25.98862075805664, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8512154817581177, + "num_tokens": 301445308.0, + 
"step": 7902 + }, + { + "epoch": 1.0053428317008015, + "ewc_loss": 0.036307454109191895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8153727069147862e-05, + "grad_norm": 26.01425552368164, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8584995269775391, + "num_tokens": 301484845.0, + "step": 7903 + }, + { + "epoch": 1.0054700419793918, + "ewc_loss": 0.03632168099284172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.816084113670513e-05, + "grad_norm": 25.97056007385254, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8592169284820557, + "num_tokens": 301524498.0, + "step": 7904 + }, + { + "epoch": 1.0055972522579824, + "ewc_loss": 0.03622501716017723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8112508769263513e-05, + "grad_norm": 25.901214599609375, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8673142194747925, + "num_tokens": 301565522.0, + "step": 7905 + }, + { + "epoch": 1.0057244625365729, + "ewc_loss": 0.03634883835911751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.81744198926026e-05, + "grad_norm": 25.934022903442383, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8570441007614136, + "num_tokens": 301603724.0, + "step": 7906 + }, + { + "epoch": 1.0058516728151634, + "ewc_loss": 0.03637699410319328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.818849705159664e-05, + "grad_norm": 26.12535858154297, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8596181869506836, + "num_tokens": 301636650.0, + "step": 7907 + }, + { + "epoch": 1.005978883093754, + "ewc_loss": 0.03635609149932861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8178045138483867e-05, + "grad_norm": 25.86932945251465, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8519067764282227, + "num_tokens": 301675328.0, + "step": 7908 + }, + { + "epoch": 1.0061060933723445, + "ewc_loss": 0.03629566356539726, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.814783172449097e-05, + "grad_norm": 26.038793563842773, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8545280694961548, + "num_tokens": 301710058.0, + "step": 7909 + }, + { + "epoch": 1.006233303650935, + "ewc_loss": 0.03634139150381088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8170696421293542e-05, + "grad_norm": 25.823057174682617, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8553502559661865, + "num_tokens": 301753567.0, + "step": 7910 + }, + { + "epoch": 1.0063605139295255, + "ewc_loss": 0.036292146891355515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.814607276173774e-05, + "grad_norm": 25.90477180480957, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.869152307510376, + "num_tokens": 301792895.0, + "step": 7911 + }, + { + "epoch": 1.006487724208116, + "ewc_loss": 0.03637842461466789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8189211914432235e-05, + "grad_norm": 25.86142921447754, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8733164668083191, + "num_tokens": 301833001.0, + "step": 7912 + }, + { + "epoch": 1.0066149344867066, + "ewc_loss": 0.03628158196806908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8140790416509844e-05, + "grad_norm": 25.83523178100586, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8657782077789307, + "num_tokens": 301871623.0, + "step": 7913 + }, + { + "epoch": 1.006742144765297, + "ewc_loss": 0.03637031838297844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8185159206041135e-05, + "grad_norm": 25.88797378540039, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8725637793540955, + "num_tokens": 301903750.0, + "step": 7914 + }, + { + "epoch": 1.0068693550438876, + "ewc_loss": 0.03638807311654091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.819403587433044e-05, + "grad_norm": 25.938518524169922, + "learning_rate": 1e-06, + "loss": 0.4325, + 
"mean_token_accuracy": 0.8635339736938477, + "num_tokens": 301943079.0, + "step": 7915 + }, + { + "epoch": 1.0069965653224782, + "ewc_loss": 0.036419689655303955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8209844711236656e-05, + "grad_norm": 25.971176147460938, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8666253089904785, + "num_tokens": 301980031.0, + "step": 7916 + }, + { + "epoch": 1.0071237756010685, + "ewc_loss": 0.03640809282660484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.820404577301815e-05, + "grad_norm": 26.014450073242188, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8609834909439087, + "num_tokens": 302020454.0, + "step": 7917 + }, + { + "epoch": 1.007250985879659, + "ewc_loss": 0.036356255412101746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8178126993007027e-05, + "grad_norm": 25.998334884643555, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8558593392372131, + "num_tokens": 302064391.0, + "step": 7918 + }, + { + "epoch": 1.0073781961582495, + "ewc_loss": 0.036395736038684845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.819786848500371e-05, + "grad_norm": 25.95700454711914, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8619450330734253, + "num_tokens": 302101598.0, + "step": 7919 + }, + { + "epoch": 1.00750540643684, + "ewc_loss": 0.03636321425437927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.818160671973601e-05, + "grad_norm": 26.04717445373535, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8663005232810974, + "num_tokens": 302138803.0, + "step": 7920 + }, + { + "epoch": 1.0076326167154306, + "ewc_loss": 0.0363212451338768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8160622857976705e-05, + "grad_norm": 25.82379722595215, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8593850135803223, + "num_tokens": 302173682.0, + "step": 7921 + }, + { + "epoch": 
1.0077598269940211, + "ewc_loss": 0.03630753234028816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.815376708691474e-05, + "grad_norm": 25.992582321166992, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8590608239173889, + "num_tokens": 302207390.0, + "step": 7922 + }, + { + "epoch": 1.0078870372726116, + "ewc_loss": 0.036396726965904236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8198363250121474e-05, + "grad_norm": 25.937015533447266, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8674708604812622, + "num_tokens": 302244440.0, + "step": 7923 + }, + { + "epoch": 1.0080142475512022, + "ewc_loss": 0.036338239908218384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8169119357480668e-05, + "grad_norm": 25.915176391601562, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8565496206283569, + "num_tokens": 302281058.0, + "step": 7924 + }, + { + "epoch": 1.0081414578297927, + "ewc_loss": 0.03641156479716301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8205782907898538e-05, + "grad_norm": 25.984458923339844, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8786150217056274, + "num_tokens": 302319586.0, + "step": 7925 + }, + { + "epoch": 1.0082686681083832, + "ewc_loss": 0.036397334188222885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8198667021351866e-05, + "grad_norm": 25.909954071044922, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8578146696090698, + "num_tokens": 302361356.0, + "step": 7926 + }, + { + "epoch": 1.0083958783869738, + "ewc_loss": 0.0363086573779583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8154329154640436e-05, + "grad_norm": 25.906574249267578, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8710383176803589, + "num_tokens": 302399974.0, + "step": 7927 + }, + { + "epoch": 1.0085230886655643, + "ewc_loss": 0.03645768389105797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.822884223656729e-05, + "grad_norm": 26.012725830078125, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.86474609375, + "num_tokens": 302435709.0, + "step": 7928 + }, + { + "epoch": 1.0086502989441546, + "ewc_loss": 0.03639066219329834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8195330994785763e-05, + "grad_norm": 25.87982177734375, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8583515882492065, + "num_tokens": 302474151.0, + "step": 7929 + }, + { + "epoch": 1.0087775092227451, + "ewc_loss": 0.03645104542374611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8225522580905817e-05, + "grad_norm": 26.081693649291992, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8637232780456543, + "num_tokens": 302509877.0, + "step": 7930 + }, + { + "epoch": 1.0089047195013356, + "ewc_loss": 0.036446232348680496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8223116057924926e-05, + "grad_norm": 25.88519859313965, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.876950204372406, + "num_tokens": 302547689.0, + "step": 7931 + }, + { + "epoch": 1.0090319297799262, + "ewc_loss": 0.036363616585731506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8181808627559803e-05, + "grad_norm": 26.023086547851562, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8602185249328613, + "num_tokens": 302590728.0, + "step": 7932 + }, + { + "epoch": 1.0091591400585167, + "ewc_loss": 0.03647495061159134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8237475160276517e-05, + "grad_norm": 25.88114356994629, + "learning_rate": 1e-06, + "loss": 0.5182, + "mean_token_accuracy": 0.8389284610748291, + "num_tokens": 302625709.0, + "step": 7933 + }, + { + "epoch": 1.0092863503371072, + "ewc_loss": 0.03634365648031235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8171827832702547e-05, + "grad_norm": 25.98253059387207, + "learning_rate": 1e-06, + "loss": 0.4247, + 
"mean_token_accuracy": 0.8693079948425293, + "num_tokens": 302662086.0, + "step": 7934 + }, + { + "epoch": 1.0094135606156978, + "ewc_loss": 0.036442238837480545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8221118807559833e-05, + "grad_norm": 25.857036590576172, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8786347508430481, + "num_tokens": 302703680.0, + "step": 7935 + }, + { + "epoch": 1.0095407708942883, + "ewc_loss": 0.03643408045172691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8217040633317083e-05, + "grad_norm": 25.970388412475586, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8470249772071838, + "num_tokens": 302740426.0, + "step": 7936 + }, + { + "epoch": 1.0096679811728788, + "ewc_loss": 0.0364399328827858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8219965568277985e-05, + "grad_norm": 25.923429489135742, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8746445178985596, + "num_tokens": 302780778.0, + "step": 7937 + }, + { + "epoch": 1.0097951914514693, + "ewc_loss": 0.036417536437511444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8208767869509757e-05, + "grad_norm": 25.93695831298828, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8659050464630127, + "num_tokens": 302816012.0, + "step": 7938 + }, + { + "epoch": 1.0099224017300599, + "ewc_loss": 0.03644167259335518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8220836864202283e-05, + "grad_norm": 25.904001235961914, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.839664101600647, + "num_tokens": 302858325.0, + "step": 7939 + }, + { + "epoch": 1.0100496120086504, + "ewc_loss": 0.03649036958813667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8245184037368745e-05, + "grad_norm": 26.029024124145508, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8626075387001038, + "num_tokens": 302898210.0, + "step": 7940 + }, + { + "epoch": 
1.0101768222872407, + "ewc_loss": 0.0363864004611969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.819320095819421e-05, + "grad_norm": 25.836624145507812, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8627393841743469, + "num_tokens": 302936072.0, + "step": 7941 + }, + { + "epoch": 1.0103040325658312, + "ewc_loss": 0.03644062578678131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8220312995254062e-05, + "grad_norm": 26.040210723876953, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8537054061889648, + "num_tokens": 302978941.0, + "step": 7942 + }, + { + "epoch": 1.0104312428444218, + "ewc_loss": 0.03648336976766586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8241684301756322e-05, + "grad_norm": 25.940242767333984, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8547987937927246, + "num_tokens": 303016664.0, + "step": 7943 + }, + { + "epoch": 1.0105584531230123, + "ewc_loss": 0.036421626806259155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8210814232588746e-05, + "grad_norm": 26.006914138793945, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8572384119033813, + "num_tokens": 303063313.0, + "step": 7944 + }, + { + "epoch": 1.0106856634016028, + "ewc_loss": 0.036463748663663864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8231874491903e-05, + "grad_norm": 25.980552673339844, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8523335456848145, + "num_tokens": 303103786.0, + "step": 7945 + }, + { + "epoch": 1.0108128736801933, + "ewc_loss": 0.03635482117533684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.817741031118203e-05, + "grad_norm": 26.066055297851562, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8669779300689697, + "num_tokens": 303136928.0, + "step": 7946 + }, + { + "epoch": 1.0109400839587839, + "ewc_loss": 0.03641211986541748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8206059394287877e-05, + "grad_norm": 25.977994918823242, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8691138029098511, + "num_tokens": 303174854.0, + "step": 7947 + }, + { + "epoch": 1.0110672942373744, + "ewc_loss": 0.03639302775263786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8196513337898068e-05, + "grad_norm": 25.973312377929688, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8593111634254456, + "num_tokens": 303214234.0, + "step": 7948 + }, + { + "epoch": 1.011194504515965, + "ewc_loss": 0.036394741386175156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8197370081907138e-05, + "grad_norm": 26.01250648498535, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8751060962677002, + "num_tokens": 303251830.0, + "step": 7949 + }, + { + "epoch": 1.0113217147945555, + "ewc_loss": 0.036392468959093094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8196235032519326e-05, + "grad_norm": 26.09319305419922, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8517430424690247, + "num_tokens": 303291564.0, + "step": 7950 + }, + { + "epoch": 1.011448925073146, + "ewc_loss": 0.036362096667289734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.818104828998912e-05, + "grad_norm": 25.966272354125977, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8590517044067383, + "num_tokens": 303328642.0, + "step": 7951 + }, + { + "epoch": 1.0115761353517365, + "ewc_loss": 0.036318015307188034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8159007595386356e-05, + "grad_norm": 26.017702102661133, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8595987558364868, + "num_tokens": 303373693.0, + "step": 7952 + }, + { + "epoch": 1.0117033456303268, + "ewc_loss": 0.03645041957497597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8225209714728408e-05, + "grad_norm": 26.019561767578125, + "learning_rate": 1e-06, + "loss": 0.4281, + 
"mean_token_accuracy": 0.8655558824539185, + "num_tokens": 303407124.0, + "step": 7953 + }, + { + "epoch": 1.0118305559089174, + "ewc_loss": 0.03631976619362831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.815988252928946e-05, + "grad_norm": 25.971677780151367, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.874038815498352, + "num_tokens": 303441586.0, + "step": 7954 + }, + { + "epoch": 1.0119577661875079, + "ewc_loss": 0.036417387425899506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.820869329094421e-05, + "grad_norm": 25.896055221557617, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8631678819656372, + "num_tokens": 303473593.0, + "step": 7955 + }, + { + "epoch": 1.0120849764660984, + "ewc_loss": 0.03634214773774147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.817107477108948e-05, + "grad_norm": 26.01030731201172, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8583869934082031, + "num_tokens": 303510239.0, + "step": 7956 + }, + { + "epoch": 1.012212186744689, + "ewc_loss": 0.036482907831668854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8241453290102072e-05, + "grad_norm": 26.03902816772461, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8600084781646729, + "num_tokens": 303544709.0, + "step": 7957 + }, + { + "epoch": 1.0123393970232795, + "ewc_loss": 0.03639739006757736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.819869430619292e-05, + "grad_norm": 25.858566284179688, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8725467324256897, + "num_tokens": 303581990.0, + "step": 7958 + }, + { + "epoch": 1.01246660730187, + "ewc_loss": 0.03648620843887329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8243104932480492e-05, + "grad_norm": 25.99260902404785, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8637855052947998, + "num_tokens": 303625167.0, + "step": 7959 + }, + { + "epoch": 
1.0125938175804605, + "ewc_loss": 0.03650597110390663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8252985682920553e-05, + "grad_norm": 26.01021385192871, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.861894965171814, + "num_tokens": 303664751.0, + "step": 7960 + }, + { + "epoch": 1.012721027859051, + "ewc_loss": 0.036422550678253174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8211274436907843e-05, + "grad_norm": 25.91730499267578, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8601706624031067, + "num_tokens": 303702190.0, + "step": 7961 + }, + { + "epoch": 1.0128482381376416, + "ewc_loss": 0.036479830741882324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8239916244056076e-05, + "grad_norm": 25.871397018432617, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8765822649002075, + "num_tokens": 303742872.0, + "step": 7962 + }, + { + "epoch": 1.012975448416232, + "ewc_loss": 0.03653648495674133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8268243366037495e-05, + "grad_norm": 26.075223922729492, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8564289808273315, + "num_tokens": 303781265.0, + "step": 7963 + }, + { + "epoch": 1.0131026586948226, + "ewc_loss": 0.03650963678956032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8254819224239327e-05, + "grad_norm": 25.929332733154297, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.863248884677887, + "num_tokens": 303820686.0, + "step": 7964 + }, + { + "epoch": 1.0132298689734132, + "ewc_loss": 0.03642109036445618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.821054502215702e-05, + "grad_norm": 25.855138778686523, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8650301098823547, + "num_tokens": 303863428.0, + "step": 7965 + }, + { + "epoch": 1.0133570792520035, + "ewc_loss": 0.036539822816848755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8269911379320547e-05, + "grad_norm": 26.068044662475586, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.85762619972229, + "num_tokens": 303903847.0, + "step": 7966 + }, + { + "epoch": 1.013484289530594, + "ewc_loss": 0.036544181406497955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8272090528625995e-05, + "grad_norm": 26.006078720092773, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8540310859680176, + "num_tokens": 303943083.0, + "step": 7967 + }, + { + "epoch": 1.0136114998091845, + "ewc_loss": 0.03654313459992409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8271566659677774e-05, + "grad_norm": 26.052230834960938, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8707199096679688, + "num_tokens": 303978773.0, + "step": 7968 + }, + { + "epoch": 1.013738710087775, + "ewc_loss": 0.036500729620456696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8250364519190043e-05, + "grad_norm": 25.968334197998047, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8698214888572693, + "num_tokens": 304017058.0, + "step": 7969 + }, + { + "epoch": 1.0138659203663656, + "ewc_loss": 0.03642737492918968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8213688235846348e-05, + "grad_norm": 25.951486587524414, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.853954553604126, + "num_tokens": 304054104.0, + "step": 7970 + }, + { + "epoch": 1.013993130644956, + "ewc_loss": 0.036505214869976044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8252607333124615e-05, + "grad_norm": 26.00997543334961, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.875529408454895, + "num_tokens": 304091100.0, + "step": 7971 + }, + { + "epoch": 1.0141203409235466, + "ewc_loss": 0.0364663265645504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8233164155390114e-05, + "grad_norm": 25.966894149780273, + "learning_rate": 1e-06, + "loss": 0.4276, + 
"mean_token_accuracy": 0.8631392121315002, + "num_tokens": 304127747.0, + "step": 7972 + }, + { + "epoch": 1.0142475512021372, + "ewc_loss": 0.03642596676945686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8212982467957772e-05, + "grad_norm": 25.881603240966797, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8632596731185913, + "num_tokens": 304171089.0, + "step": 7973 + }, + { + "epoch": 1.0143747614807277, + "ewc_loss": 0.03641284629702568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8206423192168586e-05, + "grad_norm": 25.854602813720703, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8537112474441528, + "num_tokens": 304210207.0, + "step": 7974 + }, + { + "epoch": 1.0145019717593182, + "ewc_loss": 0.036489494144916534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8244747479911894e-05, + "grad_norm": 25.896320343017578, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8796682357788086, + "num_tokens": 304249423.0, + "step": 7975 + }, + { + "epoch": 1.0146291820379088, + "ewc_loss": 0.03652670606970787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8263353922520764e-05, + "grad_norm": 25.921911239624023, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8663605451583862, + "num_tokens": 304286872.0, + "step": 7976 + }, + { + "epoch": 1.0147563923164993, + "ewc_loss": 0.03643936291337013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8219681805931032e-05, + "grad_norm": 25.89023208618164, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8669806718826294, + "num_tokens": 304323033.0, + "step": 7977 + }, + { + "epoch": 1.0148836025950896, + "ewc_loss": 0.03653800114989281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.826900006562937e-05, + "grad_norm": 25.958641052246094, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8644250631332397, + "num_tokens": 304360096.0, + "step": 7978 + }, + { + "epoch": 
1.0150108128736801, + "ewc_loss": 0.036517348140478134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.825867366278544e-05, + "grad_norm": 25.94136619567871, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8828078508377075, + "num_tokens": 304396364.0, + "step": 7979 + }, + { + "epoch": 1.0151380231522706, + "ewc_loss": 0.036503951996564865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8251976143801585e-05, + "grad_norm": 25.892398834228516, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8644469976425171, + "num_tokens": 304431583.0, + "step": 7980 + }, + { + "epoch": 1.0152652334308612, + "ewc_loss": 0.03653336316347122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.826668085413985e-05, + "grad_norm": 26.07401466369629, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.85649174451828, + "num_tokens": 304470531.0, + "step": 7981 + }, + { + "epoch": 1.0153924437094517, + "ewc_loss": 0.03654560446739197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.827280175348278e-05, + "grad_norm": 25.99077606201172, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8670743703842163, + "num_tokens": 304505187.0, + "step": 7982 + }, + { + "epoch": 1.0155196539880422, + "ewc_loss": 0.036444902420043945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8222452126792632e-05, + "grad_norm": 25.971309661865234, + "learning_rate": 1e-06, + "loss": 0.5108, + "mean_token_accuracy": 0.8404140472412109, + "num_tokens": 304542685.0, + "step": 7983 + }, + { + "epoch": 1.0156468642666328, + "ewc_loss": 0.03659455105662346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.829727625590749e-05, + "grad_norm": 25.905559539794922, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8615639209747314, + "num_tokens": 304579204.0, + "step": 7984 + }, + { + "epoch": 1.0157740745452233, + "ewc_loss": 0.03646808862686157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.823404454626143e-05, + "grad_norm": 26.01483917236328, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.869200587272644, + "num_tokens": 304615224.0, + "step": 7985 + }, + { + "epoch": 1.0159012848238138, + "ewc_loss": 0.03657441586256027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8287208149558865e-05, + "grad_norm": 26.081279754638672, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8575316071510315, + "num_tokens": 304653513.0, + "step": 7986 + }, + { + "epoch": 1.0160284951024043, + "ewc_loss": 0.03646362945437431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8231814465252683e-05, + "grad_norm": 25.86971092224121, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8673111200332642, + "num_tokens": 304686744.0, + "step": 7987 + }, + { + "epoch": 1.0161557053809949, + "ewc_loss": 0.03649266064167023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.824633000069298e-05, + "grad_norm": 25.975236892700195, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8624552488327026, + "num_tokens": 304727262.0, + "step": 7988 + }, + { + "epoch": 1.0162829156595854, + "ewc_loss": 0.0365438312292099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8271915905643255e-05, + "grad_norm": 25.94044303894043, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8602738380432129, + "num_tokens": 304763418.0, + "step": 7989 + }, + { + "epoch": 1.0164101259381757, + "ewc_loss": 0.03649742901325226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8248714695801027e-05, + "grad_norm": 26.016685485839844, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8661328554153442, + "num_tokens": 304800763.0, + "step": 7990 + }, + { + "epoch": 1.0165373362167662, + "ewc_loss": 0.036557480692863464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.827874075388536e-05, + "grad_norm": 26.008251190185547, + "learning_rate": 1e-06, + "loss": 0.3744, + 
"mean_token_accuracy": 0.8810458183288574, + "num_tokens": 304833136.0, + "step": 7991 + }, + { + "epoch": 1.0166645464953568, + "ewc_loss": 0.03650369867682457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8251848814543337e-05, + "grad_norm": 25.980907440185547, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8714812397956848, + "num_tokens": 304873208.0, + "step": 7992 + }, + { + "epoch": 1.0167917567739473, + "ewc_loss": 0.03653166815638542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8265833205077797e-05, + "grad_norm": 25.924394607543945, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8633392453193665, + "num_tokens": 304914237.0, + "step": 7993 + }, + { + "epoch": 1.0169189670525378, + "ewc_loss": 0.03656250610947609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8281252778251655e-05, + "grad_norm": 26.066816329956055, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8782966136932373, + "num_tokens": 304955453.0, + "step": 7994 + }, + { + "epoch": 1.0170461773311283, + "ewc_loss": 0.03655833378434181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8279166397405788e-05, + "grad_norm": 26.022884368896484, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8629137277603149, + "num_tokens": 304990533.0, + "step": 7995 + }, + { + "epoch": 1.0171733876097189, + "ewc_loss": 0.03653788939118385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8268945495947264e-05, + "grad_norm": 26.05339241027832, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8615636229515076, + "num_tokens": 305024849.0, + "step": 7996 + }, + { + "epoch": 1.0173005978883094, + "ewc_loss": 0.036536864936351776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8268432540935464e-05, + "grad_norm": 26.03491973876953, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8598992824554443, + "num_tokens": 305061615.0, + "step": 7997 + }, + { + "epoch": 
1.0174278081669, + "ewc_loss": 0.036501459777355194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8250730136060156e-05, + "grad_norm": 25.92607307434082, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8435269594192505, + "num_tokens": 305102251.0, + "step": 7998 + }, + { + "epoch": 1.0175550184454905, + "ewc_loss": 0.03652346506714821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8261733202962205e-05, + "grad_norm": 26.019193649291992, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8720293045043945, + "num_tokens": 305135405.0, + "step": 7999 + }, + { + "epoch": 1.017682228724081, + "ewc_loss": 0.03652800992131233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8264005120727234e-05, + "grad_norm": 26.01610565185547, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8620679378509521, + "num_tokens": 305169005.0, + "step": 8000 + }, + { + "epoch": 1.0178094390026715, + "ewc_loss": 0.036493003368377686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8246500985696912e-05, + "grad_norm": 25.92466926574707, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8587008118629456, + "num_tokens": 305209050.0, + "step": 8001 + }, + { + "epoch": 1.0179366492812618, + "ewc_loss": 0.03653049096465111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8265245671500452e-05, + "grad_norm": 26.023468017578125, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8631153702735901, + "num_tokens": 305238250.0, + "step": 8002 + }, + { + "epoch": 1.0180638595598523, + "ewc_loss": 0.03659701719880104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.829850771173369e-05, + "grad_norm": 25.98196029663086, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8667429089546204, + "num_tokens": 305283544.0, + "step": 8003 + }, + { + "epoch": 1.0181910698384429, + "ewc_loss": 0.0365675613284111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8283781173522584e-05, + "grad_norm": 26.05857276916504, + "learning_rate": 1e-06, + "loss": 0.4941, + "mean_token_accuracy": 0.8459656238555908, + "num_tokens": 305320548.0, + "step": 8004 + }, + { + "epoch": 1.0183182801170334, + "ewc_loss": 0.03655894845724106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8279473806614988e-05, + "grad_norm": 26.02717399597168, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8690686225891113, + "num_tokens": 305359560.0, + "step": 8005 + }, + { + "epoch": 1.018445490395624, + "ewc_loss": 0.03654122352600098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8270611690240912e-05, + "grad_norm": 25.949443817138672, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8595492839813232, + "num_tokens": 305396680.0, + "step": 8006 + }, + { + "epoch": 1.0185727006742145, + "ewc_loss": 0.03654403239488602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.827201595006045e-05, + "grad_norm": 26.029098510742188, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8723649978637695, + "num_tokens": 305432856.0, + "step": 8007 + }, + { + "epoch": 1.018699910952805, + "ewc_loss": 0.03662203624844551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.831101872085128e-05, + "grad_norm": 26.046409606933594, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8647263646125793, + "num_tokens": 305469973.0, + "step": 8008 + }, + { + "epoch": 1.0188271212313955, + "ewc_loss": 0.03659076243638992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8295380868948996e-05, + "grad_norm": 26.022449493408203, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8694285154342651, + "num_tokens": 305505576.0, + "step": 8009 + }, + { + "epoch": 1.018954331509986, + "ewc_loss": 0.036552026867866516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8276014088769443e-05, + "grad_norm": 26.07118034362793, + "learning_rate": 1e-06, + "loss": 0.5209, + 
"mean_token_accuracy": 0.8436200618743896, + "num_tokens": 305542037.0, + "step": 8010 + }, + { + "epoch": 1.0190815417885766, + "ewc_loss": 0.03655582666397095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8277913113706745e-05, + "grad_norm": 25.915189743041992, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8717128038406372, + "num_tokens": 305587784.0, + "step": 8011 + }, + { + "epoch": 1.019208752067167, + "ewc_loss": 0.03649681434035301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8248407286591828e-05, + "grad_norm": 25.980684280395508, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8562772870063782, + "num_tokens": 305621888.0, + "step": 8012 + }, + { + "epoch": 1.0193359623457576, + "ewc_loss": 0.03662547096610069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8312735846848227e-05, + "grad_norm": 25.980649948120117, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8818491697311401, + "num_tokens": 305663844.0, + "step": 8013 + }, + { + "epoch": 1.0194631726243482, + "ewc_loss": 0.03658158332109451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.829079155868385e-05, + "grad_norm": 25.9870662689209, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8769083023071289, + "num_tokens": 305695913.0, + "step": 8014 + }, + { + "epoch": 1.0195903829029385, + "ewc_loss": 0.03653573989868164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.826787047320977e-05, + "grad_norm": 26.042896270751953, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8614339828491211, + "num_tokens": 305733632.0, + "step": 8015 + }, + { + "epoch": 1.019717593181529, + "ewc_loss": 0.036541420966386795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8270709915668704e-05, + "grad_norm": 25.994258880615234, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8479357957839966, + "num_tokens": 305766917.0, + "step": 8016 + }, + { + "epoch": 
1.0198448034601195, + "ewc_loss": 0.03662043437361717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8310216546524316e-05, + "grad_norm": 26.068132400512695, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8589248657226562, + "num_tokens": 305805768.0, + "step": 8017 + }, + { + "epoch": 1.01997201373871, + "ewc_loss": 0.03654317185282707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8271586668561213e-05, + "grad_norm": 26.020158767700195, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8636396527290344, + "num_tokens": 305843412.0, + "step": 8018 + }, + { + "epoch": 1.0200992240173006, + "ewc_loss": 0.03653659671545029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8268297935719602e-05, + "grad_norm": 25.940690994262695, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8785449862480164, + "num_tokens": 305880523.0, + "step": 8019 + }, + { + "epoch": 1.020226434295891, + "ewc_loss": 0.03654950484633446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8274751710123383e-05, + "grad_norm": 25.956493377685547, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8651896119117737, + "num_tokens": 305915379.0, + "step": 8020 + }, + { + "epoch": 1.0203536445744816, + "ewc_loss": 0.03658854216337204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8294271285412833e-05, + "grad_norm": 26.017484664916992, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8765054941177368, + "num_tokens": 305952471.0, + "step": 8021 + }, + { + "epoch": 1.0204808548530722, + "ewc_loss": 0.03655456006526947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.827728010539431e-05, + "grad_norm": 25.981672286987305, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8621143102645874, + "num_tokens": 305994023.0, + "step": 8022 + }, + { + "epoch": 1.0206080651316627, + "ewc_loss": 0.03660748898983002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.83037445822265e-05, + "grad_norm": 26.079999923706055, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8676307201385498, + "num_tokens": 306026563.0, + "step": 8023 + }, + { + "epoch": 1.0207352754102532, + "ewc_loss": 0.03658657521009445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8293287212145515e-05, + "grad_norm": 25.95301055908203, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8594903349876404, + "num_tokens": 306067913.0, + "step": 8024 + }, + { + "epoch": 1.0208624856888437, + "ewc_loss": 0.036518048495054245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8259024727740325e-05, + "grad_norm": 26.06631851196289, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8598413467407227, + "num_tokens": 306108692.0, + "step": 8025 + }, + { + "epoch": 1.0209896959674343, + "ewc_loss": 0.03664793446660042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8323968106415123e-05, + "grad_norm": 25.97759246826172, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8635474443435669, + "num_tokens": 306148082.0, + "step": 8026 + }, + { + "epoch": 1.0211169062460246, + "ewc_loss": 0.036487262696027756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8243630620418116e-05, + "grad_norm": 25.985164642333984, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8711956739425659, + "num_tokens": 306182800.0, + "step": 8027 + }, + { + "epoch": 1.021244116524615, + "ewc_loss": 0.03668121621012688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.834060822147876e-05, + "grad_norm": 26.098997116088867, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8601300120353699, + "num_tokens": 306226741.0, + "step": 8028 + }, + { + "epoch": 1.0213713268032056, + "ewc_loss": 0.03654471039772034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8272356101078913e-05, + "grad_norm": 25.923763275146484, + "learning_rate": 1e-06, + "loss": 0.4414, + 
"mean_token_accuracy": 0.8609795570373535, + "num_tokens": 306265993.0, + "step": 8029 + }, + { + "epoch": 1.0214985370817962, + "ewc_loss": 0.036601465195417404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8300732335774228e-05, + "grad_norm": 26.13643455505371, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.867398202419281, + "num_tokens": 306306511.0, + "step": 8030 + }, + { + "epoch": 1.0216257473603867, + "ewc_loss": 0.03662142530083656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8310713130631484e-05, + "grad_norm": 26.036304473876953, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8772943019866943, + "num_tokens": 306337687.0, + "step": 8031 + }, + { + "epoch": 1.0217529576389772, + "ewc_loss": 0.03652244806289673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8261223885929212e-05, + "grad_norm": 26.011268615722656, + "learning_rate": 1e-06, + "loss": 0.5419, + "mean_token_accuracy": 0.8280031681060791, + "num_tokens": 306379646.0, + "step": 8032 + }, + { + "epoch": 1.0218801679175677, + "ewc_loss": 0.03659631684422493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.829815846576821e-05, + "grad_norm": 26.042476654052734, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8627967238426208, + "num_tokens": 306415389.0, + "step": 8033 + }, + { + "epoch": 1.0220073781961583, + "ewc_loss": 0.036507416516542435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.825370782171376e-05, + "grad_norm": 26.109636306762695, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8572018146514893, + "num_tokens": 306452035.0, + "step": 8034 + }, + { + "epoch": 1.0221345884747488, + "ewc_loss": 0.03655115142464638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.827557571232319e-05, + "grad_norm": 26.030752182006836, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8623392581939697, + "num_tokens": 306496277.0, + "step": 8035 + }, + { + "epoch": 
1.0222617987533393, + "ewc_loss": 0.03648627921938896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.824313949327916e-05, + "grad_norm": 25.85395622253418, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8683040142059326, + "num_tokens": 306528212.0, + "step": 8036 + }, + { + "epoch": 1.0223890090319299, + "ewc_loss": 0.03650980815291405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8254904716741294e-05, + "grad_norm": 26.06854820251465, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8673744201660156, + "num_tokens": 306562047.0, + "step": 8037 + }, + { + "epoch": 1.0225162193105204, + "ewc_loss": 0.03664255887269974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8321279640076682e-05, + "grad_norm": 26.0452880859375, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8532528877258301, + "num_tokens": 306601298.0, + "step": 8038 + }, + { + "epoch": 1.0226434295891107, + "ewc_loss": 0.03646035119891167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8230175555800088e-05, + "grad_norm": 25.948490142822266, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8628267645835876, + "num_tokens": 306640961.0, + "step": 8039 + }, + { + "epoch": 1.0227706398677012, + "ewc_loss": 0.036621734499931335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8310867744730785e-05, + "grad_norm": 26.054929733276367, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8814838528633118, + "num_tokens": 306679507.0, + "step": 8040 + }, + { + "epoch": 1.0228978501462918, + "ewc_loss": 0.036584120243787766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.829205939429812e-05, + "grad_norm": 26.015880584716797, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.854156494140625, + "num_tokens": 306717337.0, + "step": 8041 + }, + { + "epoch": 1.0230250604248823, + "ewc_loss": 0.036620140075683594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.831006920838263e-05, + "grad_norm": 26.126394271850586, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8364341259002686, + "num_tokens": 306754152.0, + "step": 8042 + }, + { + "epoch": 1.0231522707034728, + "ewc_loss": 0.03660952299833298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8304761397303082e-05, + "grad_norm": 26.023452758789062, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8559242486953735, + "num_tokens": 306794887.0, + "step": 8043 + }, + { + "epoch": 1.0232794809820633, + "ewc_loss": 0.036562301218509674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8281150914845057e-05, + "grad_norm": 25.994304656982422, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8670130968093872, + "num_tokens": 306828374.0, + "step": 8044 + }, + { + "epoch": 1.0234066912606539, + "ewc_loss": 0.03661169111728668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8305845514987595e-05, + "grad_norm": 25.946775436401367, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8672506809234619, + "num_tokens": 306865285.0, + "step": 8045 + }, + { + "epoch": 1.0235339015392444, + "ewc_loss": 0.03656496852636337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8282484234077856e-05, + "grad_norm": 25.994304656982422, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8687804937362671, + "num_tokens": 306897195.0, + "step": 8046 + }, + { + "epoch": 1.023661111817835, + "ewc_loss": 0.03666606917977333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8333033949602395e-05, + "grad_norm": 26.082780838012695, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8612762689590454, + "num_tokens": 306935352.0, + "step": 8047 + }, + { + "epoch": 1.0237883220964255, + "ewc_loss": 0.036675550043582916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8337774235988036e-05, + "grad_norm": 25.977060317993164, + "learning_rate": 1e-06, + "loss": 0.4194, + 
"mean_token_accuracy": 0.8677493929862976, + "num_tokens": 306964621.0, + "step": 8048 + }, + { + "epoch": 1.023915532375016, + "ewc_loss": 0.03664088249206543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8320441085961647e-05, + "grad_norm": 25.941097259521484, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8612272143363953, + "num_tokens": 307003451.0, + "step": 8049 + }, + { + "epoch": 1.0240427426536065, + "ewc_loss": 0.0366881862282753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8344093405175954e-05, + "grad_norm": 25.976566314697266, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8646582365036011, + "num_tokens": 307044695.0, + "step": 8050 + }, + { + "epoch": 1.0241699529321968, + "ewc_loss": 0.036703478544950485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8351738617639057e-05, + "grad_norm": 26.05504608154297, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.870025634765625, + "num_tokens": 307080603.0, + "step": 8051 + }, + { + "epoch": 1.0242971632107873, + "ewc_loss": 0.03671159967780113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8355800420977175e-05, + "grad_norm": 26.048871994018555, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8563328385353088, + "num_tokens": 307122300.0, + "step": 8052 + }, + { + "epoch": 1.0244243734893779, + "ewc_loss": 0.03665557876229286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8327789803151973e-05, + "grad_norm": 25.994083404541016, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8693432807922363, + "num_tokens": 307158685.0, + "step": 8053 + }, + { + "epoch": 1.0245515837679684, + "ewc_loss": 0.03671722486615181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8358612578595057e-05, + "grad_norm": 26.0860538482666, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8698961734771729, + "num_tokens": 307199104.0, + "step": 8054 + }, + { + "epoch": 
1.024678794046559, + "ewc_loss": 0.036639001220464706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8319500668440014e-05, + "grad_norm": 25.990659713745117, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8556410074234009, + "num_tokens": 307235840.0, + "step": 8055 + }, + { + "epoch": 1.0248060043251495, + "ewc_loss": 0.036689821630716324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8344910131418146e-05, + "grad_norm": 26.107309341430664, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8527815937995911, + "num_tokens": 307273089.0, + "step": 8056 + }, + { + "epoch": 1.02493321460374, + "ewc_loss": 0.036772508174180984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.838625394157134e-05, + "grad_norm": 25.982891082763672, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8644038438796997, + "num_tokens": 307318825.0, + "step": 8057 + }, + { + "epoch": 1.0250604248823305, + "ewc_loss": 0.03665335103869438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.83266747626476e-05, + "grad_norm": 26.165138244628906, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8704997301101685, + "num_tokens": 307357006.0, + "step": 8058 + }, + { + "epoch": 1.025187635160921, + "ewc_loss": 0.036675553768873215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.833777605497744e-05, + "grad_norm": 26.085811614990234, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8618959188461304, + "num_tokens": 307390274.0, + "step": 8059 + }, + { + "epoch": 1.0253148454395116, + "ewc_loss": 0.03667876496911049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.833938222262077e-05, + "grad_norm": 25.99224853515625, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8716052770614624, + "num_tokens": 307427157.0, + "step": 8060 + }, + { + "epoch": 1.025442055718102, + "ewc_loss": 0.036669716238975525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8334858395974152e-05, + "grad_norm": 26.047189712524414, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.859780490398407, + "num_tokens": 307466555.0, + "step": 8061 + }, + { + "epoch": 1.0255692659966926, + "ewc_loss": 0.03675009310245514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8375047147856094e-05, + "grad_norm": 26.096467971801758, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8553266525268555, + "num_tokens": 307503375.0, + "step": 8062 + }, + { + "epoch": 1.0256964762752832, + "ewc_loss": 0.03667508438229561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8337541405344382e-05, + "grad_norm": 26.075088500976562, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8734310865402222, + "num_tokens": 307543170.0, + "step": 8063 + }, + { + "epoch": 1.0258236865538735, + "ewc_loss": 0.036689065396785736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8344533600611612e-05, + "grad_norm": 26.031051635742188, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8596622943878174, + "num_tokens": 307582890.0, + "step": 8064 + }, + { + "epoch": 1.025950896832464, + "ewc_loss": 0.036659467965364456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8329734302824363e-05, + "grad_norm": 26.144062042236328, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8564197421073914, + "num_tokens": 307617643.0, + "step": 8065 + }, + { + "epoch": 1.0260781071110545, + "ewc_loss": 0.036638811230659485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.831940608099103e-05, + "grad_norm": 25.94876480102539, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8599727749824524, + "num_tokens": 307652258.0, + "step": 8066 + }, + { + "epoch": 1.026205317389645, + "ewc_loss": 0.036612097173929214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8306049241800793e-05, + "grad_norm": 26.034446716308594, + "learning_rate": 1e-06, + "loss": 0.4583, + 
"mean_token_accuracy": 0.8584235310554504, + "num_tokens": 307693245.0, + "step": 8067 + }, + { + "epoch": 1.0263325276682356, + "ewc_loss": 0.03671044111251831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8355220163357444e-05, + "grad_norm": 26.132070541381836, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8660962581634521, + "num_tokens": 307728751.0, + "step": 8068 + }, + { + "epoch": 1.026459737946826, + "ewc_loss": 0.036605432629585266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8302716853213497e-05, + "grad_norm": 26.01030158996582, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8525803089141846, + "num_tokens": 307767869.0, + "step": 8069 + }, + { + "epoch": 1.0265869482254166, + "ewc_loss": 0.036613285541534424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.830664223234635e-05, + "grad_norm": 25.997983932495117, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8724070191383362, + "num_tokens": 307809438.0, + "step": 8070 + }, + { + "epoch": 1.0267141585040072, + "ewc_loss": 0.03664395958185196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8321979950997047e-05, + "grad_norm": 25.978687286376953, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8431907296180725, + "num_tokens": 307848752.0, + "step": 8071 + }, + { + "epoch": 1.0268413687825977, + "ewc_loss": 0.03665246441960335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8326232748222537e-05, + "grad_norm": 26.075157165527344, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8574691414833069, + "num_tokens": 307886080.0, + "step": 8072 + }, + { + "epoch": 1.0269685790611882, + "ewc_loss": 0.03668984770774841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8344924683333375e-05, + "grad_norm": 26.069984436035156, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8766785860061646, + "num_tokens": 307922854.0, + "step": 8073 + }, + { + "epoch": 
1.0270957893397787, + "ewc_loss": 0.03662955388426781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8314776752959006e-05, + "grad_norm": 26.02936553955078, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8599916696548462, + "num_tokens": 307960032.0, + "step": 8074 + }, + { + "epoch": 1.0272229996183693, + "ewc_loss": 0.03676494210958481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8382470443611965e-05, + "grad_norm": 26.09119987487793, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8628500699996948, + "num_tokens": 307996074.0, + "step": 8075 + }, + { + "epoch": 1.0273502098969596, + "ewc_loss": 0.036726564168930054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.836328192439396e-05, + "grad_norm": 26.106340408325195, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8761566877365112, + "num_tokens": 308030833.0, + "step": 8076 + }, + { + "epoch": 1.02747742017555, + "ewc_loss": 0.036665692925453186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.833284659369383e-05, + "grad_norm": 26.041568756103516, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8728065490722656, + "num_tokens": 308079620.0, + "step": 8077 + }, + { + "epoch": 1.0276046304541406, + "ewc_loss": 0.03670409694314003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.835204784583766e-05, + "grad_norm": 26.15791893005371, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8564497828483582, + "num_tokens": 308118555.0, + "step": 8078 + }, + { + "epoch": 1.0277318407327312, + "ewc_loss": 0.03668515011668205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8342574549023993e-05, + "grad_norm": 26.07447624206543, + "learning_rate": 1e-06, + "loss": 0.5213, + "mean_token_accuracy": 0.8456202745437622, + "num_tokens": 308154410.0, + "step": 8079 + }, + { + "epoch": 1.0278590510113217, + "ewc_loss": 0.0366588793694973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.832943962654099e-05, + "grad_norm": 26.09726333618164, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8637405037879944, + "num_tokens": 308191990.0, + "step": 8080 + }, + { + "epoch": 1.0279862612899122, + "ewc_loss": 0.03667550906538963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8337754227104597e-05, + "grad_norm": 26.055526733398438, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8428339958190918, + "num_tokens": 308231047.0, + "step": 8081 + }, + { + "epoch": 1.0281134715685027, + "ewc_loss": 0.0366697758436203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.833488749980461e-05, + "grad_norm": 26.061641693115234, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8762511014938354, + "num_tokens": 308269487.0, + "step": 8082 + }, + { + "epoch": 1.0282406818470933, + "ewc_loss": 0.03663022443652153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8315111447009258e-05, + "grad_norm": 26.09773063659668, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8624439239501953, + "num_tokens": 308304974.0, + "step": 8083 + }, + { + "epoch": 1.0283678921256838, + "ewc_loss": 0.03668675199151039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8343376723350957e-05, + "grad_norm": 26.03082275390625, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8855555653572083, + "num_tokens": 308342072.0, + "step": 8084 + }, + { + "epoch": 1.0284951024042743, + "ewc_loss": 0.036564819514751434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.828240965551231e-05, + "grad_norm": 25.977046966552734, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8588664531707764, + "num_tokens": 308374293.0, + "step": 8085 + }, + { + "epoch": 1.0286223126828649, + "ewc_loss": 0.03669098764657974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8345494027016684e-05, + "grad_norm": 26.045740127563477, + "learning_rate": 1e-06, + "loss": 0.4281, + 
"mean_token_accuracy": 0.8680890798568726, + "num_tokens": 308415872.0, + "step": 8086 + }, + { + "epoch": 1.0287495229614554, + "ewc_loss": 0.03667913377285004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8339567759539932e-05, + "grad_norm": 25.964916229248047, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8569709062576294, + "num_tokens": 308451198.0, + "step": 8087 + }, + { + "epoch": 1.0288767332400457, + "ewc_loss": 0.03673765808343887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8368829842074774e-05, + "grad_norm": 26.05608367919922, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8703362941741943, + "num_tokens": 308490554.0, + "step": 8088 + }, + { + "epoch": 1.0290039435186362, + "ewc_loss": 0.036724843084812164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8362421542406082e-05, + "grad_norm": 25.944494247436523, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8577035069465637, + "num_tokens": 308530221.0, + "step": 8089 + }, + { + "epoch": 1.0291311537972267, + "ewc_loss": 0.036715082824230194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.835754119383637e-05, + "grad_norm": 26.002565383911133, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8469189405441284, + "num_tokens": 308563634.0, + "step": 8090 + }, + { + "epoch": 1.0292583640758173, + "ewc_loss": 0.03679867833852768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.839933975134045e-05, + "grad_norm": 25.979263305664062, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8635307550430298, + "num_tokens": 308605033.0, + "step": 8091 + }, + { + "epoch": 1.0293855743544078, + "ewc_loss": 0.03673144057393074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.836571937019471e-05, + "grad_norm": 26.00811195373535, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8640251159667969, + "num_tokens": 308642603.0, + "step": 8092 + }, + { + "epoch": 
1.0295127846329983, + "ewc_loss": 0.03678440302610397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8392202036920935e-05, + "grad_norm": 25.959259033203125, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8597077131271362, + "num_tokens": 308685189.0, + "step": 8093 + }, + { + "epoch": 1.0296399949115889, + "ewc_loss": 0.03680204972624779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8401024135528132e-05, + "grad_norm": 26.073253631591797, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8624904751777649, + "num_tokens": 308726052.0, + "step": 8094 + }, + { + "epoch": 1.0297672051901794, + "ewc_loss": 0.03677047789096832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.838523894548416e-05, + "grad_norm": 26.084163665771484, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8510545492172241, + "num_tokens": 308764828.0, + "step": 8095 + }, + { + "epoch": 1.02989441546877, + "ewc_loss": 0.03676586598157883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8382932466920465e-05, + "grad_norm": 25.91847801208496, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8729182481765747, + "num_tokens": 308799712.0, + "step": 8096 + }, + { + "epoch": 1.0300216257473604, + "ewc_loss": 0.036771003156900406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8385500879958272e-05, + "grad_norm": 26.128732681274414, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8626488447189331, + "num_tokens": 308839879.0, + "step": 8097 + }, + { + "epoch": 1.030148836025951, + "ewc_loss": 0.03681615740060806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8408078176435083e-05, + "grad_norm": 25.965822219848633, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8564584255218506, + "num_tokens": 308880362.0, + "step": 8098 + }, + { + "epoch": 1.0302760463045415, + "ewc_loss": 0.03672036528587341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8360182366450317e-05, + "grad_norm": 25.94937515258789, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.872906506061554, + "num_tokens": 308921395.0, + "step": 8099 + }, + { + "epoch": 1.0304032565831318, + "ewc_loss": 0.036804016679525375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.840200820879545e-05, + "grad_norm": 26.05874252319336, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8711326718330383, + "num_tokens": 308959976.0, + "step": 8100 + }, + { + "epoch": 1.0305304668617223, + "ewc_loss": 0.0367467887699604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.837339368648827e-05, + "grad_norm": 25.988168716430664, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8758808374404907, + "num_tokens": 309000702.0, + "step": 8101 + }, + { + "epoch": 1.0306576771403129, + "ewc_loss": 0.03675326332449913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8376631487626582e-05, + "grad_norm": 26.07880973815918, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8811089396476746, + "num_tokens": 309036540.0, + "step": 8102 + }, + { + "epoch": 1.0307848874189034, + "ewc_loss": 0.036740195006132126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8370097677689046e-05, + "grad_norm": 25.965389251708984, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8619188070297241, + "num_tokens": 309072408.0, + "step": 8103 + }, + { + "epoch": 1.030912097697494, + "ewc_loss": 0.03671219199895859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8356096916249953e-05, + "grad_norm": 26.140695571899414, + "learning_rate": 1e-06, + "loss": 0.4945, + "mean_token_accuracy": 0.8492233753204346, + "num_tokens": 309110366.0, + "step": 8104 + }, + { + "epoch": 1.0310393079760845, + "ewc_loss": 0.036813996732234955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8406997696729377e-05, + "grad_norm": 26.14800262451172, + "learning_rate": 1e-06, + "loss": 0.435, + 
"mean_token_accuracy": 0.8655697107315063, + "num_tokens": 309155194.0, + "step": 8105 + }, + { + "epoch": 1.031166518254675, + "ewc_loss": 0.036627136170864105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.831356894399505e-05, + "grad_norm": 26.050521850585938, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.871658205986023, + "num_tokens": 309193747.0, + "step": 8106 + }, + { + "epoch": 1.0312937285332655, + "ewc_loss": 0.036646682769060135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.83233405550709e-05, + "grad_norm": 26.04761505126953, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8513621687889099, + "num_tokens": 309234161.0, + "step": 8107 + }, + { + "epoch": 1.031420938811856, + "ewc_loss": 0.03661557286977768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.830778637668118e-05, + "grad_norm": 26.024700164794922, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8526129722595215, + "num_tokens": 309272574.0, + "step": 8108 + }, + { + "epoch": 1.0315481490904466, + "ewc_loss": 0.036684583872556686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8342292605666444e-05, + "grad_norm": 26.016551971435547, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8654195070266724, + "num_tokens": 309306567.0, + "step": 8109 + }, + { + "epoch": 1.031675359369037, + "ewc_loss": 0.036733612418174744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.836680712585803e-05, + "grad_norm": 26.019609451293945, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8675063252449036, + "num_tokens": 309347218.0, + "step": 8110 + }, + { + "epoch": 1.0318025696476276, + "ewc_loss": 0.0367526113986969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8376305888523348e-05, + "grad_norm": 26.143726348876953, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.877252995967865, + "num_tokens": 309385701.0, + "step": 8111 + }, + { + "epoch": 
1.0319297799262181, + "ewc_loss": 0.03667333349585533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8336666471441276e-05, + "grad_norm": 26.05809783935547, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8616544604301453, + "num_tokens": 309424600.0, + "step": 8112 + }, + { + "epoch": 1.0320569902048085, + "ewc_loss": 0.03664765879511833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8323829863220453e-05, + "grad_norm": 26.024560928344727, + "learning_rate": 1e-06, + "loss": 0.5066, + "mean_token_accuracy": 0.8416483402252197, + "num_tokens": 309459929.0, + "step": 8113 + }, + { + "epoch": 1.032184200483399, + "ewc_loss": 0.03671964630484581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.835982402553782e-05, + "grad_norm": 26.054580688476562, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8555490970611572, + "num_tokens": 309505085.0, + "step": 8114 + }, + { + "epoch": 1.0323114107619895, + "ewc_loss": 0.036657724529504776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8328863006900065e-05, + "grad_norm": 26.069438934326172, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.866219162940979, + "num_tokens": 309544746.0, + "step": 8115 + }, + { + "epoch": 1.03243862104058, + "ewc_loss": 0.03664720803499222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8323604308534414e-05, + "grad_norm": 25.945812225341797, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.849899172782898, + "num_tokens": 309584301.0, + "step": 8116 + }, + { + "epoch": 1.0325658313191706, + "ewc_loss": 0.036696918308734894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8348458979744464e-05, + "grad_norm": 26.138877868652344, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8665837645530701, + "num_tokens": 309622570.0, + "step": 8117 + }, + { + "epoch": 1.032693041597761, + "ewc_loss": 0.03671056032180786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.835528019000776e-05, + "grad_norm": 25.95252227783203, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8476680517196655, + "num_tokens": 309666417.0, + "step": 8118 + }, + { + "epoch": 1.0328202518763516, + "ewc_loss": 0.036606524139642715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.830326255003456e-05, + "grad_norm": 26.12600326538086, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8534209728240967, + "num_tokens": 309705706.0, + "step": 8119 + }, + { + "epoch": 1.0329474621549422, + "ewc_loss": 0.036726582795381546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8363291019340977e-05, + "grad_norm": 26.0582275390625, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8720334768295288, + "num_tokens": 309744288.0, + "step": 8120 + }, + { + "epoch": 1.0330746724335327, + "ewc_loss": 0.036645032465457916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8322516552871093e-05, + "grad_norm": 26.011871337890625, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8505034446716309, + "num_tokens": 309780406.0, + "step": 8121 + }, + { + "epoch": 1.0332018827121232, + "ewc_loss": 0.03670212998986244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8351065591559745e-05, + "grad_norm": 26.208040237426758, + "learning_rate": 1e-06, + "loss": 0.5393, + "mean_token_accuracy": 0.8307656049728394, + "num_tokens": 309811135.0, + "step": 8122 + }, + { + "epoch": 1.0333290929907137, + "ewc_loss": 0.03668367117643356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8341836039326154e-05, + "grad_norm": 26.009033203125, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8663140535354614, + "num_tokens": 309847037.0, + "step": 8123 + }, + { + "epoch": 1.0334563032693043, + "ewc_loss": 0.03668465465307236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.834232716646511e-05, + "grad_norm": 26.123533248901367, + "learning_rate": 1e-06, + "loss": 0.445, + 
"mean_token_accuracy": 0.8623058199882507, + "num_tokens": 309884274.0, + "step": 8124 + }, + { + "epoch": 1.0335835135478946, + "ewc_loss": 0.0367162711918354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8358136003371328e-05, + "grad_norm": 26.032915115356445, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8750606179237366, + "num_tokens": 309915572.0, + "step": 8125 + }, + { + "epoch": 1.033710723826485, + "ewc_loss": 0.03663566708564758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8317834474146366e-05, + "grad_norm": 26.030607223510742, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8680980801582336, + "num_tokens": 309954502.0, + "step": 8126 + }, + { + "epoch": 1.0338379341050756, + "ewc_loss": 0.03671073541045189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.835536750149913e-05, + "grad_norm": 26.058542251586914, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8847659230232239, + "num_tokens": 309993015.0, + "step": 8127 + }, + { + "epoch": 1.0339651443836662, + "ewc_loss": 0.036706775426864624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8353388441028073e-05, + "grad_norm": 26.06203842163086, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8541923761367798, + "num_tokens": 310030840.0, + "step": 8128 + }, + { + "epoch": 1.0340923546622567, + "ewc_loss": 0.03673300892114639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8366505173617043e-05, + "grad_norm": 25.99480628967285, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8627824783325195, + "num_tokens": 310067586.0, + "step": 8129 + }, + { + "epoch": 1.0342195649408472, + "ewc_loss": 0.03677506744861603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8387534510111436e-05, + "grad_norm": 26.174095153808594, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.872544527053833, + "num_tokens": 310104496.0, + "step": 8130 + }, + { + "epoch": 
1.0343467752194377, + "ewc_loss": 0.03682895004749298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8414475562167354e-05, + "grad_norm": 26.126201629638672, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8455992937088013, + "num_tokens": 310141162.0, + "step": 8131 + }, + { + "epoch": 1.0344739854980283, + "ewc_loss": 0.03674982488155365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.837491254264023e-05, + "grad_norm": 26.06195640563965, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8615027070045471, + "num_tokens": 310182965.0, + "step": 8132 + }, + { + "epoch": 1.0346011957766188, + "ewc_loss": 0.03682016208767891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8410080883768387e-05, + "grad_norm": 26.135250091552734, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8458168506622314, + "num_tokens": 310219572.0, + "step": 8133 + }, + { + "epoch": 1.0347284060552093, + "ewc_loss": 0.03673353046178818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.836676528910175e-05, + "grad_norm": 25.995725631713867, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8504223227500916, + "num_tokens": 310248711.0, + "step": 8134 + }, + { + "epoch": 1.0348556163337999, + "ewc_loss": 0.036750420928001404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8375210856902413e-05, + "grad_norm": 26.12465476989746, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8464245200157166, + "num_tokens": 310287915.0, + "step": 8135 + }, + { + "epoch": 1.0349828266123904, + "ewc_loss": 0.03678346797823906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8391734556644224e-05, + "grad_norm": 26.043811798095703, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8613231778144836, + "num_tokens": 310325619.0, + "step": 8136 + }, + { + "epoch": 1.0351100368909807, + "ewc_loss": 0.03675604239106178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.837802119553089e-05, + "grad_norm": 26.076662063598633, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8710068464279175, + "num_tokens": 310362456.0, + "step": 8137 + }, + { + "epoch": 1.0352372471695712, + "ewc_loss": 0.03678176924586296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8390885088592768e-05, + "grad_norm": 26.027746200561523, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8576894402503967, + "num_tokens": 310399584.0, + "step": 8138 + }, + { + "epoch": 1.0353644574481617, + "ewc_loss": 0.03681711480021477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8408556570648216e-05, + "grad_norm": 26.180463790893555, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8562611937522888, + "num_tokens": 310437435.0, + "step": 8139 + }, + { + "epoch": 1.0354916677267523, + "ewc_loss": 0.03678448125720024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.839224023569841e-05, + "grad_norm": 26.017927169799805, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8687518835067749, + "num_tokens": 310476320.0, + "step": 8140 + }, + { + "epoch": 1.0356188780053428, + "ewc_loss": 0.03679919242858887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.839959622884635e-05, + "grad_norm": 26.090103149414062, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.878307580947876, + "num_tokens": 310512956.0, + "step": 8141 + }, + { + "epoch": 1.0357460882839333, + "ewc_loss": 0.03683555871248245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8417778846924193e-05, + "grad_norm": 26.09330177307129, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8616914749145508, + "num_tokens": 310547605.0, + "step": 8142 + }, + { + "epoch": 1.0358732985625239, + "ewc_loss": 0.03678950294852257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8394752260064706e-05, + "grad_norm": 26.06321907043457, + "learning_rate": 1e-06, + "loss": 0.4692, + 
"mean_token_accuracy": 0.8608359098434448, + "num_tokens": 310586991.0, + "step": 8143 + }, + { + "epoch": 1.0360005088411144, + "ewc_loss": 0.03685897961258888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.842948950070422e-05, + "grad_norm": 26.09273910522461, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8616555333137512, + "num_tokens": 310621271.0, + "step": 8144 + }, + { + "epoch": 1.036127719119705, + "ewc_loss": 0.03683779388666153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8418897525407374e-05, + "grad_norm": 26.13836097717285, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8658366203308105, + "num_tokens": 310662154.0, + "step": 8145 + }, + { + "epoch": 1.0362549293982954, + "ewc_loss": 0.03683115169405937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.84155760507565e-05, + "grad_norm": 26.09343719482422, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8722800612449646, + "num_tokens": 310701924.0, + "step": 8146 + }, + { + "epoch": 1.036382139676886, + "ewc_loss": 0.0367923341691494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8396167433820665e-05, + "grad_norm": 25.939632415771484, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8692905902862549, + "num_tokens": 310739688.0, + "step": 8147 + }, + { + "epoch": 1.0365093499554765, + "ewc_loss": 0.036828164011240005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8414082660456188e-05, + "grad_norm": 26.05191993713379, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8577994108200073, + "num_tokens": 310773317.0, + "step": 8148 + }, + { + "epoch": 1.0366365602340668, + "ewc_loss": 0.03691035136580467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8455175450071692e-05, + "grad_norm": 26.122407913208008, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8613748550415039, + "num_tokens": 310810854.0, + "step": 8149 + }, + { + "epoch": 
1.0367637705126573, + "ewc_loss": 0.03688036650419235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8440183339407668e-05, + "grad_norm": 26.173168182373047, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8554875254631042, + "num_tokens": 310846080.0, + "step": 8150 + }, + { + "epoch": 1.0368909807912479, + "ewc_loss": 0.03686603158712387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8433016521157697e-05, + "grad_norm": 26.157455444335938, + "learning_rate": 1e-06, + "loss": 0.5141, + "mean_token_accuracy": 0.8398507833480835, + "num_tokens": 310885032.0, + "step": 8151 + }, + { + "epoch": 1.0370181910698384, + "ewc_loss": 0.03682930767536163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.84146538231289e-05, + "grad_norm": 26.06821060180664, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8684170246124268, + "num_tokens": 310924518.0, + "step": 8152 + }, + { + "epoch": 1.037145401348429, + "ewc_loss": 0.03682927414774895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.841463745222427e-05, + "grad_norm": 26.07192611694336, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8650155067443848, + "num_tokens": 310961734.0, + "step": 8153 + }, + { + "epoch": 1.0372726116270194, + "ewc_loss": 0.0368938185274601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844690996222198e-05, + "grad_norm": 26.133037567138672, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8642659187316895, + "num_tokens": 311004541.0, + "step": 8154 + }, + { + "epoch": 1.03739982190561, + "ewc_loss": 0.036824360489845276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.841217999754008e-05, + "grad_norm": 25.92755889892578, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8711239099502563, + "num_tokens": 311043947.0, + "step": 8155 + }, + { + "epoch": 1.0375270321842005, + "ewc_loss": 0.036855705082416534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.842785241024103e-05, 
+ "grad_norm": 26.157604217529297, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8586322069168091, + "num_tokens": 311083154.0, + "step": 8156 + }, + { + "epoch": 1.037654242462791, + "ewc_loss": 0.03688938543200493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8444692614139058e-05, + "grad_norm": 26.077524185180664, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.874276340007782, + "num_tokens": 311121429.0, + "step": 8157 + }, + { + "epoch": 1.0377814527413816, + "ewc_loss": 0.036828555166721344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8414277292322367e-05, + "grad_norm": 26.08049964904785, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8474841713905334, + "num_tokens": 311166578.0, + "step": 8158 + }, + { + "epoch": 1.037908663019972, + "ewc_loss": 0.036897528916597366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8448765331413597e-05, + "grad_norm": 26.132917404174805, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8652726411819458, + "num_tokens": 311209645.0, + "step": 8159 + }, + { + "epoch": 1.0380358732985626, + "ewc_loss": 0.036853592842817307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.842679557739757e-05, + "grad_norm": 26.11590003967285, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8666305541992188, + "num_tokens": 311248428.0, + "step": 8160 + }, + { + "epoch": 1.0381630835771531, + "ewc_loss": 0.036822784692049026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8411392375128344e-05, + "grad_norm": 26.076833724975586, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8640201091766357, + "num_tokens": 311282040.0, + "step": 8161 + }, + { + "epoch": 1.0382902938557435, + "ewc_loss": 0.036781877279281616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.839093783928547e-05, + "grad_norm": 26.04930877685547, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 
0.8692243099212646, + "num_tokens": 311320233.0, + "step": 8162 + }, + { + "epoch": 1.038417504134334, + "ewc_loss": 0.03683703392744064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8418517356622033e-05, + "grad_norm": 26.146686553955078, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8471201658248901, + "num_tokens": 311357586.0, + "step": 8163 + }, + { + "epoch": 1.0385447144129245, + "ewc_loss": 0.03689804673194885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.84490236279089e-05, + "grad_norm": 26.08411407470703, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8622973561286926, + "num_tokens": 311395545.0, + "step": 8164 + }, + { + "epoch": 1.038671924691515, + "ewc_loss": 0.03679978474974632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8399892724119127e-05, + "grad_norm": 26.129322052001953, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.861606240272522, + "num_tokens": 311428785.0, + "step": 8165 + }, + { + "epoch": 1.0387991349701056, + "ewc_loss": 0.03688129037618637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8440645362716168e-05, + "grad_norm": 26.200464248657227, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8507881760597229, + "num_tokens": 311469546.0, + "step": 8166 + }, + { + "epoch": 1.038926345248696, + "ewc_loss": 0.03680392727255821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8401964553049766e-05, + "grad_norm": 26.049680709838867, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8543572425842285, + "num_tokens": 311504064.0, + "step": 8167 + }, + { + "epoch": 1.0390535555272866, + "ewc_loss": 0.03680604323744774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8403021385893226e-05, + "grad_norm": 26.285999298095703, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.845531165599823, + "num_tokens": 311536600.0, + "step": 8168 + }, + { + "epoch": 1.0391807658058771, + "ewc_loss": 
0.03687357157468796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8436785467201844e-05, + "grad_norm": 26.281200408935547, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8795496821403503, + "num_tokens": 311573638.0, + "step": 8169 + }, + { + "epoch": 1.0393079760844677, + "ewc_loss": 0.03673616051673889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8368080418440513e-05, + "grad_norm": 26.03086280822754, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8548663258552551, + "num_tokens": 311611085.0, + "step": 8170 + }, + { + "epoch": 1.0394351863630582, + "ewc_loss": 0.036770015954971313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.838500793382991e-05, + "grad_norm": 26.265846252441406, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8559266924858093, + "num_tokens": 311655497.0, + "step": 8171 + }, + { + "epoch": 1.0395623966416487, + "ewc_loss": 0.036750126630067825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8375063518760726e-05, + "grad_norm": 26.011953353881836, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8479246497154236, + "num_tokens": 311688167.0, + "step": 8172 + }, + { + "epoch": 1.0396896069202393, + "ewc_loss": 0.0367659367620945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8382968846708536e-05, + "grad_norm": 26.076900482177734, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8612629771232605, + "num_tokens": 311724926.0, + "step": 8173 + }, + { + "epoch": 1.0398168171988296, + "ewc_loss": 0.03683490678668022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.841745324782096e-05, + "grad_norm": 26.170787811279297, + "learning_rate": 1e-06, + "loss": 0.5119, + "mean_token_accuracy": 0.843644380569458, + "num_tokens": 311760729.0, + "step": 8174 + }, + { + "epoch": 1.03994402747742, + "ewc_loss": 0.036834634840488434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8417316823615693e-05, + "grad_norm": 
26.022424697875977, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.862496018409729, + "num_tokens": 311802138.0, + "step": 8175 + }, + { + "epoch": 1.0400712377560106, + "ewc_loss": 0.036815572530031204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8407787138130516e-05, + "grad_norm": 26.174684524536133, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8694217205047607, + "num_tokens": 311838510.0, + "step": 8176 + }, + { + "epoch": 1.0401984480346012, + "ewc_loss": 0.036887139081954956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844357029767707e-05, + "grad_norm": 26.031787872314453, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8553075194358826, + "num_tokens": 311870875.0, + "step": 8177 + }, + { + "epoch": 1.0403256583131917, + "ewc_loss": 0.03683236241340637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.841618177422788e-05, + "grad_norm": 26.158321380615234, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.874565064907074, + "num_tokens": 311905854.0, + "step": 8178 + }, + { + "epoch": 1.0404528685917822, + "ewc_loss": 0.03690028563141823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.845014230639208e-05, + "grad_norm": 26.107885360717773, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8584874868392944, + "num_tokens": 311936381.0, + "step": 8179 + }, + { + "epoch": 1.0405800788703727, + "ewc_loss": 0.03682059794664383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8410299162496813e-05, + "grad_norm": 26.090181350708008, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.855800449848175, + "num_tokens": 311972529.0, + "step": 8180 + }, + { + "epoch": 1.0407072891489633, + "ewc_loss": 0.03694772720336914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8473863747203723e-05, + "grad_norm": 26.13001251220703, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8809666633605957, + 
"num_tokens": 312007883.0, + "step": 8181 + }, + { + "epoch": 1.0408344994275538, + "ewc_loss": 0.03678184375166893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.839092146838084e-05, + "grad_norm": 26.11821174621582, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8573823571205139, + "num_tokens": 312041000.0, + "step": 8182 + }, + { + "epoch": 1.0409617097061443, + "ewc_loss": 0.03695645183324814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8478225683793426e-05, + "grad_norm": 26.114948272705078, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8666155338287354, + "num_tokens": 312076011.0, + "step": 8183 + }, + { + "epoch": 1.0410889199847349, + "ewc_loss": 0.036939237266778946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8469618225935847e-05, + "grad_norm": 26.184160232543945, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8560478687286377, + "num_tokens": 312115614.0, + "step": 8184 + }, + { + "epoch": 1.0412161302633254, + "ewc_loss": 0.03688816726207733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844408325268887e-05, + "grad_norm": 26.119985580444336, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8551830053329468, + "num_tokens": 312154111.0, + "step": 8185 + }, + { + "epoch": 1.0413433405419157, + "ewc_loss": 0.03688567876815796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8442839063936844e-05, + "grad_norm": 26.15599250793457, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8561016917228699, + "num_tokens": 312191571.0, + "step": 8186 + }, + { + "epoch": 1.0414705508205062, + "ewc_loss": 0.03696943819522858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8484719475964084e-05, + "grad_norm": 26.067550659179688, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8538566827774048, + "num_tokens": 312229205.0, + "step": 8187 + }, + { + "epoch": 1.0415977610990967, + "ewc_loss": 0.03689391165971756, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844695543695707e-05, + "grad_norm": 26.14252471923828, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8769770860671997, + "num_tokens": 312266604.0, + "step": 8188 + }, + { + "epoch": 1.0417249713776873, + "ewc_loss": 0.0370330773293972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8516539057600312e-05, + "grad_norm": 26.22531509399414, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8521174788475037, + "num_tokens": 312305222.0, + "step": 8189 + }, + { + "epoch": 1.0418521816562778, + "ewc_loss": 0.03695450350642204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.847725252446253e-05, + "grad_norm": 26.1366024017334, + "learning_rate": 1e-06, + "loss": 0.5043, + "mean_token_accuracy": 0.83978271484375, + "num_tokens": 312345823.0, + "step": 8190 + }, + { + "epoch": 1.0419793919348683, + "ewc_loss": 0.03691817820072174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8459089915268123e-05, + "grad_norm": 26.12614631652832, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.852477490901947, + "num_tokens": 312383569.0, + "step": 8191 + }, + { + "epoch": 1.0421066022134589, + "ewc_loss": 0.03688674792647362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8443373846821487e-05, + "grad_norm": 26.046802520751953, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.86539626121521, + "num_tokens": 312423068.0, + "step": 8192 + }, + { + "epoch": 1.0422338124920494, + "ewc_loss": 0.0368814580142498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844072903622873e-05, + "grad_norm": 26.08820152282715, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8560986518859863, + "num_tokens": 312460569.0, + "step": 8193 + }, + { + "epoch": 1.04236102277064, + "ewc_loss": 0.03694026172161102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8470131180947646e-05, + "grad_norm": 26.050872802734375, + "learning_rate": 1e-06, + 
"loss": 0.5133, + "mean_token_accuracy": 0.8404858112335205, + "num_tokens": 312501654.0, + "step": 8194 + }, + { + "epoch": 1.0424882330492304, + "ewc_loss": 0.036911964416503906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8455983081366867e-05, + "grad_norm": 26.08401107788086, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8532974720001221, + "num_tokens": 312543630.0, + "step": 8195 + }, + { + "epoch": 1.042615443327821, + "ewc_loss": 0.037011656910181046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8505828847992234e-05, + "grad_norm": 26.16946792602539, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8595829010009766, + "num_tokens": 312579428.0, + "step": 8196 + }, + { + "epoch": 1.0427426536064115, + "ewc_loss": 0.0369144007563591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.845719998527784e-05, + "grad_norm": 25.998271942138672, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8758954405784607, + "num_tokens": 312621388.0, + "step": 8197 + }, + { + "epoch": 1.0428698638850018, + "ewc_loss": 0.0369793102145195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8489654394215904e-05, + "grad_norm": 26.224689483642578, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8633027076721191, + "num_tokens": 312665303.0, + "step": 8198 + }, + { + "epoch": 1.0429970741635923, + "ewc_loss": 0.036931611597537994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8465805624146014e-05, + "grad_norm": 26.103113174438477, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8513513207435608, + "num_tokens": 312702354.0, + "step": 8199 + }, + { + "epoch": 1.0431242844421829, + "ewc_loss": 0.03684012591838837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8420063497615047e-05, + "grad_norm": 26.117321014404297, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8664538860321045, + "num_tokens": 312737223.0, + "step": 8200 + }, + { + 
"epoch": 1.0432514947207734, + "ewc_loss": 0.03697643429040909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8488217392587103e-05, + "grad_norm": 26.173633575439453, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8637055158615112, + "num_tokens": 312773353.0, + "step": 8201 + }, + { + "epoch": 1.043378704999364, + "ewc_loss": 0.03686818480491638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8434091543895192e-05, + "grad_norm": 26.065526962280273, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8441686034202576, + "num_tokens": 312809230.0, + "step": 8202 + }, + { + "epoch": 1.0435059152779544, + "ewc_loss": 0.036908410489559174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8454205928719603e-05, + "grad_norm": 26.143497467041016, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8786987066268921, + "num_tokens": 312841278.0, + "step": 8203 + }, + { + "epoch": 1.043633125556545, + "ewc_loss": 0.036901701241731644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.845084989327006e-05, + "grad_norm": 26.11807632446289, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8664002418518066, + "num_tokens": 312888469.0, + "step": 8204 + }, + { + "epoch": 1.0437603358351355, + "ewc_loss": 0.03693882375955582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8469412680133246e-05, + "grad_norm": 26.315082550048828, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.869300365447998, + "num_tokens": 312925446.0, + "step": 8205 + }, + { + "epoch": 1.043887546113726, + "ewc_loss": 0.036843184381723404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8421591448714025e-05, + "grad_norm": 26.049236297607422, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8597651720046997, + "num_tokens": 312965425.0, + "step": 8206 + }, + { + "epoch": 1.0440147563923166, + "ewc_loss": 0.03674907237291336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8374536011833698e-05, + "grad_norm": 26.160627365112305, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8561989665031433, + "num_tokens": 313000637.0, + "step": 8207 + }, + { + "epoch": 1.044141966670907, + "ewc_loss": 0.03684989735484123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.842494930315297e-05, + "grad_norm": 26.12040138244629, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8626191020011902, + "num_tokens": 313035986.0, + "step": 8208 + }, + { + "epoch": 1.0442691769494976, + "ewc_loss": 0.036793701350688934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8396851373836398e-05, + "grad_norm": 26.049541473388672, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8504297137260437, + "num_tokens": 313077027.0, + "step": 8209 + }, + { + "epoch": 1.0443963872280881, + "ewc_loss": 0.0368635319173336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8431765056448057e-05, + "grad_norm": 26.291845321655273, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8721649646759033, + "num_tokens": 313121733.0, + "step": 8210 + }, + { + "epoch": 1.0445235975066784, + "ewc_loss": 0.036826763302087784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8413382349535823e-05, + "grad_norm": 26.036287307739258, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8558935523033142, + "num_tokens": 313165542.0, + "step": 8211 + }, + { + "epoch": 1.044650807785269, + "ewc_loss": 0.03678746521472931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8393731807009317e-05, + "grad_norm": 26.289514541625977, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8514704704284668, + "num_tokens": 313201082.0, + "step": 8212 + }, + { + "epoch": 1.0447780180638595, + "ewc_loss": 0.036831919103860855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8415959857520647e-05, + "grad_norm": 26.137434005737305, + "learning_rate": 1e-06, + "loss": 0.4062, + 
"mean_token_accuracy": 0.873179018497467, + "num_tokens": 313241765.0, + "step": 8213 + }, + { + "epoch": 1.04490522834245, + "ewc_loss": 0.03673149645328522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8365748474025168e-05, + "grad_norm": 26.21671485900879, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8601281642913818, + "num_tokens": 313282471.0, + "step": 8214 + }, + { + "epoch": 1.0450324386210406, + "ewc_loss": 0.036751121282577515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8375560102867894e-05, + "grad_norm": 26.040334701538086, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8575154542922974, + "num_tokens": 313331505.0, + "step": 8215 + }, + { + "epoch": 1.045159648899631, + "ewc_loss": 0.03679220378398895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8396101950202137e-05, + "grad_norm": 26.080764770507812, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.861937403678894, + "num_tokens": 313369823.0, + "step": 8216 + }, + { + "epoch": 1.0452868591782216, + "ewc_loss": 0.03680910915136337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8404554793960415e-05, + "grad_norm": 26.09733009338379, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8626723885536194, + "num_tokens": 313404255.0, + "step": 8217 + }, + { + "epoch": 1.0454140694568121, + "ewc_loss": 0.0367920882999897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8396043742541224e-05, + "grad_norm": 26.149377822875977, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8724181652069092, + "num_tokens": 313439368.0, + "step": 8218 + }, + { + "epoch": 1.0455412797354027, + "ewc_loss": 0.036790210753679276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8395105144008994e-05, + "grad_norm": 26.1444149017334, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8536614179611206, + "num_tokens": 313479475.0, + "step": 8219 + }, + { + "epoch": 
1.0456684900139932, + "ewc_loss": 0.0368414930999279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.842074743763078e-05, + "grad_norm": 26.195024490356445, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8541991114616394, + "num_tokens": 313514604.0, + "step": 8220 + }, + { + "epoch": 1.0457957002925837, + "ewc_loss": 0.03682902827858925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.841451376094483e-05, + "grad_norm": 26.1241512298584, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8571944236755371, + "num_tokens": 313548907.0, + "step": 8221 + }, + { + "epoch": 1.0459229105711743, + "ewc_loss": 0.03678498789668083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8392493075225502e-05, + "grad_norm": 26.091609954833984, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8608050346374512, + "num_tokens": 313586615.0, + "step": 8222 + }, + { + "epoch": 1.0460501208497646, + "ewc_loss": 0.03676369786262512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8381848349235952e-05, + "grad_norm": 26.07245635986328, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8521336317062378, + "num_tokens": 313625202.0, + "step": 8223 + }, + { + "epoch": 1.046177331128355, + "ewc_loss": 0.03682347759604454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8411737983115017e-05, + "grad_norm": 26.114181518554688, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8549103140830994, + "num_tokens": 313663257.0, + "step": 8224 + }, + { + "epoch": 1.0463045414069456, + "ewc_loss": 0.03686442971229553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8432214346830733e-05, + "grad_norm": 26.151851654052734, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8647226095199585, + "num_tokens": 313701430.0, + "step": 8225 + }, + { + "epoch": 1.0464317516855361, + "ewc_loss": 0.036804746836423874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8402373825665563e-05, + "grad_norm": 26.08445930480957, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8717348575592041, + "num_tokens": 313741286.0, + "step": 8226 + }, + { + "epoch": 1.0465589619641267, + "ewc_loss": 0.03686848282814026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8434240701026283e-05, + "grad_norm": 26.12713050842285, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8464547395706177, + "num_tokens": 313779115.0, + "step": 8227 + }, + { + "epoch": 1.0466861722427172, + "ewc_loss": 0.036941543221473694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8470771465217695e-05, + "grad_norm": 26.219139099121094, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8667839169502258, + "num_tokens": 313822331.0, + "step": 8228 + }, + { + "epoch": 1.0468133825213077, + "ewc_loss": 0.03688611462712288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844305734266527e-05, + "grad_norm": 26.356937408447266, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8558272123336792, + "num_tokens": 313863545.0, + "step": 8229 + }, + { + "epoch": 1.0469405927998983, + "ewc_loss": 0.03683584928512573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8417924366076477e-05, + "grad_norm": 26.092674255371094, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8815221190452576, + "num_tokens": 313894956.0, + "step": 8230 + }, + { + "epoch": 1.0470678030784888, + "ewc_loss": 0.036876093596220016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.84380460268585e-05, + "grad_norm": 26.20763397216797, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8699101805686951, + "num_tokens": 313927920.0, + "step": 8231 + }, + { + "epoch": 1.0471950133570793, + "ewc_loss": 0.036905378103256226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8452688891557045e-05, + "grad_norm": 26.19867706298828, + "learning_rate": 1e-06, + "loss": 0.4147, + 
"mean_token_accuracy": 0.8699843883514404, + "num_tokens": 313963230.0, + "step": 8232 + }, + { + "epoch": 1.0473222236356698, + "ewc_loss": 0.03682650253176689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8413251382298768e-05, + "grad_norm": 26.115089416503906, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8534609079360962, + "num_tokens": 313995951.0, + "step": 8233 + }, + { + "epoch": 1.0474494339142604, + "ewc_loss": 0.03689366951584816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8446835383656435e-05, + "grad_norm": 26.136119842529297, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8620853424072266, + "num_tokens": 314040928.0, + "step": 8234 + }, + { + "epoch": 1.0475766441928507, + "ewc_loss": 0.03681393712759018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.840696859289892e-05, + "grad_norm": 26.084012985229492, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8532525300979614, + "num_tokens": 314080329.0, + "step": 8235 + }, + { + "epoch": 1.0477038544714412, + "ewc_loss": 0.03685130178928375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.842565143306274e-05, + "grad_norm": 26.17819595336914, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8625076413154602, + "num_tokens": 314117107.0, + "step": 8236 + }, + { + "epoch": 1.0478310647500317, + "ewc_loss": 0.03687914460897446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8439572158968076e-05, + "grad_norm": 26.07097816467285, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8583508729934692, + "num_tokens": 314148077.0, + "step": 8237 + }, + { + "epoch": 1.0479582750286223, + "ewc_loss": 0.036903202533721924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8451601135893725e-05, + "grad_norm": 26.115558624267578, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.853240966796875, + "num_tokens": 314182064.0, + "step": 8238 + }, + { + "epoch": 
1.0480854853072128, + "ewc_loss": 0.036933787167072296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8466893379809335e-05, + "grad_norm": 26.11678695678711, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8529797792434692, + "num_tokens": 314221667.0, + "step": 8239 + }, + { + "epoch": 1.0482126955858033, + "ewc_loss": 0.03693540394306183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8467702830093913e-05, + "grad_norm": 26.145660400390625, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8582614660263062, + "num_tokens": 314255225.0, + "step": 8240 + }, + { + "epoch": 1.0483399058643939, + "ewc_loss": 0.03696141391992569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8480706785339862e-05, + "grad_norm": 26.12998390197754, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8614848852157593, + "num_tokens": 314288011.0, + "step": 8241 + }, + { + "epoch": 1.0484671161429844, + "ewc_loss": 0.0369735062122345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.848675310611725e-05, + "grad_norm": 26.178590774536133, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.859120786190033, + "num_tokens": 314332461.0, + "step": 8242 + }, + { + "epoch": 1.048594326421575, + "ewc_loss": 0.03702165186405182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8510825611883774e-05, + "grad_norm": 26.060754776000977, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8578441143035889, + "num_tokens": 314367967.0, + "step": 8243 + }, + { + "epoch": 1.0487215367001654, + "ewc_loss": 0.03696407377719879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8482036466593854e-05, + "grad_norm": 26.162002563476562, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8703308701515198, + "num_tokens": 314404487.0, + "step": 8244 + }, + { + "epoch": 1.048848746978756, + "ewc_loss": 0.03699401393532753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8497006749385037e-05, + "grad_norm": 26.029285430908203, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8583869934082031, + "num_tokens": 314444731.0, + "step": 8245 + }, + { + "epoch": 1.0489759572573465, + "ewc_loss": 0.03697764873504639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8488824935047887e-05, + "grad_norm": 26.20485496520996, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8612246513366699, + "num_tokens": 314479720.0, + "step": 8246 + }, + { + "epoch": 1.0491031675359368, + "ewc_loss": 0.0370766818523407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8538341464591213e-05, + "grad_norm": 26.096683502197266, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8788846731185913, + "num_tokens": 314515613.0, + "step": 8247 + }, + { + "epoch": 1.0492303778145273, + "ewc_loss": 0.03699028491973877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8495142285246402e-05, + "grad_norm": 26.1580867767334, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8605541586875916, + "num_tokens": 314556519.0, + "step": 8248 + }, + { + "epoch": 1.0493575880931179, + "ewc_loss": 0.03700961545109749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8504808394936845e-05, + "grad_norm": 26.14987564086914, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8727535605430603, + "num_tokens": 314590803.0, + "step": 8249 + }, + { + "epoch": 1.0494847983717084, + "ewc_loss": 0.03700518608093262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8502592865843326e-05, + "grad_norm": 26.1198787689209, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8634840250015259, + "num_tokens": 314631347.0, + "step": 8250 + }, + { + "epoch": 1.049612008650299, + "ewc_loss": 0.03699289634823799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8496448319638148e-05, + "grad_norm": 26.134973526000977, + "learning_rate": 1e-06, + "loss": 0.4515, + 
"mean_token_accuracy": 0.8615210652351379, + "num_tokens": 314673739.0, + "step": 8251 + }, + { + "epoch": 1.0497392189288894, + "ewc_loss": 0.03699793666601181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.849896761996206e-05, + "grad_norm": 26.102203369140625, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8631556034088135, + "num_tokens": 314718498.0, + "step": 8252 + }, + { + "epoch": 1.04986642920748, + "ewc_loss": 0.03700467571616173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.850233820732683e-05, + "grad_norm": 26.188669204711914, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8669837713241577, + "num_tokens": 314758755.0, + "step": 8253 + }, + { + "epoch": 1.0499936394860705, + "ewc_loss": 0.03702724352478981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8513621398597024e-05, + "grad_norm": 26.128353118896484, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8651567101478577, + "num_tokens": 314793644.0, + "step": 8254 + }, + { + "epoch": 1.050120849764661, + "ewc_loss": 0.036906491965055466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.845324550231453e-05, + "grad_norm": 26.073711395263672, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8573421239852905, + "num_tokens": 314828045.0, + "step": 8255 + }, + { + "epoch": 1.0502480600432516, + "ewc_loss": 0.03704734146595001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8523671315051615e-05, + "grad_norm": 26.23058319091797, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8641636371612549, + "num_tokens": 314866088.0, + "step": 8256 + }, + { + "epoch": 1.050375270321842, + "ewc_loss": 0.03697621822357178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.848810825322289e-05, + "grad_norm": 26.077299118041992, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8532796502113342, + "num_tokens": 314904757.0, + "step": 8257 + }, + { + "epoch": 
1.0505024806004326, + "ewc_loss": 0.03693375736474991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8466878827894107e-05, + "grad_norm": 26.261333465576172, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8531515598297119, + "num_tokens": 314941410.0, + "step": 8258 + }, + { + "epoch": 1.0506296908790231, + "ewc_loss": 0.03706739470362663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.853369758464396e-05, + "grad_norm": 26.177080154418945, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.869523286819458, + "num_tokens": 314982060.0, + "step": 8259 + }, + { + "epoch": 1.0507569011576134, + "ewc_loss": 0.036931850016117096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.846592567744665e-05, + "grad_norm": 26.254552841186523, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8529376983642578, + "num_tokens": 315021812.0, + "step": 8260 + }, + { + "epoch": 1.050884111436204, + "ewc_loss": 0.03698311001062393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.849155523814261e-05, + "grad_norm": 26.19679832458496, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8577792048454285, + "num_tokens": 315059520.0, + "step": 8261 + }, + { + "epoch": 1.0510113217147945, + "ewc_loss": 0.03689567372202873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844783764681779e-05, + "grad_norm": 26.18406105041504, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8564960956573486, + "num_tokens": 315098392.0, + "step": 8262 + }, + { + "epoch": 1.051138531993385, + "ewc_loss": 0.0369335375726223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.846676786954049e-05, + "grad_norm": 26.168991088867188, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8503342270851135, + "num_tokens": 315137375.0, + "step": 8263 + }, + { + "epoch": 1.0512657422719756, + "ewc_loss": 0.03693556413054466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.846778286562767e-05, + 
"grad_norm": 26.15070152282715, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8654900789260864, + "num_tokens": 315171308.0, + "step": 8264 + }, + { + "epoch": 1.051392952550566, + "ewc_loss": 0.03699006512761116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.849503314588219e-05, + "grad_norm": 26.165006637573242, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8634016513824463, + "num_tokens": 315212176.0, + "step": 8265 + }, + { + "epoch": 1.0515201628291566, + "ewc_loss": 0.03699805960059166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.849902946560178e-05, + "grad_norm": 26.17258071899414, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8495235443115234, + "num_tokens": 315251321.0, + "step": 8266 + }, + { + "epoch": 1.0516473731077471, + "ewc_loss": 0.03694836422801018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8474182070349343e-05, + "grad_norm": 26.1000919342041, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8477290272712708, + "num_tokens": 315288780.0, + "step": 8267 + }, + { + "epoch": 1.0517745833863377, + "ewc_loss": 0.03697976469993591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8489881767891347e-05, + "grad_norm": 26.11745834350586, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8839743137359619, + "num_tokens": 315325630.0, + "step": 8268 + }, + { + "epoch": 1.0519017936649282, + "ewc_loss": 0.03698785975575447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8493929019314237e-05, + "grad_norm": 26.114158630371094, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.856395959854126, + "num_tokens": 315362887.0, + "step": 8269 + }, + { + "epoch": 1.0520290039435187, + "ewc_loss": 0.03708775341510773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8543876649346203e-05, + "grad_norm": 26.329038619995117, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8731315732002258, + 
"num_tokens": 315397552.0, + "step": 8270 + }, + { + "epoch": 1.0521562142221093, + "ewc_loss": 0.036991242319345474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8495620679459535e-05, + "grad_norm": 25.963769912719727, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8712615966796875, + "num_tokens": 315436785.0, + "step": 8271 + }, + { + "epoch": 1.0522834245006996, + "ewc_loss": 0.03698807209730148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8494036339689046e-05, + "grad_norm": 26.242496490478516, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8707602024078369, + "num_tokens": 315467849.0, + "step": 8272 + }, + { + "epoch": 1.05241063477929, + "ewc_loss": 0.037091050297021866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8545524653745815e-05, + "grad_norm": 26.04273223876953, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8696851134300232, + "num_tokens": 315501496.0, + "step": 8273 + }, + { + "epoch": 1.0525378450578806, + "ewc_loss": 0.03699235990643501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8496179109206423e-05, + "grad_norm": 26.193302154541016, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8700194358825684, + "num_tokens": 315540878.0, + "step": 8274 + }, + { + "epoch": 1.0526650553364711, + "ewc_loss": 0.03707101196050644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8535505660111085e-05, + "grad_norm": 26.166444778442383, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8761964440345764, + "num_tokens": 315578139.0, + "step": 8275 + }, + { + "epoch": 1.0527922656150617, + "ewc_loss": 0.03696051985025406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.848025931394659e-05, + "grad_norm": 26.165687561035156, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8631153702735901, + "num_tokens": 315615845.0, + "step": 8276 + }, + { + "epoch": 1.0529194758936522, + "ewc_loss": 0.037048935890197754, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.852446803241037e-05, + "grad_norm": 26.108325958251953, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8539758920669556, + "num_tokens": 315645407.0, + "step": 8277 + }, + { + "epoch": 1.0530466861722427, + "ewc_loss": 0.037012066692113876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.850603257480543e-05, + "grad_norm": 26.088451385498047, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8686416149139404, + "num_tokens": 315687922.0, + "step": 8278 + }, + { + "epoch": 1.0531738964508333, + "ewc_loss": 0.0371202789247036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8560140233603306e-05, + "grad_norm": 26.278968811035156, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8634141683578491, + "num_tokens": 315723730.0, + "step": 8279 + }, + { + "epoch": 1.0533011067294238, + "ewc_loss": 0.037125807255506516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8562903278507292e-05, + "grad_norm": 26.195209503173828, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8734641075134277, + "num_tokens": 315765662.0, + "step": 8280 + }, + { + "epoch": 1.0534283170080143, + "ewc_loss": 0.037035003304481506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8517501302994788e-05, + "grad_norm": 26.105266571044922, + "learning_rate": 1e-06, + "loss": 0.5166, + "mean_token_accuracy": 0.8370628356933594, + "num_tokens": 315808668.0, + "step": 8281 + }, + { + "epoch": 1.0535555272866048, + "ewc_loss": 0.03715313598513603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.857656752690673e-05, + "grad_norm": 26.302467346191406, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8576592206954956, + "num_tokens": 315844878.0, + "step": 8282 + }, + { + "epoch": 1.0536827375651954, + "ewc_loss": 0.037054453045129776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8527225620346144e-05, + "grad_norm": 26.071617126464844, + 
"learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8755030632019043, + "num_tokens": 315879472.0, + "step": 8283 + }, + { + "epoch": 1.0538099478437857, + "ewc_loss": 0.03702831268310547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8514156181481667e-05, + "grad_norm": 26.239885330200195, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8557624816894531, + "num_tokens": 315915496.0, + "step": 8284 + }, + { + "epoch": 1.0539371581223762, + "ewc_loss": 0.03704714775085449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8523573089623824e-05, + "grad_norm": 26.067598342895508, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8639678955078125, + "num_tokens": 315954317.0, + "step": 8285 + }, + { + "epoch": 1.0540643684009667, + "ewc_loss": 0.03702080249786377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8510401787352748e-05, + "grad_norm": 26.227323532104492, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8575544357299805, + "num_tokens": 315995518.0, + "step": 8286 + }, + { + "epoch": 1.0541915786795573, + "ewc_loss": 0.0370236411690712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8511820599087514e-05, + "grad_norm": 26.026161193847656, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8753931522369385, + "num_tokens": 316040475.0, + "step": 8287 + }, + { + "epoch": 1.0543187889581478, + "ewc_loss": 0.03699992597103119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.84999626071658e-05, + "grad_norm": 26.194520950317383, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8601678609848022, + "num_tokens": 316083003.0, + "step": 8288 + }, + { + "epoch": 1.0544459992367383, + "ewc_loss": 0.03706832975149155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.853416506492067e-05, + "grad_norm": 26.089859008789062, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8624253273010254, + "num_tokens": 316124687.0, + 
"step": 8289 + }, + { + "epoch": 1.0545732095153288, + "ewc_loss": 0.037003207951784134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8501603335607797e-05, + "grad_norm": 26.323974609375, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8497301936149597, + "num_tokens": 316170052.0, + "step": 8290 + }, + { + "epoch": 1.0547004197939194, + "ewc_loss": 0.03698479384183884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.849239743023645e-05, + "grad_norm": 26.11793327331543, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8479077816009521, + "num_tokens": 316211941.0, + "step": 8291 + }, + { + "epoch": 1.05482763007251, + "ewc_loss": 0.03694372624158859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8471862858859822e-05, + "grad_norm": 26.25967025756836, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8652439117431641, + "num_tokens": 316252592.0, + "step": 8292 + }, + { + "epoch": 1.0549548403511004, + "ewc_loss": 0.037033528089523315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.851676461228635e-05, + "grad_norm": 26.227338790893555, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.870475709438324, + "num_tokens": 316287250.0, + "step": 8293 + }, + { + "epoch": 1.055082050629691, + "ewc_loss": 0.03689243644475937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8446218746248633e-05, + "grad_norm": 26.21026039123535, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8658583164215088, + "num_tokens": 316326308.0, + "step": 8294 + }, + { + "epoch": 1.0552092609082815, + "ewc_loss": 0.03691093623638153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8455468307365663e-05, + "grad_norm": 26.112834930419922, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8666863441467285, + "num_tokens": 316364519.0, + "step": 8295 + }, + { + "epoch": 1.0553364711868718, + "ewc_loss": 0.036916494369506836, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.845824772317428e-05, + "grad_norm": 26.152935028076172, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8676794767379761, + "num_tokens": 316401165.0, + "step": 8296 + }, + { + "epoch": 1.0554636814654623, + "ewc_loss": 0.03693370521068573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8466853362042457e-05, + "grad_norm": 26.116724014282227, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8568007349967957, + "num_tokens": 316440245.0, + "step": 8297 + }, + { + "epoch": 1.0555908917440529, + "ewc_loss": 0.03699104115366936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.849552063504234e-05, + "grad_norm": 26.286209106445312, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8711163997650146, + "num_tokens": 316484969.0, + "step": 8298 + }, + { + "epoch": 1.0557181020226434, + "ewc_loss": 0.03697808459401131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.848904139478691e-05, + "grad_norm": 26.187244415283203, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8800753355026245, + "num_tokens": 316520510.0, + "step": 8299 + }, + { + "epoch": 1.055845312301234, + "ewc_loss": 0.036917492747306824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8458746126270853e-05, + "grad_norm": 26.334197998046875, + "learning_rate": 1e-06, + "loss": 0.4901, + "mean_token_accuracy": 0.8459247350692749, + "num_tokens": 316561946.0, + "step": 8300 + }, + { + "epoch": 1.0559725225798244, + "ewc_loss": 0.036967676132917404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8483837266103365e-05, + "grad_norm": 26.144712448120117, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8792788982391357, + "num_tokens": 316605489.0, + "step": 8301 + }, + { + "epoch": 1.056099732858415, + "ewc_loss": 0.03689601644873619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8448008631821722e-05, + "grad_norm": 26.313322067260742, + "learning_rate": 1e-06, + "loss": 
0.4962, + "mean_token_accuracy": 0.8455809354782104, + "num_tokens": 316648590.0, + "step": 8302 + }, + { + "epoch": 1.0562269431370055, + "ewc_loss": 0.036887843161821365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8443921362631954e-05, + "grad_norm": 26.00366973876953, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8751430511474609, + "num_tokens": 316688169.0, + "step": 8303 + }, + { + "epoch": 1.056354153415596, + "ewc_loss": 0.036864154040813446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8432076103636064e-05, + "grad_norm": 26.258010864257812, + "learning_rate": 1e-06, + "loss": 0.5172, + "mean_token_accuracy": 0.8413903117179871, + "num_tokens": 316724031.0, + "step": 8304 + }, + { + "epoch": 1.0564813636941865, + "ewc_loss": 0.03698885813355446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8494429241400212e-05, + "grad_norm": 26.043947219848633, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8622251152992249, + "num_tokens": 316769762.0, + "step": 8305 + }, + { + "epoch": 1.056608573972777, + "ewc_loss": 0.03692703694105148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8463519154465757e-05, + "grad_norm": 26.2318172454834, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8705252408981323, + "num_tokens": 316814156.0, + "step": 8306 + }, + { + "epoch": 1.0567357842513676, + "ewc_loss": 0.0369548499584198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8477425328455865e-05, + "grad_norm": 26.128379821777344, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8723809123039246, + "num_tokens": 316852784.0, + "step": 8307 + }, + { + "epoch": 1.0568629945299581, + "ewc_loss": 0.036935292184352875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8467646441422403e-05, + "grad_norm": 26.179662704467773, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8563433885574341, + "num_tokens": 316891992.0, + "step": 8308 + }, + { + "epoch": 
1.0569902048085484, + "ewc_loss": 0.03695264086127281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8476321201887913e-05, + "grad_norm": 26.214258193969727, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8771540522575378, + "num_tokens": 316932246.0, + "step": 8309 + }, + { + "epoch": 1.057117415087139, + "ewc_loss": 0.03687487542629242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8437438484397717e-05, + "grad_norm": 26.217708587646484, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8539851903915405, + "num_tokens": 316972017.0, + "step": 8310 + }, + { + "epoch": 1.0572446253657295, + "ewc_loss": 0.03695737570524216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8478687707101926e-05, + "grad_norm": 26.12028694152832, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8567308783531189, + "num_tokens": 317008631.0, + "step": 8311 + }, + { + "epoch": 1.05737183564432, + "ewc_loss": 0.03688165545463562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8440827261656523e-05, + "grad_norm": 26.197219848632812, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8682909607887268, + "num_tokens": 317048616.0, + "step": 8312 + }, + { + "epoch": 1.0574990459229106, + "ewc_loss": 0.03694907948374748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.847454041126184e-05, + "grad_norm": 26.11794090270996, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8544343709945679, + "num_tokens": 317088675.0, + "step": 8313 + }, + { + "epoch": 1.057626256201501, + "ewc_loss": 0.03689488023519516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8447439288138412e-05, + "grad_norm": 26.28786277770996, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8823837041854858, + "num_tokens": 317120351.0, + "step": 8314 + }, + { + "epoch": 1.0577534664800916, + "ewc_loss": 0.03693627566099167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.846813756856136e-05, 
+ "grad_norm": 26.145204544067383, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8563761711120605, + "num_tokens": 317159027.0, + "step": 8315 + }, + { + "epoch": 1.0578806767586821, + "ewc_loss": 0.0368974395096302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8448719856678508e-05, + "grad_norm": 26.22605323791504, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8622557520866394, + "num_tokens": 317188892.0, + "step": 8316 + }, + { + "epoch": 1.0580078870372727, + "ewc_loss": 0.03695186600089073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8475933757144958e-05, + "grad_norm": 26.2154483795166, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8676643967628479, + "num_tokens": 317229026.0, + "step": 8317 + }, + { + "epoch": 1.0581350973158632, + "ewc_loss": 0.0369027778506279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.845138831413351e-05, + "grad_norm": 26.235248565673828, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8614311814308167, + "num_tokens": 317272946.0, + "step": 8318 + }, + { + "epoch": 1.0582623075944537, + "ewc_loss": 0.036881107836961746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844055441324599e-05, + "grad_norm": 26.10661506652832, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8619372844696045, + "num_tokens": 317314625.0, + "step": 8319 + }, + { + "epoch": 1.058389517873044, + "ewc_loss": 0.036891769617795944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.844588405219838e-05, + "grad_norm": 26.265583038330078, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8719244003295898, + "num_tokens": 317351450.0, + "step": 8320 + }, + { + "epoch": 1.0585167281516346, + "ewc_loss": 0.03697698190808296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8488490240997635e-05, + "grad_norm": 26.12180519104004, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8467913866043091, 
+ "num_tokens": 317391519.0, + "step": 8321 + }, + { + "epoch": 1.058643938430225, + "ewc_loss": 0.03690670058131218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8453351003699936e-05, + "grad_norm": 26.249534606933594, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8554673790931702, + "num_tokens": 317430672.0, + "step": 8322 + }, + { + "epoch": 1.0587711487088156, + "ewc_loss": 0.036978255957365036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.848912870627828e-05, + "grad_norm": 26.10502815246582, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8631877303123474, + "num_tokens": 317470203.0, + "step": 8323 + }, + { + "epoch": 1.0588983589874061, + "ewc_loss": 0.03691369667649269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8456848920322955e-05, + "grad_norm": 26.13101577758789, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8643277883529663, + "num_tokens": 317509594.0, + "step": 8324 + }, + { + "epoch": 1.0590255692659967, + "ewc_loss": 0.03700394555926323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8501972590456717e-05, + "grad_norm": 26.149688720703125, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8600187301635742, + "num_tokens": 317549735.0, + "step": 8325 + }, + { + "epoch": 1.0591527795445872, + "ewc_loss": 0.036997441202402115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8498720237403177e-05, + "grad_norm": 26.230281829833984, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8596429824829102, + "num_tokens": 317585064.0, + "step": 8326 + }, + { + "epoch": 1.0592799898231777, + "ewc_loss": 0.0370001494884491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.850007538450882e-05, + "grad_norm": 26.095256805419922, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8773288726806641, + "num_tokens": 317622289.0, + "step": 8327 + }, + { + "epoch": 1.0594072001017683, + "ewc_loss": 0.036978885531425476, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8489443391445093e-05, + "grad_norm": 26.126529693603516, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8513031005859375, + "num_tokens": 317662872.0, + "step": 8328 + }, + { + "epoch": 1.0595344103803588, + "ewc_loss": 0.037033505737781525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.851675369834993e-05, + "grad_norm": 26.268442153930664, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8629216551780701, + "num_tokens": 317701418.0, + "step": 8329 + }, + { + "epoch": 1.0596616206589493, + "ewc_loss": 0.037068985402584076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.853449248301331e-05, + "grad_norm": 26.141223907470703, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.875798225402832, + "num_tokens": 317741416.0, + "step": 8330 + }, + { + "epoch": 1.0597888309375398, + "ewc_loss": 0.0370158776640892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8507938875700347e-05, + "grad_norm": 26.422945022583008, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.866665244102478, + "num_tokens": 317778623.0, + "step": 8331 + }, + { + "epoch": 1.0599160412161304, + "ewc_loss": 0.03704502433538437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8522512618801557e-05, + "grad_norm": 26.157129287719727, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8644107580184937, + "num_tokens": 317816577.0, + "step": 8332 + }, + { + "epoch": 1.0600432514947207, + "ewc_loss": 0.03695608302950859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8478041965863667e-05, + "grad_norm": 26.272720336914062, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8603239059448242, + "num_tokens": 317851532.0, + "step": 8333 + }, + { + "epoch": 1.0601704617733112, + "ewc_loss": 0.03707674890756607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8538374206400476e-05, + "grad_norm": 26.281036376953125, + 
"learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8628201484680176, + "num_tokens": 317886789.0, + "step": 8334 + }, + { + "epoch": 1.0602976720519017, + "ewc_loss": 0.03696427121758461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8482134692021646e-05, + "grad_norm": 26.1965389251709, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8656105399131775, + "num_tokens": 317929511.0, + "step": 8335 + }, + { + "epoch": 1.0604248823304923, + "ewc_loss": 0.03699023649096489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8495118638384156e-05, + "grad_norm": 26.168882369995117, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.860875129699707, + "num_tokens": 317963767.0, + "step": 8336 + }, + { + "epoch": 1.0605520926090828, + "ewc_loss": 0.03702007234096527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8510036170482635e-05, + "grad_norm": 26.07931137084961, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8805840015411377, + "num_tokens": 318000458.0, + "step": 8337 + }, + { + "epoch": 1.0606793028876733, + "ewc_loss": 0.03702235966920853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8511180314817466e-05, + "grad_norm": 26.22999382019043, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8800644874572754, + "num_tokens": 318040394.0, + "step": 8338 + }, + { + "epoch": 1.0608065131662638, + "ewc_loss": 0.03711726889014244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.855863411037717e-05, + "grad_norm": 26.13454246520996, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8582496643066406, + "num_tokens": 318080859.0, + "step": 8339 + }, + { + "epoch": 1.0609337234448544, + "ewc_loss": 0.03698335215449333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8491675291443244e-05, + "grad_norm": 26.184167861938477, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8796662092208862, + "num_tokens": 318115378.0, + 
"step": 8340 + }, + { + "epoch": 1.061060933723445, + "ewc_loss": 0.0371309332549572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.856546623457689e-05, + "grad_norm": 26.248178482055664, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.854689359664917, + "num_tokens": 318150351.0, + "step": 8341 + }, + { + "epoch": 1.0611881440020354, + "ewc_loss": 0.03706933557987213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.853466710599605e-05, + "grad_norm": 26.09604263305664, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8623819351196289, + "num_tokens": 318192427.0, + "step": 8342 + }, + { + "epoch": 1.061315354280626, + "ewc_loss": 0.03706736117601395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8533681213739328e-05, + "grad_norm": 26.246417999267578, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8500344753265381, + "num_tokens": 318232150.0, + "step": 8343 + }, + { + "epoch": 1.0614425645592165, + "ewc_loss": 0.037091489881277084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8545744751463644e-05, + "grad_norm": 26.02776527404785, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8585437536239624, + "num_tokens": 318267742.0, + "step": 8344 + }, + { + "epoch": 1.0615697748378068, + "ewc_loss": 0.03701486811041832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.850743319664616e-05, + "grad_norm": 26.295475006103516, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8672928810119629, + "num_tokens": 318307974.0, + "step": 8345 + }, + { + "epoch": 1.0616969851163973, + "ewc_loss": 0.03717566654086113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8587832528282888e-05, + "grad_norm": 26.142038345336914, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8445029854774475, + "num_tokens": 318349880.0, + "step": 8346 + }, + { + "epoch": 1.0618241953949878, + "ewc_loss": 0.03704516962170601, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.85225853783777e-05, + "grad_norm": 26.256690979003906, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8606191873550415, + "num_tokens": 318383869.0, + "step": 8347 + }, + { + "epoch": 1.0619514056735784, + "ewc_loss": 0.037135522812604904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8567761799204163e-05, + "grad_norm": 26.15116310119629, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8776289224624634, + "num_tokens": 318419659.0, + "step": 8348 + }, + { + "epoch": 1.062078615952169, + "ewc_loss": 0.03709776699542999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8548884327174164e-05, + "grad_norm": 26.30988121032715, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8718353509902954, + "num_tokens": 318460795.0, + "step": 8349 + }, + { + "epoch": 1.0622058262307594, + "ewc_loss": 0.03712388873100281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8561944671091624e-05, + "grad_norm": 26.332073211669922, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8526628017425537, + "num_tokens": 318499451.0, + "step": 8350 + }, + { + "epoch": 1.06233303650935, + "ewc_loss": 0.03701651096343994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8508255379856564e-05, + "grad_norm": 26.184955596923828, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.868268609046936, + "num_tokens": 318533413.0, + "step": 8351 + }, + { + "epoch": 1.0624602467879405, + "ewc_loss": 0.037041787058115005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.85208937182324e-05, + "grad_norm": 26.229822158813477, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8522322773933411, + "num_tokens": 318572533.0, + "step": 8352 + }, + { + "epoch": 1.062587457066531, + "ewc_loss": 0.03704975172877312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8524875486036763e-05, + "grad_norm": 26.049701690673828, + "learning_rate": 1e-06, + "loss": 0.4284, + 
"mean_token_accuracy": 0.865495502948761, + "num_tokens": 318609442.0, + "step": 8353 + }, + { + "epoch": 1.0627146673451215, + "ewc_loss": 0.03712139278650284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.856069684436079e-05, + "grad_norm": 26.319013595581055, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8774126172065735, + "num_tokens": 318643475.0, + "step": 8354 + }, + { + "epoch": 1.062841877623712, + "ewc_loss": 0.03720557689666748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8602788259158842e-05, + "grad_norm": 26.214839935302734, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8653690814971924, + "num_tokens": 318684223.0, + "step": 8355 + }, + { + "epoch": 1.0629690879023026, + "ewc_loss": 0.037025440484285355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.851272099884227e-05, + "grad_norm": 26.159997940063477, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8642069697380066, + "num_tokens": 318717148.0, + "step": 8356 + }, + { + "epoch": 1.0630962981808931, + "ewc_loss": 0.037117742002010345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.855887057899963e-05, + "grad_norm": 26.239179611206055, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8736456632614136, + "num_tokens": 318758929.0, + "step": 8357 + }, + { + "epoch": 1.0632235084594834, + "ewc_loss": 0.037033554166555405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8516777345212176e-05, + "grad_norm": 26.205856323242188, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8604547381401062, + "num_tokens": 318796438.0, + "step": 8358 + }, + { + "epoch": 1.063350718738074, + "ewc_loss": 0.037101004272699356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8550501408753917e-05, + "grad_norm": 26.282085418701172, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.8496899604797363, + "num_tokens": 318835732.0, + "step": 8359 + }, + { + "epoch": 
1.0634779290166645, + "ewc_loss": 0.03710412606596947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8552063920651563e-05, + "grad_norm": 26.209779739379883, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.860206127166748, + "num_tokens": 318872151.0, + "step": 8360 + }, + { + "epoch": 1.063605139295255, + "ewc_loss": 0.03708634898066521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8543174519436434e-05, + "grad_norm": 26.221397399902344, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8518391847610474, + "num_tokens": 318911956.0, + "step": 8361 + }, + { + "epoch": 1.0637323495738455, + "ewc_loss": 0.03710455074906349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8552274923422374e-05, + "grad_norm": 26.23090934753418, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.864459216594696, + "num_tokens": 318945593.0, + "step": 8362 + }, + { + "epoch": 1.063859559852436, + "ewc_loss": 0.03707634285092354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8538172298576683e-05, + "grad_norm": 26.438386917114258, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8725385665893555, + "num_tokens": 318976670.0, + "step": 8363 + }, + { + "epoch": 1.0639867701310266, + "ewc_loss": 0.03714191913604736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8570959582575597e-05, + "grad_norm": 26.194725036621094, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8659483790397644, + "num_tokens": 319007798.0, + "step": 8364 + }, + { + "epoch": 1.0641139804096171, + "ewc_loss": 0.03704887256026268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8524437109590508e-05, + "grad_norm": 26.436939239501953, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8520892858505249, + "num_tokens": 319044808.0, + "step": 8365 + }, + { + "epoch": 1.0642411906882077, + "ewc_loss": 0.03707965090870857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8539825759944506e-05, + "grad_norm": 26.185739517211914, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8825401067733765, + "num_tokens": 319083387.0, + "step": 8366 + }, + { + "epoch": 1.0643684009667982, + "ewc_loss": 0.03699663281440735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.849831642175559e-05, + "grad_norm": 26.342281341552734, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8782457113265991, + "num_tokens": 319121897.0, + "step": 8367 + }, + { + "epoch": 1.0644956112453887, + "ewc_loss": 0.037130676209926605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.856533890531864e-05, + "grad_norm": 26.13751220703125, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8516801595687866, + "num_tokens": 319157190.0, + "step": 8368 + }, + { + "epoch": 1.064622821523979, + "ewc_loss": 0.037017591297626495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8508795619709417e-05, + "grad_norm": 26.11234474182129, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8725130558013916, + "num_tokens": 319188353.0, + "step": 8369 + }, + { + "epoch": 1.0647500318025696, + "ewc_loss": 0.0371817909181118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.859089570643846e-05, + "grad_norm": 26.19672393798828, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8650970458984375, + "num_tokens": 319225536.0, + "step": 8370 + }, + { + "epoch": 1.06487724208116, + "ewc_loss": 0.03715641051530838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.857820461736992e-05, + "grad_norm": 26.183378219604492, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8692718744277954, + "num_tokens": 319265114.0, + "step": 8371 + }, + { + "epoch": 1.0650044523597506, + "ewc_loss": 0.03712071105837822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8560354874352925e-05, + "grad_norm": 26.118314743041992, + "learning_rate": 1e-06, + "loss": 0.4081, + 
"mean_token_accuracy": 0.873318076133728, + "num_tokens": 319309280.0, + "step": 8372 + }, + { + "epoch": 1.0651316626383411, + "ewc_loss": 0.03723820671439171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8619102775119245e-05, + "grad_norm": 26.346586227416992, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.874252200126648, + "num_tokens": 319346114.0, + "step": 8373 + }, + { + "epoch": 1.0652588729169317, + "ewc_loss": 0.03720678761601448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8603393982630223e-05, + "grad_norm": 26.13340187072754, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8720638751983643, + "num_tokens": 319389001.0, + "step": 8374 + }, + { + "epoch": 1.0653860831955222, + "ewc_loss": 0.03708561882376671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.854280890256632e-05, + "grad_norm": 26.070783615112305, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.870815634727478, + "num_tokens": 319431464.0, + "step": 8375 + }, + { + "epoch": 1.0655132934741127, + "ewc_loss": 0.037230003625154495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.861500095401425e-05, + "grad_norm": 26.244020462036133, + "learning_rate": 1e-06, + "loss": 0.5161, + "mean_token_accuracy": 0.8384417295455933, + "num_tokens": 319475638.0, + "step": 8376 + }, + { + "epoch": 1.0656405037527032, + "ewc_loss": 0.037194594740867615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8597296730149537e-05, + "grad_norm": 26.147241592407227, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8736847043037415, + "num_tokens": 319515983.0, + "step": 8377 + }, + { + "epoch": 1.0657677140312938, + "ewc_loss": 0.03713931515812874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8569657186162658e-05, + "grad_norm": 26.259990692138672, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8591848611831665, + "num_tokens": 319552152.0, + "step": 8378 + }, + { + "epoch": 
1.0658949243098843, + "ewc_loss": 0.03719493746757507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8597469534142874e-05, + "grad_norm": 26.27783203125, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8555413484573364, + "num_tokens": 319592963.0, + "step": 8379 + }, + { + "epoch": 1.0660221345884748, + "ewc_loss": 0.0371110774576664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8555538190412335e-05, + "grad_norm": 26.226646423339844, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8498609066009521, + "num_tokens": 319635136.0, + "step": 8380 + }, + { + "epoch": 1.0661493448670654, + "ewc_loss": 0.037118036299943924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.855901791714132e-05, + "grad_norm": 26.312732696533203, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8662635684013367, + "num_tokens": 319670118.0, + "step": 8381 + }, + { + "epoch": 1.0662765551456557, + "ewc_loss": 0.03709280863404274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.854640504461713e-05, + "grad_norm": 26.17795753479004, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8690698742866516, + "num_tokens": 319711057.0, + "step": 8382 + }, + { + "epoch": 1.0664037654242462, + "ewc_loss": 0.03703044727444649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8515223928261548e-05, + "grad_norm": 26.23493766784668, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.861617922782898, + "num_tokens": 319746451.0, + "step": 8383 + }, + { + "epoch": 1.0665309757028367, + "ewc_loss": 0.0372018925845623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.860094562289305e-05, + "grad_norm": 26.2664852142334, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8565611839294434, + "num_tokens": 319792149.0, + "step": 8384 + }, + { + "epoch": 1.0666581859814273, + "ewc_loss": 0.03702791407704353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.851395791163668e-05, + 
"grad_norm": 26.169742584228516, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8556110858917236, + "num_tokens": 319826508.0, + "step": 8385 + }, + { + "epoch": 1.0667853962600178, + "ewc_loss": 0.03712288662791252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8561442630016245e-05, + "grad_norm": 26.205209732055664, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8652608394622803, + "num_tokens": 319867000.0, + "step": 8386 + }, + { + "epoch": 1.0669126065386083, + "ewc_loss": 0.0370464064180851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8523203834774904e-05, + "grad_norm": 26.185407638549805, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8450537919998169, + "num_tokens": 319904476.0, + "step": 8387 + }, + { + "epoch": 1.0670398168171988, + "ewc_loss": 0.03711148723959923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8555743736214936e-05, + "grad_norm": 26.251083374023438, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8383913636207581, + "num_tokens": 319943279.0, + "step": 8388 + }, + { + "epoch": 1.0671670270957894, + "ewc_loss": 0.03719428926706314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.859714393503964e-05, + "grad_norm": 26.32558822631836, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8585140705108643, + "num_tokens": 319986610.0, + "step": 8389 + }, + { + "epoch": 1.06729423737438, + "ewc_loss": 0.03707664832472801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8538323274697177e-05, + "grad_norm": 26.201126098632812, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8559818863868713, + "num_tokens": 320022702.0, + "step": 8390 + }, + { + "epoch": 1.0674214476529704, + "ewc_loss": 0.037118539214134216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8559268937679008e-05, + "grad_norm": 26.260854721069336, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 
0.8642747402191162, + "num_tokens": 320067918.0, + "step": 8391 + }, + { + "epoch": 1.067548657931561, + "ewc_loss": 0.03713283687829971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8566417566034943e-05, + "grad_norm": 26.155780792236328, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8546091318130493, + "num_tokens": 320110455.0, + "step": 8392 + }, + { + "epoch": 1.0676758682101515, + "ewc_loss": 0.03713759779930115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8568798623164184e-05, + "grad_norm": 26.279268264770508, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8805306553840637, + "num_tokens": 320147675.0, + "step": 8393 + }, + { + "epoch": 1.0678030784887418, + "ewc_loss": 0.03719007968902588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8595039364299737e-05, + "grad_norm": 26.219202041625977, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8495290279388428, + "num_tokens": 320183882.0, + "step": 8394 + }, + { + "epoch": 1.0679302887673323, + "ewc_loss": 0.037099216133356094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8549608284956776e-05, + "grad_norm": 26.21052360534668, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8520452380180359, + "num_tokens": 320222215.0, + "step": 8395 + }, + { + "epoch": 1.0680574990459228, + "ewc_loss": 0.037191640585660934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8595819710753858e-05, + "grad_norm": 26.23814582824707, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8556025624275208, + "num_tokens": 320262418.0, + "step": 8396 + }, + { + "epoch": 1.0681847093245134, + "ewc_loss": 0.037136510014534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8568254745332524e-05, + "grad_norm": 26.23308753967285, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8567389249801636, + "num_tokens": 320297304.0, + "step": 8397 + }, + { + "epoch": 1.068311919603104, + "ewc_loss": 
0.03722331300377846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.861165583250113e-05, + "grad_norm": 26.35061264038086, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8646315932273865, + "num_tokens": 320330919.0, + "step": 8398 + }, + { + "epoch": 1.0684391298816944, + "ewc_loss": 0.03719545155763626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8597726011648774e-05, + "grad_norm": 26.155744552612305, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8632586002349854, + "num_tokens": 320367602.0, + "step": 8399 + }, + { + "epoch": 1.068566340160285, + "ewc_loss": 0.037129778414964676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8564889614935964e-05, + "grad_norm": 26.317777633666992, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8636089563369751, + "num_tokens": 320403875.0, + "step": 8400 + }, + { + "epoch": 1.0686935504388755, + "ewc_loss": 0.03720422089099884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.860210977611132e-05, + "grad_norm": 26.199174880981445, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.865927517414093, + "num_tokens": 320439149.0, + "step": 8401 + }, + { + "epoch": 1.068820760717466, + "ewc_loss": 0.03711995482444763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8559976524556987e-05, + "grad_norm": 26.22501564025879, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8742886781692505, + "num_tokens": 320476786.0, + "step": 8402 + }, + { + "epoch": 1.0689479709960565, + "ewc_loss": 0.03714999929070473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8574999558040872e-05, + "grad_norm": 26.226825714111328, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.8487945795059204, + "num_tokens": 320516945.0, + "step": 8403 + }, + { + "epoch": 1.069075181274647, + "ewc_loss": 0.0371554009616375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8577700757305138e-05, + "grad_norm": 
26.206029891967773, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8767004013061523, + "num_tokens": 320552801.0, + "step": 8404 + }, + { + "epoch": 1.0692023915532376, + "ewc_loss": 0.03716328740119934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8581644326332025e-05, + "grad_norm": 26.226680755615234, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8654758334159851, + "num_tokens": 320593605.0, + "step": 8405 + }, + { + "epoch": 1.0693296018318281, + "ewc_loss": 0.0372718870639801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8635942979017273e-05, + "grad_norm": 26.344676971435547, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8701948523521423, + "num_tokens": 320624476.0, + "step": 8406 + }, + { + "epoch": 1.0694568121104184, + "ewc_loss": 0.0372471921145916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8623595678946003e-05, + "grad_norm": 26.27100944519043, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8695403337478638, + "num_tokens": 320667000.0, + "step": 8407 + }, + { + "epoch": 1.069584022389009, + "ewc_loss": 0.03715499863028526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8577498849481344e-05, + "grad_norm": 26.306142807006836, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8678615093231201, + "num_tokens": 320699286.0, + "step": 8408 + }, + { + "epoch": 1.0697112326675995, + "ewc_loss": 0.03722003474831581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8610016923048533e-05, + "grad_norm": 26.419069290161133, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8687267303466797, + "num_tokens": 320732520.0, + "step": 8409 + }, + { + "epoch": 1.06983844294619, + "ewc_loss": 0.037153054028749466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.857652750913985e-05, + "grad_norm": 26.314868927001953, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8515561819076538, + 
"num_tokens": 320773425.0, + "step": 8410 + }, + { + "epoch": 1.0699656532247805, + "ewc_loss": 0.0371273048222065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8563652702141553e-05, + "grad_norm": 26.221572875976562, + "learning_rate": 1e-06, + "loss": 0.5235, + "mean_token_accuracy": 0.8406702876091003, + "num_tokens": 320810859.0, + "step": 8411 + }, + { + "epoch": 1.070092863503371, + "ewc_loss": 0.03717638552188873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.858819268818479e-05, + "grad_norm": 26.452083587646484, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.865600049495697, + "num_tokens": 320850266.0, + "step": 8412 + }, + { + "epoch": 1.0702200737819616, + "ewc_loss": 0.037245314568281174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8622657080413774e-05, + "grad_norm": 26.281219482421875, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8615468740463257, + "num_tokens": 320892812.0, + "step": 8413 + }, + { + "epoch": 1.0703472840605521, + "ewc_loss": 0.03703492879867554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8517464923206717e-05, + "grad_norm": 26.423986434936523, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8745625019073486, + "num_tokens": 320929563.0, + "step": 8414 + }, + { + "epoch": 1.0704744943391427, + "ewc_loss": 0.03718672692775726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.859336407505907e-05, + "grad_norm": 26.216476440429688, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8708359003067017, + "num_tokens": 320964010.0, + "step": 8415 + }, + { + "epoch": 1.0706017046177332, + "ewc_loss": 0.03704101964831352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8520509911468253e-05, + "grad_norm": 26.23984146118164, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8599077463150024, + "num_tokens": 321006405.0, + "step": 8416 + }, + { + "epoch": 1.0707289148963237, + "ewc_loss": 0.03717058151960373, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8585291400086135e-05, + "grad_norm": 26.307958602905273, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8575955033302307, + "num_tokens": 321050146.0, + "step": 8417 + }, + { + "epoch": 1.070856125174914, + "ewc_loss": 0.037150561809539795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8575281501398422e-05, + "grad_norm": 26.24369239807129, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.862441897392273, + "num_tokens": 321086857.0, + "step": 8418 + }, + { + "epoch": 1.0709833354535045, + "ewc_loss": 0.03711416572332382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8557082512415946e-05, + "grad_norm": 26.18588638305664, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8683390617370605, + "num_tokens": 321127310.0, + "step": 8419 + }, + { + "epoch": 1.071110545732095, + "ewc_loss": 0.03712807968258858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.856404014688451e-05, + "grad_norm": 26.31046485900879, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8529261946678162, + "num_tokens": 321165881.0, + "step": 8420 + }, + { + "epoch": 1.0712377560106856, + "ewc_loss": 0.037161290645599365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.858064570114948e-05, + "grad_norm": 26.070051193237305, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8569952249526978, + "num_tokens": 321204494.0, + "step": 8421 + }, + { + "epoch": 1.0713649662892761, + "ewc_loss": 0.037102263420820236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8551132598076947e-05, + "grad_norm": 26.301164627075195, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8518704771995544, + "num_tokens": 321243639.0, + "step": 8422 + }, + { + "epoch": 1.0714921765678667, + "ewc_loss": 0.03727983683347702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8639919289853424e-05, + "grad_norm": 26.215938568115234, + "learning_rate": 
1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8542838096618652, + "num_tokens": 321281399.0, + "step": 8423 + }, + { + "epoch": 1.0716193868464572, + "ewc_loss": 0.037105411291122437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8552706023911014e-05, + "grad_norm": 26.181398391723633, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8493456840515137, + "num_tokens": 321316539.0, + "step": 8424 + }, + { + "epoch": 1.0717465971250477, + "ewc_loss": 0.037203170359134674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.86015859071631e-05, + "grad_norm": 26.17034912109375, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8581502437591553, + "num_tokens": 321348618.0, + "step": 8425 + }, + { + "epoch": 1.0718738074036382, + "ewc_loss": 0.037225041538476944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8612519852467813e-05, + "grad_norm": 26.266355514526367, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8676612973213196, + "num_tokens": 321381674.0, + "step": 8426 + }, + { + "epoch": 1.0720010176822288, + "ewc_loss": 0.037286173552274704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8643086150404997e-05, + "grad_norm": 26.223609924316406, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8773691654205322, + "num_tokens": 321420342.0, + "step": 8427 + }, + { + "epoch": 1.0721282279608193, + "ewc_loss": 0.03721906617283821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.860953307186719e-05, + "grad_norm": 26.24220848083496, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8592811822891235, + "num_tokens": 321450770.0, + "step": 8428 + }, + { + "epoch": 1.0722554382394098, + "ewc_loss": 0.0372331365942955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8616568922880106e-05, + "grad_norm": 26.25354766845703, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.865868330001831, + "num_tokens": 321491450.0, + "step": 8429 + }, 
+ { + "epoch": 1.0723826485180004, + "ewc_loss": 0.03720796853303909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8603985154186375e-05, + "grad_norm": 26.123329162597656, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8567860126495361, + "num_tokens": 321531774.0, + "step": 8430 + }, + { + "epoch": 1.0725098587965907, + "ewc_loss": 0.03726086765527725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8630433260113932e-05, + "grad_norm": 26.280986785888672, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8625425696372986, + "num_tokens": 321567692.0, + "step": 8431 + }, + { + "epoch": 1.0726370690751812, + "ewc_loss": 0.03736782446503639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.868391154857818e-05, + "grad_norm": 26.316442489624023, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.857528567314148, + "num_tokens": 321601168.0, + "step": 8432 + }, + { + "epoch": 1.0727642793537717, + "ewc_loss": 0.03727712109684944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8638560504768975e-05, + "grad_norm": 26.254749298095703, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8455126285552979, + "num_tokens": 321636100.0, + "step": 8433 + }, + { + "epoch": 1.0728914896323622, + "ewc_loss": 0.0372631773352623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8631588318385184e-05, + "grad_norm": 26.231918334960938, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8637649416923523, + "num_tokens": 321674365.0, + "step": 8434 + }, + { + "epoch": 1.0730186999109528, + "ewc_loss": 0.037267353385686874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8633676518220454e-05, + "grad_norm": 26.319293975830078, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8730904459953308, + "num_tokens": 321712267.0, + "step": 8435 + }, + { + "epoch": 1.0731459101895433, + "ewc_loss": 0.037287235260009766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8643617295310833e-05, + "grad_norm": 26.199676513671875, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8603370189666748, + "num_tokens": 321749955.0, + "step": 8436 + }, + { + "epoch": 1.0732731204681338, + "ewc_loss": 0.037259504199028015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8629752958077006e-05, + "grad_norm": 26.206649780273438, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.870247483253479, + "num_tokens": 321788522.0, + "step": 8437 + }, + { + "epoch": 1.0734003307467244, + "ewc_loss": 0.03732503205537796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8662516595213674e-05, + "grad_norm": 26.422527313232422, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8603314161300659, + "num_tokens": 321826314.0, + "step": 8438 + }, + { + "epoch": 1.073527541025315, + "ewc_loss": 0.03732381761074066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866190905275289e-05, + "grad_norm": 26.22745704650879, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8592776656150818, + "num_tokens": 321864851.0, + "step": 8439 + }, + { + "epoch": 1.0736547513039054, + "ewc_loss": 0.03725217282772064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8626085875439458e-05, + "grad_norm": 26.284439086914062, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.87203049659729, + "num_tokens": 321901001.0, + "step": 8440 + }, + { + "epoch": 1.073781961582496, + "ewc_loss": 0.037387702614068985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8693850506679155e-05, + "grad_norm": 26.31735610961914, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8609044551849365, + "num_tokens": 321941368.0, + "step": 8441 + }, + { + "epoch": 1.0739091718610865, + "ewc_loss": 0.03726882115006447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8634411389939487e-05, + "grad_norm": 26.208927154541016, + "learning_rate": 1e-06, + "loss": 0.4261, + 
"mean_token_accuracy": 0.8661591410636902, + "num_tokens": 321978268.0, + "step": 8442 + }, + { + "epoch": 1.0740363821396768, + "ewc_loss": 0.03735962510108948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867981336545199e-05, + "grad_norm": 26.316125869750977, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8695040941238403, + "num_tokens": 322016969.0, + "step": 8443 + }, + { + "epoch": 1.0741635924182673, + "ewc_loss": 0.037290751934051514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.864537625806406e-05, + "grad_norm": 26.239013671875, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8552474975585938, + "num_tokens": 322059362.0, + "step": 8444 + }, + { + "epoch": 1.0742908026968578, + "ewc_loss": 0.03723942115902901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.861971031758003e-05, + "grad_norm": 26.223886489868164, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8433368802070618, + "num_tokens": 322097334.0, + "step": 8445 + }, + { + "epoch": 1.0744180129754484, + "ewc_loss": 0.037289489060640335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.864474506874103e-05, + "grad_norm": 26.37350082397461, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8830000162124634, + "num_tokens": 322121988.0, + "step": 8446 + }, + { + "epoch": 1.074545223254039, + "ewc_loss": 0.03728458657860756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.864229307102505e-05, + "grad_norm": 26.29082489013672, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8575767278671265, + "num_tokens": 322156602.0, + "step": 8447 + }, + { + "epoch": 1.0746724335326294, + "ewc_loss": 0.03722122684121132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8610613551572897e-05, + "grad_norm": 26.162229537963867, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.859871506690979, + "num_tokens": 322194475.0, + "step": 8448 + }, + { + "epoch": 1.07479964381122, 
+ "ewc_loss": 0.037282492965459824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8641247152118012e-05, + "grad_norm": 26.363555908203125, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8703500032424927, + "num_tokens": 322232158.0, + "step": 8449 + }, + { + "epoch": 1.0749268540898105, + "ewc_loss": 0.037261057645082474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.863052966655232e-05, + "grad_norm": 26.17084312438965, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8704326152801514, + "num_tokens": 322272170.0, + "step": 8450 + }, + { + "epoch": 1.075054064368401, + "ewc_loss": 0.03722813352942467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8614065993460827e-05, + "grad_norm": 26.213645935058594, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8673769235610962, + "num_tokens": 322305496.0, + "step": 8451 + }, + { + "epoch": 1.0751812746469915, + "ewc_loss": 0.03730952367186546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8654762243386358e-05, + "grad_norm": 26.376239776611328, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8673266768455505, + "num_tokens": 322347552.0, + "step": 8452 + }, + { + "epoch": 1.075308484925582, + "ewc_loss": 0.03724151477217674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8620758055476472e-05, + "grad_norm": 26.15944480895996, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8647900223731995, + "num_tokens": 322386471.0, + "step": 8453 + }, + { + "epoch": 1.0754356952041726, + "ewc_loss": 0.03723800927400589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8619004549691454e-05, + "grad_norm": 26.31980323791504, + "learning_rate": 1e-06, + "loss": 0.5454, + "mean_token_accuracy": 0.8273898363113403, + "num_tokens": 322431019.0, + "step": 8454 + }, + { + "epoch": 1.0755629054827631, + "ewc_loss": 0.037271223962306976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8635611922945827e-05, + 
"grad_norm": 26.306467056274414, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8812555074691772, + "num_tokens": 322471525.0, + "step": 8455 + }, + { + "epoch": 1.0756901157613534, + "ewc_loss": 0.037208013236522675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8604006982059218e-05, + "grad_norm": 26.31047821044922, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8656651973724365, + "num_tokens": 322516594.0, + "step": 8456 + }, + { + "epoch": 1.075817326039944, + "ewc_loss": 0.03721419721841812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8607099264045246e-05, + "grad_norm": 26.21512222290039, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8698175549507141, + "num_tokens": 322557837.0, + "step": 8457 + }, + { + "epoch": 1.0759445363185345, + "ewc_loss": 0.03719990327954292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8599952454678714e-05, + "grad_norm": 26.366497039794922, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8574791550636292, + "num_tokens": 322594165.0, + "step": 8458 + }, + { + "epoch": 1.076071746597125, + "ewc_loss": 0.037210430949926376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8605214791023172e-05, + "grad_norm": 26.202213287353516, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.845697820186615, + "num_tokens": 322637083.0, + "step": 8459 + }, + { + "epoch": 1.0761989568757155, + "ewc_loss": 0.037176407873630524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.858820360212121e-05, + "grad_norm": 26.358741760253906, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8621495962142944, + "num_tokens": 322673216.0, + "step": 8460 + }, + { + "epoch": 1.076326167154306, + "ewc_loss": 0.037282608449459076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8641303540789522e-05, + "grad_norm": 26.327363967895508, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 
0.8648143410682678, + "num_tokens": 322709150.0, + "step": 8461 + }, + { + "epoch": 1.0764533774328966, + "ewc_loss": 0.03711619973182678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8558099327492528e-05, + "grad_norm": 26.30843162536621, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8678879737854004, + "num_tokens": 322747893.0, + "step": 8462 + }, + { + "epoch": 1.0765805877114871, + "ewc_loss": 0.037160005420446396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8580003597890027e-05, + "grad_norm": 26.279693603515625, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8456743955612183, + "num_tokens": 322781667.0, + "step": 8463 + }, + { + "epoch": 1.0767077979900777, + "ewc_loss": 0.03712129592895508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8560647731646895e-05, + "grad_norm": 26.287351608276367, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8622190952301025, + "num_tokens": 322818008.0, + "step": 8464 + }, + { + "epoch": 1.0768350082686682, + "ewc_loss": 0.03710825368762016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8554126654635184e-05, + "grad_norm": 26.162513732910156, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.842807412147522, + "num_tokens": 322856528.0, + "step": 8465 + }, + { + "epoch": 1.0769622185472587, + "ewc_loss": 0.03712937608361244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.856468770711217e-05, + "grad_norm": 26.220014572143555, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8663901090621948, + "num_tokens": 322888798.0, + "step": 8466 + }, + { + "epoch": 1.077089428825849, + "ewc_loss": 0.037256669253110886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.862833414634224e-05, + "grad_norm": 26.226741790771484, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8651148676872253, + "num_tokens": 322923301.0, + "step": 8467 + }, + { + "epoch": 1.0772166391044395, + "ewc_loss": 
0.037159428000450134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8579714378574863e-05, + "grad_norm": 26.259342193603516, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8748436570167542, + "num_tokens": 322964253.0, + "step": 8468 + }, + { + "epoch": 1.07734384938303, + "ewc_loss": 0.03725432977080345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.862716453615576e-05, + "grad_norm": 26.353580474853516, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8611294031143188, + "num_tokens": 323006926.0, + "step": 8469 + }, + { + "epoch": 1.0774710596616206, + "ewc_loss": 0.0371643602848053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.858218092820607e-05, + "grad_norm": 26.202011108398438, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8752425909042358, + "num_tokens": 323047541.0, + "step": 8470 + }, + { + "epoch": 1.0775982699402111, + "ewc_loss": 0.03725966066122055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.862983117462136e-05, + "grad_norm": 26.29224395751953, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8555604219436646, + "num_tokens": 323084272.0, + "step": 8471 + }, + { + "epoch": 1.0777254802188017, + "ewc_loss": 0.03730437904596329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8652190192369744e-05, + "grad_norm": 26.30195426940918, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8744556903839111, + "num_tokens": 323116698.0, + "step": 8472 + }, + { + "epoch": 1.0778526904973922, + "ewc_loss": 0.03724765405058861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8623826690600254e-05, + "grad_norm": 26.227489471435547, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8707166910171509, + "num_tokens": 323158836.0, + "step": 8473 + }, + { + "epoch": 1.0779799007759827, + "ewc_loss": 0.03730597347021103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8652986909728497e-05, + "grad_norm": 
26.30379867553711, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8581680059432983, + "num_tokens": 323193777.0, + "step": 8474 + }, + { + "epoch": 1.0781071110545732, + "ewc_loss": 0.03724795952439308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.862397948571015e-05, + "grad_norm": 26.267213821411133, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.862564742565155, + "num_tokens": 323232546.0, + "step": 8475 + }, + { + "epoch": 1.0782343213331638, + "ewc_loss": 0.037273671478033066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8636836102814414e-05, + "grad_norm": 26.235260009765625, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8619533777236938, + "num_tokens": 323269403.0, + "step": 8476 + }, + { + "epoch": 1.0783615316117543, + "ewc_loss": 0.03727060183882713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.863530087575782e-05, + "grad_norm": 26.383296966552734, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8754842281341553, + "num_tokens": 323305982.0, + "step": 8477 + }, + { + "epoch": 1.0784887418903448, + "ewc_loss": 0.0372219979763031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8610999177326448e-05, + "grad_norm": 26.247800827026367, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8570703268051147, + "num_tokens": 323341032.0, + "step": 8478 + }, + { + "epoch": 1.0786159521689354, + "ewc_loss": 0.03730369731783867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8651848222361878e-05, + "grad_norm": 26.39678955078125, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8569132089614868, + "num_tokens": 323381647.0, + "step": 8479 + }, + { + "epoch": 1.0787431624475257, + "ewc_loss": 0.037331823259592056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866591082944069e-05, + "grad_norm": 26.363527297973633, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8671224117279053, + 
"num_tokens": 323417185.0, + "step": 8480 + }, + { + "epoch": 1.0788703727261162, + "ewc_loss": 0.037292610853910446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8646305761649273e-05, + "grad_norm": 26.398868560791016, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8622375726699829, + "num_tokens": 323454624.0, + "step": 8481 + }, + { + "epoch": 1.0789975830047067, + "ewc_loss": 0.03726186603307724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8630933482199907e-05, + "grad_norm": 26.22136688232422, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8629195690155029, + "num_tokens": 323492862.0, + "step": 8482 + }, + { + "epoch": 1.0791247932832972, + "ewc_loss": 0.03728652372956276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.864326259237714e-05, + "grad_norm": 26.29511260986328, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8594570159912109, + "num_tokens": 323536012.0, + "step": 8483 + }, + { + "epoch": 1.0792520035618878, + "ewc_loss": 0.03734587877988815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8672939404495992e-05, + "grad_norm": 26.260774612426758, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8672218918800354, + "num_tokens": 323573495.0, + "step": 8484 + }, + { + "epoch": 1.0793792138404783, + "ewc_loss": 0.037267498672008514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8633749277796596e-05, + "grad_norm": 26.427757263183594, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8731666207313538, + "num_tokens": 323610900.0, + "step": 8485 + }, + { + "epoch": 1.0795064241190688, + "ewc_loss": 0.03730836510658264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8654181985766627e-05, + "grad_norm": 26.281145095825195, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.860978364944458, + "num_tokens": 323650983.0, + "step": 8486 + }, + { + "epoch": 1.0796336343976594, + "ewc_loss": 0.037215568125247955, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.860778320406098e-05, + "grad_norm": 26.34518814086914, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8552116751670837, + "num_tokens": 323690155.0, + "step": 8487 + }, + { + "epoch": 1.0797608446762499, + "ewc_loss": 0.03737225756049156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8686128896661103e-05, + "grad_norm": 26.32868194580078, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8760865926742554, + "num_tokens": 323732212.0, + "step": 8488 + }, + { + "epoch": 1.0798880549548404, + "ewc_loss": 0.03731518238782883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8657590771908872e-05, + "grad_norm": 26.39267921447754, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8717708587646484, + "num_tokens": 323773400.0, + "step": 8489 + }, + { + "epoch": 1.080015265233431, + "ewc_loss": 0.037259023636579514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8629511032486334e-05, + "grad_norm": 26.304914474487305, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8707190752029419, + "num_tokens": 323812753.0, + "step": 8490 + }, + { + "epoch": 1.0801424755120215, + "ewc_loss": 0.037243954837322235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8621976778376848e-05, + "grad_norm": 26.442462921142578, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8649168014526367, + "num_tokens": 323855835.0, + "step": 8491 + }, + { + "epoch": 1.0802696857906118, + "ewc_loss": 0.037330109626054764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866505408543162e-05, + "grad_norm": 26.341604232788086, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8760439157485962, + "num_tokens": 323896065.0, + "step": 8492 + }, + { + "epoch": 1.0803968960692023, + "ewc_loss": 0.03714354708790779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8571772670838982e-05, + "grad_norm": 26.377840042114258, + 
"learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8660739660263062, + "num_tokens": 323934351.0, + "step": 8493 + }, + { + "epoch": 1.0805241063477928, + "ewc_loss": 0.03729991242289543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.864995647338219e-05, + "grad_norm": 26.33907699584961, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8528680801391602, + "num_tokens": 323968878.0, + "step": 8494 + }, + { + "epoch": 1.0806513166263834, + "ewc_loss": 0.03720200061798096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8601000192575157e-05, + "grad_norm": 26.389745712280273, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.862078070640564, + "num_tokens": 324004838.0, + "step": 8495 + }, + { + "epoch": 1.080778526904974, + "ewc_loss": 0.03720149025321007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.860074553405866e-05, + "grad_norm": 26.316925048828125, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8782253265380859, + "num_tokens": 324040851.0, + "step": 8496 + }, + { + "epoch": 1.0809057371835644, + "ewc_loss": 0.03717340901494026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.858670475485269e-05, + "grad_norm": 26.45100975036621, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8511937856674194, + "num_tokens": 324084675.0, + "step": 8497 + }, + { + "epoch": 1.081032947462155, + "ewc_loss": 0.037200380116701126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.860019074229058e-05, + "grad_norm": 26.27560043334961, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8686295747756958, + "num_tokens": 324122136.0, + "step": 8498 + }, + { + "epoch": 1.0811601577407455, + "ewc_loss": 0.0371553972363472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8577698938315734e-05, + "grad_norm": 26.474136352539062, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.86260586977005, + "num_tokens": 324153142.0, + "step": 
8499 + }, + { + "epoch": 1.081287368019336, + "ewc_loss": 0.03720913454890251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.860456723079551e-05, + "grad_norm": 26.203096389770508, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8441826105117798, + "num_tokens": 324187244.0, + "step": 8500 + }, + { + "epoch": 1.0814145782979265, + "ewc_loss": 0.03713267669081688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8566337530501187e-05, + "grad_norm": 26.43301010131836, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8658756017684937, + "num_tokens": 324225644.0, + "step": 8501 + }, + { + "epoch": 1.081541788576517, + "ewc_loss": 0.03729129210114479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8645645468495786e-05, + "grad_norm": 26.319589614868164, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8706097602844238, + "num_tokens": 324270394.0, + "step": 8502 + }, + { + "epoch": 1.0816689988551076, + "ewc_loss": 0.03720397874712944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8601989722810686e-05, + "grad_norm": 26.2615909576416, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8743235468864441, + "num_tokens": 324311244.0, + "step": 8503 + }, + { + "epoch": 1.0817962091336981, + "ewc_loss": 0.037321824580430984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8660912246559747e-05, + "grad_norm": 26.353933334350586, + "learning_rate": 1e-06, + "loss": 0.4883, + "mean_token_accuracy": 0.844921350479126, + "num_tokens": 324350821.0, + "step": 8504 + }, + { + "epoch": 1.0819234194122884, + "ewc_loss": 0.03722454607486725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.861227246990893e-05, + "grad_norm": 26.176727294921875, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8631619215011597, + "num_tokens": 324388255.0, + "step": 8505 + }, + { + "epoch": 1.082050629690879, + "ewc_loss": 0.037280336022377014, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.864016849140171e-05, + "grad_norm": 26.35553550720215, + "learning_rate": 1e-06, + "loss": 0.5078, + "mean_token_accuracy": 0.8390541672706604, + "num_tokens": 324430068.0, + "step": 8506 + }, + { + "epoch": 1.0821778399694695, + "ewc_loss": 0.03732764348387718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866382262960542e-05, + "grad_norm": 26.20867347717285, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8693122267723083, + "num_tokens": 324465341.0, + "step": 8507 + }, + { + "epoch": 1.08230505024806, + "ewc_loss": 0.03730500116944313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.865250123955775e-05, + "grad_norm": 26.361957550048828, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8716579079627991, + "num_tokens": 324502199.0, + "step": 8508 + }, + { + "epoch": 1.0824322605266505, + "ewc_loss": 0.03735324367880821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867662103904877e-05, + "grad_norm": 26.230926513671875, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8657772541046143, + "num_tokens": 324535898.0, + "step": 8509 + }, + { + "epoch": 1.082559470805241, + "ewc_loss": 0.03737073019146919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8685364921111614e-05, + "grad_norm": 26.375383377075195, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8542490005493164, + "num_tokens": 324574404.0, + "step": 8510 + }, + { + "epoch": 1.0826866810838316, + "ewc_loss": 0.037428658455610275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8714328689384274e-05, + "grad_norm": 26.292583465576172, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8625631332397461, + "num_tokens": 324611743.0, + "step": 8511 + }, + { + "epoch": 1.0828138913624221, + "ewc_loss": 0.03729560971260071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8647804608917795e-05, + "grad_norm": 26.3375244140625, + "learning_rate": 1e-06, + "loss": 0.4541, + 
"mean_token_accuracy": 0.8538410663604736, + "num_tokens": 324653126.0, + "step": 8512 + }, + { + "epoch": 1.0829411016410126, + "ewc_loss": 0.03741392865777016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8706963601289317e-05, + "grad_norm": 26.356801986694336, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8699116110801697, + "num_tokens": 324692957.0, + "step": 8513 + }, + { + "epoch": 1.0830683119196032, + "ewc_loss": 0.03737374767661095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8686874682316557e-05, + "grad_norm": 26.35593605041504, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8586360216140747, + "num_tokens": 324737214.0, + "step": 8514 + }, + { + "epoch": 1.0831955221981937, + "ewc_loss": 0.037334222346544266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866711136244703e-05, + "grad_norm": 26.390913009643555, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8741797208786011, + "num_tokens": 324773098.0, + "step": 8515 + }, + { + "epoch": 1.083322732476784, + "ewc_loss": 0.03736843168735504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8684215319808573e-05, + "grad_norm": 26.2929744720459, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8527465462684631, + "num_tokens": 324815846.0, + "step": 8516 + }, + { + "epoch": 1.0834499427553745, + "ewc_loss": 0.03726135194301605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8630675185704604e-05, + "grad_norm": 26.334280014038086, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8597368597984314, + "num_tokens": 324855912.0, + "step": 8517 + }, + { + "epoch": 1.083577153033965, + "ewc_loss": 0.037331145256757736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866557249741163e-05, + "grad_norm": 26.35285186767578, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8561514616012573, + "num_tokens": 324893090.0, + "step": 8518 + }, + { + "epoch": 
1.0837043633125556, + "ewc_loss": 0.03729204088449478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8646020180312917e-05, + "grad_norm": 26.27376365661621, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.862214982509613, + "num_tokens": 324929779.0, + "step": 8519 + }, + { + "epoch": 1.0838315735911461, + "ewc_loss": 0.037348728626966476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8674363673198968e-05, + "grad_norm": 26.40582275390625, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8681919574737549, + "num_tokens": 324961349.0, + "step": 8520 + }, + { + "epoch": 1.0839587838697367, + "ewc_loss": 0.03734203055500984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867101491370704e-05, + "grad_norm": 26.262813568115234, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8618330955505371, + "num_tokens": 324998688.0, + "step": 8521 + }, + { + "epoch": 1.0840859941483272, + "ewc_loss": 0.03731849789619446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8659249690244906e-05, + "grad_norm": 26.361988067626953, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8552660942077637, + "num_tokens": 325040601.0, + "step": 8522 + }, + { + "epoch": 1.0842132044269177, + "ewc_loss": 0.03735263645648956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867631908680778e-05, + "grad_norm": 26.261213302612305, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.867439866065979, + "num_tokens": 325081729.0, + "step": 8523 + }, + { + "epoch": 1.0843404147055082, + "ewc_loss": 0.03727804869413376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.863902434706688e-05, + "grad_norm": 26.27859878540039, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8524760007858276, + "num_tokens": 325118886.0, + "step": 8524 + }, + { + "epoch": 1.0844676249840988, + "ewc_loss": 0.037346210330724716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8673104932531714e-05, + "grad_norm": 26.3072509765625, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8813323378562927, + "num_tokens": 325156417.0, + "step": 8525 + }, + { + "epoch": 1.0845948352626893, + "ewc_loss": 0.037325747311115265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866287311713677e-05, + "grad_norm": 26.258649826049805, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8762319087982178, + "num_tokens": 325200902.0, + "step": 8526 + }, + { + "epoch": 1.0847220455412798, + "ewc_loss": 0.0373648926615715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8682445443118922e-05, + "grad_norm": 26.29001808166504, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8645272254943848, + "num_tokens": 325233362.0, + "step": 8527 + }, + { + "epoch": 1.0848492558198704, + "ewc_loss": 0.03728194534778595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8640972484718077e-05, + "grad_norm": 26.124576568603516, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8614810109138489, + "num_tokens": 325274646.0, + "step": 8528 + }, + { + "epoch": 1.0849764660984607, + "ewc_loss": 0.03734474629163742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867237369879149e-05, + "grad_norm": 26.279354095458984, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8793034553527832, + "num_tokens": 325304735.0, + "step": 8529 + }, + { + "epoch": 1.0851036763770512, + "ewc_loss": 0.03739459812641144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8697299310588278e-05, + "grad_norm": 26.233213424682617, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8670157194137573, + "num_tokens": 325339833.0, + "step": 8530 + }, + { + "epoch": 1.0852308866556417, + "ewc_loss": 0.03730182722210884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8650913261808455e-05, + "grad_norm": 26.288320541381836, + "learning_rate": 1e-06, + "loss": 0.4811, + 
"mean_token_accuracy": 0.8521721959114075, + "num_tokens": 325373838.0, + "step": 8531 + }, + { + "epoch": 1.0853580969342322, + "ewc_loss": 0.037402793765068054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8701397493714467e-05, + "grad_norm": 26.263303756713867, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8849203586578369, + "num_tokens": 325412913.0, + "step": 8532 + }, + { + "epoch": 1.0854853072128228, + "ewc_loss": 0.03736121580004692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.868060826382134e-05, + "grad_norm": 26.31155776977539, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8700268268585205, + "num_tokens": 325451830.0, + "step": 8533 + }, + { + "epoch": 1.0856125174914133, + "ewc_loss": 0.037346769124269485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867338505689986e-05, + "grad_norm": 26.227554321289062, + "learning_rate": 1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8466014862060547, + "num_tokens": 325487162.0, + "step": 8534 + }, + { + "epoch": 1.0857397277700038, + "ewc_loss": 0.037376437336206436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8688218915485777e-05, + "grad_norm": 26.366594314575195, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8729796409606934, + "num_tokens": 325525906.0, + "step": 8535 + }, + { + "epoch": 1.0858669380485944, + "ewc_loss": 0.03742990642786026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.871495260274969e-05, + "grad_norm": 26.307720184326172, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8748804926872253, + "num_tokens": 325559925.0, + "step": 8536 + }, + { + "epoch": 1.0859941483271849, + "ewc_loss": 0.03732265159487724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.866132515715435e-05, + "grad_norm": 26.26963233947754, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8625568747520447, + "num_tokens": 325599980.0, + "step": 8537 + }, + { + "epoch": 
1.0861213586057754, + "ewc_loss": 0.037428729236125946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8714365069172345e-05, + "grad_norm": 26.434078216552734, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8471535444259644, + "num_tokens": 325631745.0, + "step": 8538 + }, + { + "epoch": 1.086248568884366, + "ewc_loss": 0.037480589002370834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874029476311989e-05, + "grad_norm": 26.315153121948242, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8599516153335571, + "num_tokens": 325673583.0, + "step": 8539 + }, + { + "epoch": 1.0863757791629565, + "ewc_loss": 0.037281155586242676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8640577764017507e-05, + "grad_norm": 26.343948364257812, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8553594946861267, + "num_tokens": 325712995.0, + "step": 8540 + }, + { + "epoch": 1.0865029894415468, + "ewc_loss": 0.0374247245490551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.871236236183904e-05, + "grad_norm": 26.241289138793945, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8698582053184509, + "num_tokens": 325748259.0, + "step": 8541 + }, + { + "epoch": 1.0866301997201373, + "ewc_loss": 0.03732464462518692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8662321963347495e-05, + "grad_norm": 26.31585121154785, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8683396577835083, + "num_tokens": 325787676.0, + "step": 8542 + }, + { + "epoch": 1.0867574099987278, + "ewc_loss": 0.037462711334228516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8731356249190867e-05, + "grad_norm": 26.27324104309082, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.838858962059021, + "num_tokens": 325827819.0, + "step": 8543 + }, + { + "epoch": 1.0868846202773184, + "ewc_loss": 0.037412915378808975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.870645792223513e-05, + "grad_norm": 26.366384506225586, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.872104287147522, + "num_tokens": 325866328.0, + "step": 8544 + }, + { + "epoch": 1.0870118305559089, + "ewc_loss": 0.03738545626401901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8692728190217167e-05, + "grad_norm": 26.259700775146484, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8586509227752686, + "num_tokens": 325906752.0, + "step": 8545 + }, + { + "epoch": 1.0871390408344994, + "ewc_loss": 0.037429939955472946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8714970792643726e-05, + "grad_norm": 26.325632095336914, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8572721481323242, + "num_tokens": 325943765.0, + "step": 8546 + }, + { + "epoch": 1.08726625111309, + "ewc_loss": 0.03740481659770012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8702408851822838e-05, + "grad_norm": 26.235702514648438, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8699877858161926, + "num_tokens": 325981471.0, + "step": 8547 + }, + { + "epoch": 1.0873934613916805, + "ewc_loss": 0.03739607334136963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8698036001296714e-05, + "grad_norm": 26.303586959838867, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8688241839408875, + "num_tokens": 326017253.0, + "step": 8548 + }, + { + "epoch": 1.087520671670271, + "ewc_loss": 0.037447888404130936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8723943867371418e-05, + "grad_norm": 26.321916580200195, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8644602298736572, + "num_tokens": 326053484.0, + "step": 8549 + }, + { + "epoch": 1.0876478819488615, + "ewc_loss": 0.037362951785326004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8681475921766832e-05, + "grad_norm": 26.309762954711914, + "learning_rate": 1e-06, + "loss": 0.4283, + 
"mean_token_accuracy": 0.8686767816543579, + "num_tokens": 326084779.0, + "step": 8550 + }, + { + "epoch": 1.087775092227452, + "ewc_loss": 0.03744654357433319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.872327266028151e-05, + "grad_norm": 26.33312225341797, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8613985776901245, + "num_tokens": 326120795.0, + "step": 8551 + }, + { + "epoch": 1.0879023025060426, + "ewc_loss": 0.03739571198821068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8697855921345763e-05, + "grad_norm": 26.27731704711914, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8612555861473083, + "num_tokens": 326155991.0, + "step": 8552 + }, + { + "epoch": 1.0880295127846331, + "ewc_loss": 0.03738132491707802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.869066181825474e-05, + "grad_norm": 26.256868362426758, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.85945725440979, + "num_tokens": 326195738.0, + "step": 8553 + }, + { + "epoch": 1.0881567230632234, + "ewc_loss": 0.03747779503464699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8738897779257968e-05, + "grad_norm": 26.360122680664062, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8623039722442627, + "num_tokens": 326237773.0, + "step": 8554 + }, + { + "epoch": 1.088283933341814, + "ewc_loss": 0.0374252013862133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8712600649450906e-05, + "grad_norm": 26.262765884399414, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8765804767608643, + "num_tokens": 326278589.0, + "step": 8555 + }, + { + "epoch": 1.0884111436204045, + "ewc_loss": 0.037378933280706406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.868946674221661e-05, + "grad_norm": 26.29416847229004, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8564586043357849, + "num_tokens": 326316862.0, + "step": 8556 + }, + { + "epoch": 1.088538353898995, + 
"ewc_loss": 0.0373896062374115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8694803657126613e-05, + "grad_norm": 26.23994255065918, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8604886531829834, + "num_tokens": 326353892.0, + "step": 8557 + }, + { + "epoch": 1.0886655641775855, + "ewc_loss": 0.03743056580424309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8715283658821136e-05, + "grad_norm": 26.410770416259766, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8674811720848083, + "num_tokens": 326389478.0, + "step": 8558 + }, + { + "epoch": 1.088792774456176, + "ewc_loss": 0.03749610483646393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8748052752926014e-05, + "grad_norm": 26.323413848876953, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8581940531730652, + "num_tokens": 326428072.0, + "step": 8559 + }, + { + "epoch": 1.0889199847347666, + "ewc_loss": 0.037438444793224335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8719221770879813e-05, + "grad_norm": 26.401573181152344, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8659274578094482, + "num_tokens": 326466374.0, + "step": 8560 + }, + { + "epoch": 1.0890471950133571, + "ewc_loss": 0.037482138723134995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8741069652605802e-05, + "grad_norm": 26.371828079223633, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8599840402603149, + "num_tokens": 326501081.0, + "step": 8561 + }, + { + "epoch": 1.0891744052919476, + "ewc_loss": 0.0373888723552227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8694436221267097e-05, + "grad_norm": 26.321632385253906, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8738254308700562, + "num_tokens": 326532539.0, + "step": 8562 + }, + { + "epoch": 1.0893016155705382, + "ewc_loss": 0.03735395520925522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8676977560971864e-05, + "grad_norm": 
26.35459327697754, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8684740662574768, + "num_tokens": 326569481.0, + "step": 8563 + }, + { + "epoch": 1.0894288258491287, + "ewc_loss": 0.03739354386925697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.869677180366125e-05, + "grad_norm": 26.332639694213867, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8610105514526367, + "num_tokens": 326609746.0, + "step": 8564 + }, + { + "epoch": 1.089556036127719, + "ewc_loss": 0.037430379539728165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.871518907137215e-05, + "grad_norm": 26.418628692626953, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8526269793510437, + "num_tokens": 326652379.0, + "step": 8565 + }, + { + "epoch": 1.0896832464063095, + "ewc_loss": 0.0373910516500473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.869552579591982e-05, + "grad_norm": 26.28323745727539, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8591973781585693, + "num_tokens": 326687617.0, + "step": 8566 + }, + { + "epoch": 1.0898104566849, + "ewc_loss": 0.03737816959619522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8689084754441865e-05, + "grad_norm": 26.325769424438477, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8548427820205688, + "num_tokens": 326724155.0, + "step": 8567 + }, + { + "epoch": 1.0899376669634906, + "ewc_loss": 0.037531863898038864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8765931599773467e-05, + "grad_norm": 26.396209716796875, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8622281551361084, + "num_tokens": 326768234.0, + "step": 8568 + }, + { + "epoch": 1.0900648772420811, + "ewc_loss": 0.037398677319288254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8699338397709653e-05, + "grad_norm": 26.30776023864746, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8669775724411011, + 
"num_tokens": 326810423.0, + "step": 8569 + }, + { + "epoch": 1.0901920875206716, + "ewc_loss": 0.0374491922557354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.872459688456729e-05, + "grad_norm": 26.335983276367188, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.861297607421875, + "num_tokens": 326851834.0, + "step": 8570 + }, + { + "epoch": 1.0903192977992622, + "ewc_loss": 0.03739413619041443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8697068298934028e-05, + "grad_norm": 26.294158935546875, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8507140874862671, + "num_tokens": 326889334.0, + "step": 8571 + }, + { + "epoch": 1.0904465080778527, + "ewc_loss": 0.03744414448738098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.872207212727517e-05, + "grad_norm": 26.418180465698242, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8524675965309143, + "num_tokens": 326931884.0, + "step": 8572 + }, + { + "epoch": 1.0905737183564432, + "ewc_loss": 0.037454091012477875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8727045244304463e-05, + "grad_norm": 26.387248992919922, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8707386255264282, + "num_tokens": 326969043.0, + "step": 8573 + }, + { + "epoch": 1.0907009286350338, + "ewc_loss": 0.037406809628009796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.870340565801598e-05, + "grad_norm": 26.3705997467041, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8640788793563843, + "num_tokens": 326998369.0, + "step": 8574 + }, + { + "epoch": 1.0908281389136243, + "ewc_loss": 0.03742364048957825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8711820302996784e-05, + "grad_norm": 26.395362854003906, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8634649515151978, + "num_tokens": 327036909.0, + "step": 8575 + }, + { + "epoch": 1.0909553491922148, + "ewc_loss": 0.037360161542892456, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8680080756894313e-05, + "grad_norm": 26.234622955322266, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8565248250961304, + "num_tokens": 327076053.0, + "step": 8576 + }, + { + "epoch": 1.0910825594708053, + "ewc_loss": 0.037453990429639816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8726994312601164e-05, + "grad_norm": 26.42803382873535, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8707495331764221, + "num_tokens": 327112285.0, + "step": 8577 + }, + { + "epoch": 1.0912097697493957, + "ewc_loss": 0.03743072599172592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8715363694354892e-05, + "grad_norm": 26.237438201904297, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8696389198303223, + "num_tokens": 327151591.0, + "step": 8578 + }, + { + "epoch": 1.0913369800279862, + "ewc_loss": 0.03743312507867813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.871656240837183e-05, + "grad_norm": 26.357702255249023, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8468758463859558, + "num_tokens": 327193600.0, + "step": 8579 + }, + { + "epoch": 1.0914641903065767, + "ewc_loss": 0.03751848638057709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.875924317573663e-05, + "grad_norm": 26.327159881591797, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8533273935317993, + "num_tokens": 327230249.0, + "step": 8580 + }, + { + "epoch": 1.0915914005851672, + "ewc_loss": 0.03744856268167496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8724282199400477e-05, + "grad_norm": 26.360034942626953, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8770872354507446, + "num_tokens": 327270043.0, + "step": 8581 + }, + { + "epoch": 1.0917186108637578, + "ewc_loss": 0.03745623677968979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8728118448052555e-05, + "grad_norm": 26.34954833984375, + 
"learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8648902177810669, + "num_tokens": 327309396.0, + "step": 8582 + }, + { + "epoch": 1.0918458211423483, + "ewc_loss": 0.037481121718883514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874056033557281e-05, + "grad_norm": 26.43758201599121, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8706285953521729, + "num_tokens": 327349497.0, + "step": 8583 + }, + { + "epoch": 1.0919730314209388, + "ewc_loss": 0.03747044503688812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8735221601673402e-05, + "grad_norm": 26.334674835205078, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8549132943153381, + "num_tokens": 327385648.0, + "step": 8584 + }, + { + "epoch": 1.0921002416995294, + "ewc_loss": 0.03742099180817604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8710496078711003e-05, + "grad_norm": 26.339954376220703, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8643548488616943, + "num_tokens": 327426379.0, + "step": 8585 + }, + { + "epoch": 1.0922274519781199, + "ewc_loss": 0.037477727979421616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.87388632184593e-05, + "grad_norm": 26.334453582763672, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8670727014541626, + "num_tokens": 327467555.0, + "step": 8586 + }, + { + "epoch": 1.0923546622567104, + "ewc_loss": 0.03747135028243065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8735674530034885e-05, + "grad_norm": 26.308712005615234, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8591545224189758, + "num_tokens": 327505303.0, + "step": 8587 + }, + { + "epoch": 1.092481872535301, + "ewc_loss": 0.03747262433171272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.873631299531553e-05, + "grad_norm": 26.3792724609375, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8678112030029297, + "num_tokens": 327547854.0, + 
"step": 8588 + }, + { + "epoch": 1.0926090828138915, + "ewc_loss": 0.03736211732029915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.868105937319342e-05, + "grad_norm": 26.239072799682617, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8649150133132935, + "num_tokens": 327584132.0, + "step": 8589 + }, + { + "epoch": 1.0927362930924818, + "ewc_loss": 0.03739875555038452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.869937841547653e-05, + "grad_norm": 26.283748626708984, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8518213033676147, + "num_tokens": 327629250.0, + "step": 8590 + }, + { + "epoch": 1.0928635033710723, + "ewc_loss": 0.037545569241046906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8772785551846027e-05, + "grad_norm": 26.338008880615234, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8556413650512695, + "num_tokens": 327666988.0, + "step": 8591 + }, + { + "epoch": 1.0929907136496628, + "ewc_loss": 0.03747989237308502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.873994551715441e-05, + "grad_norm": 26.355724334716797, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8648263216018677, + "num_tokens": 327705955.0, + "step": 8592 + }, + { + "epoch": 1.0931179239282534, + "ewc_loss": 0.037447936832904816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8723969333223067e-05, + "grad_norm": 26.24779510498047, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8557165265083313, + "num_tokens": 327746362.0, + "step": 8593 + }, + { + "epoch": 1.0932451342068439, + "ewc_loss": 0.037489403039216995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8744702174444683e-05, + "grad_norm": 26.40248680114746, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8624132871627808, + "num_tokens": 327789034.0, + "step": 8594 + }, + { + "epoch": 1.0933723444854344, + "ewc_loss": 0.03747013211250305, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.87350669875741e-05, + "grad_norm": 26.292869567871094, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.851518988609314, + "num_tokens": 327824203.0, + "step": 8595 + }, + { + "epoch": 1.093499554764025, + "ewc_loss": 0.037404995411634445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8702497982303612e-05, + "grad_norm": 26.342193603515625, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8641258478164673, + "num_tokens": 327860340.0, + "step": 8596 + }, + { + "epoch": 1.0936267650426155, + "ewc_loss": 0.037541914731264114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8770957467495464e-05, + "grad_norm": 26.363143920898438, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8559735417366028, + "num_tokens": 327901709.0, + "step": 8597 + }, + { + "epoch": 1.093753975321206, + "ewc_loss": 0.03746992349624634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8734961486188695e-05, + "grad_norm": 26.424907684326172, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8639965057373047, + "num_tokens": 327940327.0, + "step": 8598 + }, + { + "epoch": 1.0938811855997965, + "ewc_loss": 0.03743467852473259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8717339116847143e-05, + "grad_norm": 26.37505531311035, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8675422668457031, + "num_tokens": 327977966.0, + "step": 8599 + }, + { + "epoch": 1.094008395878387, + "ewc_loss": 0.037492286413908005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874614281405229e-05, + "grad_norm": 26.507488250732422, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8590344190597534, + "num_tokens": 328021143.0, + "step": 8600 + }, + { + "epoch": 1.0941356061569776, + "ewc_loss": 0.03737657517194748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8688288037083112e-05, + "grad_norm": 26.33101463317871, + "learning_rate": 1e-06, + "loss": 
0.4193, + "mean_token_accuracy": 0.8692115545272827, + "num_tokens": 328060792.0, + "step": 8601 + }, + { + "epoch": 1.094262816435568, + "ewc_loss": 0.03737359493970871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8686798284761608e-05, + "grad_norm": 26.432485580444336, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8646295070648193, + "num_tokens": 328099698.0, + "step": 8602 + }, + { + "epoch": 1.0943900267141584, + "ewc_loss": 0.03739689290523529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8698447092901915e-05, + "grad_norm": 26.294048309326172, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8662996292114258, + "num_tokens": 328141075.0, + "step": 8603 + }, + { + "epoch": 1.094517236992749, + "ewc_loss": 0.037355951964855194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867797618615441e-05, + "grad_norm": 26.395923614501953, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8555848598480225, + "num_tokens": 328178945.0, + "step": 8604 + }, + { + "epoch": 1.0946444472713395, + "ewc_loss": 0.03745753690600395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.872876782726962e-05, + "grad_norm": 26.47394561767578, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.83669513463974, + "num_tokens": 328214423.0, + "step": 8605 + }, + { + "epoch": 1.09477165754993, + "ewc_loss": 0.037369634956121445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8684817405301146e-05, + "grad_norm": 26.303152084350586, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8634675145149231, + "num_tokens": 328248640.0, + "step": 8606 + }, + { + "epoch": 1.0948988678285205, + "ewc_loss": 0.037378616631031036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.86893084901385e-05, + "grad_norm": 26.48939323425293, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8405691385269165, + "num_tokens": 328283016.0, + "step": 8607 + }, + { + "epoch": 
1.095026078107111, + "ewc_loss": 0.03746403381228447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8732016542344354e-05, + "grad_norm": 26.42317008972168, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8665086627006531, + "num_tokens": 328322377.0, + "step": 8608 + }, + { + "epoch": 1.0951532883857016, + "ewc_loss": 0.037341292947530746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.867064565885812e-05, + "grad_norm": 26.402254104614258, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8645187616348267, + "num_tokens": 328359409.0, + "step": 8609 + }, + { + "epoch": 1.0952804986642921, + "ewc_loss": 0.0374004989862442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.870024971140083e-05, + "grad_norm": 26.438993453979492, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8673182725906372, + "num_tokens": 328401257.0, + "step": 8610 + }, + { + "epoch": 1.0954077089428826, + "ewc_loss": 0.03737396001815796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8686980183701962e-05, + "grad_norm": 26.34845542907715, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8660809993743896, + "num_tokens": 328435886.0, + "step": 8611 + }, + { + "epoch": 1.0955349192214732, + "ewc_loss": 0.037405598908662796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.87027999345446e-05, + "grad_norm": 26.324663162231445, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8515739440917969, + "num_tokens": 328472091.0, + "step": 8612 + }, + { + "epoch": 1.0956621295000637, + "ewc_loss": 0.037434834986925125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8717417333391495e-05, + "grad_norm": 26.35081672668457, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8547043204307556, + "num_tokens": 328509453.0, + "step": 8613 + }, + { + "epoch": 1.095789339778654, + "ewc_loss": 0.037481822073459625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8740911400527693e-05, + "grad_norm": 26.33871078491211, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8626773357391357, + "num_tokens": 328545590.0, + "step": 8614 + }, + { + "epoch": 1.0959165500572445, + "ewc_loss": 0.03751620650291443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8758102669380605e-05, + "grad_norm": 26.4427547454834, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8529132008552551, + "num_tokens": 328583093.0, + "step": 8615 + }, + { + "epoch": 1.096043760335835, + "ewc_loss": 0.0374937504529953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874687586678192e-05, + "grad_norm": 26.40330696105957, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8461446762084961, + "num_tokens": 328625312.0, + "step": 8616 + }, + { + "epoch": 1.0961709706144256, + "ewc_loss": 0.03744702786207199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.872351458587218e-05, + "grad_norm": 26.384357452392578, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8555358052253723, + "num_tokens": 328655392.0, + "step": 8617 + }, + { + "epoch": 1.0962981808930161, + "ewc_loss": 0.03745812550187111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8729062503552996e-05, + "grad_norm": 26.401554107666016, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8473045825958252, + "num_tokens": 328693686.0, + "step": 8618 + }, + { + "epoch": 1.0964253911716066, + "ewc_loss": 0.03753283619880676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8766417269944213e-05, + "grad_norm": 26.340848922729492, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8743196725845337, + "num_tokens": 328728051.0, + "step": 8619 + }, + { + "epoch": 1.0965526014501972, + "ewc_loss": 0.037517111748456955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8758555597742088e-05, + "grad_norm": 26.45173454284668, + "learning_rate": 1e-06, + "loss": 0.4728, + 
"mean_token_accuracy": 0.8520785570144653, + "num_tokens": 328768091.0, + "step": 8620 + }, + { + "epoch": 1.0966798117287877, + "ewc_loss": 0.03754144906997681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.877072463685181e-05, + "grad_norm": 26.41225814819336, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8586363792419434, + "num_tokens": 328803851.0, + "step": 8621 + }, + { + "epoch": 1.0968070220073782, + "ewc_loss": 0.03743967041373253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8719834770308807e-05, + "grad_norm": 26.33946990966797, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8728682994842529, + "num_tokens": 328837429.0, + "step": 8622 + }, + { + "epoch": 1.0969342322859688, + "ewc_loss": 0.037507232278585434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.875361704151146e-05, + "grad_norm": 26.487838745117188, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8622773885726929, + "num_tokens": 328874264.0, + "step": 8623 + }, + { + "epoch": 1.0970614425645593, + "ewc_loss": 0.03752585127949715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8762924810289405e-05, + "grad_norm": 26.37184715270996, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.864023745059967, + "num_tokens": 328911627.0, + "step": 8624 + }, + { + "epoch": 1.0971886528431498, + "ewc_loss": 0.03748291730880737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874145891633816e-05, + "grad_norm": 26.3680477142334, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8561036586761475, + "num_tokens": 328944471.0, + "step": 8625 + }, + { + "epoch": 1.0973158631217403, + "ewc_loss": 0.037583645433187485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.879182309494354e-05, + "grad_norm": 26.407690048217773, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8695896863937378, + "num_tokens": 328983826.0, + "step": 8626 + }, + { + "epoch": 
1.0974430734003306, + "ewc_loss": 0.03747871518135071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8739357983577065e-05, + "grad_norm": 26.290254592895508, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8628196716308594, + "num_tokens": 329019385.0, + "step": 8627 + }, + { + "epoch": 1.0975702836789212, + "ewc_loss": 0.037577297538518906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.878864895843435e-05, + "grad_norm": 26.422664642333984, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8502471446990967, + "num_tokens": 329065179.0, + "step": 8628 + }, + { + "epoch": 1.0976974939575117, + "ewc_loss": 0.03756711259484291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8783555788104422e-05, + "grad_norm": 26.346759796142578, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8684425354003906, + "num_tokens": 329101077.0, + "step": 8629 + }, + { + "epoch": 1.0978247042361022, + "ewc_loss": 0.03748916834592819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8744583940133452e-05, + "grad_norm": 26.312685012817383, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8671761155128479, + "num_tokens": 329137538.0, + "step": 8630 + }, + { + "epoch": 1.0979519145146928, + "ewc_loss": 0.03752267360687256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.876133683254011e-05, + "grad_norm": 26.269657135009766, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8752326369285583, + "num_tokens": 329171952.0, + "step": 8631 + }, + { + "epoch": 1.0980791247932833, + "ewc_loss": 0.0375916063785553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8795803043758497e-05, + "grad_norm": 26.34034538269043, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8704490661621094, + "num_tokens": 329214956.0, + "step": 8632 + }, + { + "epoch": 1.0982063350718738, + "ewc_loss": 0.037555452436208725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8777725927066058e-05, + "grad_norm": 26.411951065063477, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8517553806304932, + "num_tokens": 329254228.0, + "step": 8633 + }, + { + "epoch": 1.0983335453504643, + "ewc_loss": 0.03757629543542862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8788146917358972e-05, + "grad_norm": 26.42743492126465, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8634869456291199, + "num_tokens": 329288337.0, + "step": 8634 + }, + { + "epoch": 1.0984607556290549, + "ewc_loss": 0.037562254816293716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.878112743725069e-05, + "grad_norm": 26.43416404724121, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.855890691280365, + "num_tokens": 329329118.0, + "step": 8635 + }, + { + "epoch": 1.0985879659076454, + "ewc_loss": 0.037527721375226974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.876386158983223e-05, + "grad_norm": 26.392934799194336, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8492931723594666, + "num_tokens": 329366301.0, + "step": 8636 + }, + { + "epoch": 1.098715176186236, + "ewc_loss": 0.037524573504924774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.876228634500876e-05, + "grad_norm": 26.49167251586914, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8515400290489197, + "num_tokens": 329401612.0, + "step": 8637 + }, + { + "epoch": 1.0988423864648265, + "ewc_loss": 0.03749116137623787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8745580746326596e-05, + "grad_norm": 26.29060935974121, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8647823333740234, + "num_tokens": 329440403.0, + "step": 8638 + }, + { + "epoch": 1.0989695967434168, + "ewc_loss": 0.03746131435036659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8730657757259905e-05, + "grad_norm": 26.380464553833008, + "learning_rate": 1e-06, + "loss": 0.4021, + 
"mean_token_accuracy": 0.8728428483009338, + "num_tokens": 329478699.0, + "step": 8639 + }, + { + "epoch": 1.0990968070220073, + "ewc_loss": 0.03754868358373642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8774342606775463e-05, + "grad_norm": 26.324357986450195, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8610473871231079, + "num_tokens": 329517860.0, + "step": 8640 + }, + { + "epoch": 1.0992240173005978, + "ewc_loss": 0.03749930486083031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874965164461173e-05, + "grad_norm": 26.349302291870117, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8676728010177612, + "num_tokens": 329555385.0, + "step": 8641 + }, + { + "epoch": 1.0993512275791884, + "ewc_loss": 0.037531930953264236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8765966160572134e-05, + "grad_norm": 26.34270477294922, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8584340810775757, + "num_tokens": 329594775.0, + "step": 8642 + }, + { + "epoch": 1.0994784378577789, + "ewc_loss": 0.037492234259843826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874611734820064e-05, + "grad_norm": 26.250186920166016, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8664302825927734, + "num_tokens": 329629872.0, + "step": 8643 + }, + { + "epoch": 1.0996056481363694, + "ewc_loss": 0.03755782172083855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8778910089167766e-05, + "grad_norm": 26.412935256958008, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8573490381240845, + "num_tokens": 329665263.0, + "step": 8644 + }, + { + "epoch": 1.09973285841496, + "ewc_loss": 0.037599626928567886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8799813915393315e-05, + "grad_norm": 26.398033142089844, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8531522154808044, + "num_tokens": 329705853.0, + "step": 8645 + }, + { + "epoch": 
1.0998600686935505, + "ewc_loss": 0.03753451257944107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8767255824059248e-05, + "grad_norm": 26.357376098632812, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8691219091415405, + "num_tokens": 329741690.0, + "step": 8646 + }, + { + "epoch": 1.099987278972141, + "ewc_loss": 0.03762432560324669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8812163034453988e-05, + "grad_norm": 26.472381591796875, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8557583689689636, + "num_tokens": 329781548.0, + "step": 8647 + }, + { + "epoch": 1.1001144892507315, + "ewc_loss": 0.03752997890114784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.876498936326243e-05, + "grad_norm": 26.358802795410156, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8700262308120728, + "num_tokens": 329822304.0, + "step": 8648 + }, + { + "epoch": 1.100241699529322, + "ewc_loss": 0.03753911331295967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8769556845654733e-05, + "grad_norm": 26.34905433654785, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8637531995773315, + "num_tokens": 329856065.0, + "step": 8649 + }, + { + "epoch": 1.1003689098079126, + "ewc_loss": 0.037554264068603516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8777132936520502e-05, + "grad_norm": 26.386810302734375, + "learning_rate": 1e-06, + "loss": 0.5099, + "mean_token_accuracy": 0.8472027778625488, + "num_tokens": 329895002.0, + "step": 8650 + }, + { + "epoch": 1.100496120086503, + "ewc_loss": 0.03755291551351547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8776458091451786e-05, + "grad_norm": 26.29698371887207, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.856259286403656, + "num_tokens": 329938810.0, + "step": 8651 + }, + { + "epoch": 1.1006233303650934, + "ewc_loss": 0.037523992359638214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8761995306704193e-05, + "grad_norm": 26.344139099121094, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8752108812332153, + "num_tokens": 329976513.0, + "step": 8652 + }, + { + "epoch": 1.100750540643684, + "ewc_loss": 0.0375741608440876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8787080989568494e-05, + "grad_norm": 26.432600021362305, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8699939250946045, + "num_tokens": 330014013.0, + "step": 8653 + }, + { + "epoch": 1.1008777509222745, + "ewc_loss": 0.03757505863904953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8787528460961767e-05, + "grad_norm": 26.350494384765625, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8586916327476501, + "num_tokens": 330052577.0, + "step": 8654 + }, + { + "epoch": 1.101004961200865, + "ewc_loss": 0.03757459670305252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8787297449307516e-05, + "grad_norm": 26.4086856842041, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8597927689552307, + "num_tokens": 330091385.0, + "step": 8655 + }, + { + "epoch": 1.1011321714794555, + "ewc_loss": 0.037554994225502014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.877749673440121e-05, + "grad_norm": 26.370939254760742, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8711613416671753, + "num_tokens": 330130285.0, + "step": 8656 + }, + { + "epoch": 1.101259381758046, + "ewc_loss": 0.03754293918609619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8771470422507264e-05, + "grad_norm": 26.430429458618164, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8488813042640686, + "num_tokens": 330166823.0, + "step": 8657 + }, + { + "epoch": 1.1013865920366366, + "ewc_loss": 0.03751685470342636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.875842826848384e-05, + "grad_norm": 26.321584701538086, + "learning_rate": 1e-06, + "loss": 0.4318, + 
"mean_token_accuracy": 0.8641557693481445, + "num_tokens": 330200655.0, + "step": 8658 + }, + { + "epoch": 1.101513802315227, + "ewc_loss": 0.03755074739456177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8775373973767273e-05, + "grad_norm": 26.322063446044922, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8609273433685303, + "num_tokens": 330243656.0, + "step": 8659 + }, + { + "epoch": 1.1016410125938176, + "ewc_loss": 0.03759580850601196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8797903976519592e-05, + "grad_norm": 26.39092254638672, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8551965951919556, + "num_tokens": 330287200.0, + "step": 8660 + }, + { + "epoch": 1.1017682228724082, + "ewc_loss": 0.037611059844493866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8805529180099256e-05, + "grad_norm": 26.372852325439453, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8654409050941467, + "num_tokens": 330324794.0, + "step": 8661 + }, + { + "epoch": 1.1018954331509987, + "ewc_loss": 0.03752506151795387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8762530089588836e-05, + "grad_norm": 26.331161499023438, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8649070858955383, + "num_tokens": 330365297.0, + "step": 8662 + }, + { + "epoch": 1.102022643429589, + "ewc_loss": 0.037654582411050797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.882729156932328e-05, + "grad_norm": 26.43630599975586, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.879896879196167, + "num_tokens": 330408046.0, + "step": 8663 + }, + { + "epoch": 1.1021498537081795, + "ewc_loss": 0.03758397698402405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.879198862297926e-05, + "grad_norm": 26.400821685791016, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8584470152854919, + "num_tokens": 330447815.0, + "step": 8664 + }, + { + "epoch": 
1.10227706398677, + "ewc_loss": 0.03753631189465523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8768156223814003e-05, + "grad_norm": 26.33675765991211, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8509425520896912, + "num_tokens": 330487796.0, + "step": 8665 + }, + { + "epoch": 1.1024042742653606, + "ewc_loss": 0.03760237246751785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.880118543340359e-05, + "grad_norm": 26.424060821533203, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8725515604019165, + "num_tokens": 330531155.0, + "step": 8666 + }, + { + "epoch": 1.1025314845439511, + "ewc_loss": 0.03758697584271431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8793487470247783e-05, + "grad_norm": 26.40102195739746, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8722511529922485, + "num_tokens": 330568611.0, + "step": 8667 + }, + { + "epoch": 1.1026586948225416, + "ewc_loss": 0.03752434253692627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8762171748676337e-05, + "grad_norm": 26.366079330444336, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8568031787872314, + "num_tokens": 330608085.0, + "step": 8668 + }, + { + "epoch": 1.1027859051011322, + "ewc_loss": 0.03760581463575363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8802908016368747e-05, + "grad_norm": 26.534433364868164, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8447317481040955, + "num_tokens": 330644776.0, + "step": 8669 + }, + { + "epoch": 1.1029131153797227, + "ewc_loss": 0.037508562207221985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8754280972643755e-05, + "grad_norm": 26.386150360107422, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8516647815704346, + "num_tokens": 330686401.0, + "step": 8670 + }, + { + "epoch": 1.1030403256583132, + "ewc_loss": 0.03745851293206215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8729257135419175e-05, + "grad_norm": 26.350696563720703, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8675632476806641, + "num_tokens": 330723115.0, + "step": 8671 + }, + { + "epoch": 1.1031675359369038, + "ewc_loss": 0.037583038210868835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8791519323713146e-05, + "grad_norm": 26.414146423339844, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8784390687942505, + "num_tokens": 330761170.0, + "step": 8672 + }, + { + "epoch": 1.1032947462154943, + "ewc_loss": 0.037502966821193695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8751483366941102e-05, + "grad_norm": 26.33124351501465, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8632377982139587, + "num_tokens": 330790230.0, + "step": 8673 + }, + { + "epoch": 1.1034219564940848, + "ewc_loss": 0.03748676925897598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8743385226116516e-05, + "grad_norm": 26.320829391479492, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.873917818069458, + "num_tokens": 330826178.0, + "step": 8674 + }, + { + "epoch": 1.1035491667726753, + "ewc_loss": 0.03750735521316528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8753677068161778e-05, + "grad_norm": 26.285316467285156, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8689463138580322, + "num_tokens": 330866341.0, + "step": 8675 + }, + { + "epoch": 1.1036763770512656, + "ewc_loss": 0.037557605654001236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8778802768792957e-05, + "grad_norm": 26.361309051513672, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.848280668258667, + "num_tokens": 330903955.0, + "step": 8676 + }, + { + "epoch": 1.1038035873298562, + "ewc_loss": 0.037566691637039185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8783346604323015e-05, + "grad_norm": 26.298978805541992, + "learning_rate": 1e-06, + "loss": 0.4316, + 
"mean_token_accuracy": 0.8583226203918457, + "num_tokens": 330943024.0, + "step": 8677 + }, + { + "epoch": 1.1039307976084467, + "ewc_loss": 0.03756798803806305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8783994164550677e-05, + "grad_norm": 26.27611541748047, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8574032187461853, + "num_tokens": 330981004.0, + "step": 8678 + }, + { + "epoch": 1.1040580078870372, + "ewc_loss": 0.037571560591459274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.878578041214496e-05, + "grad_norm": 26.368480682373047, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8618462085723877, + "num_tokens": 331021532.0, + "step": 8679 + }, + { + "epoch": 1.1041852181656278, + "ewc_loss": 0.03766728937625885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8833645299309865e-05, + "grad_norm": 26.288270950317383, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8709009885787964, + "num_tokens": 331058010.0, + "step": 8680 + }, + { + "epoch": 1.1043124284442183, + "ewc_loss": 0.03761018440127373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8805092622642405e-05, + "grad_norm": 26.451702117919922, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8709874749183655, + "num_tokens": 331090713.0, + "step": 8681 + }, + { + "epoch": 1.1044396387228088, + "ewc_loss": 0.03771929442882538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8859647752833553e-05, + "grad_norm": 26.35733413696289, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8677344918251038, + "num_tokens": 331127532.0, + "step": 8682 + }, + { + "epoch": 1.1045668490013993, + "ewc_loss": 0.037583548575639725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8791773982229643e-05, + "grad_norm": 26.322303771972656, + "learning_rate": 1e-06, + "loss": 0.516, + "mean_token_accuracy": 0.8414903879165649, + "num_tokens": 331172088.0, + "step": 8683 + }, + { + "epoch": 
1.1046940592799899, + "ewc_loss": 0.03771023452281952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.885511665022932e-05, + "grad_norm": 26.40342903137207, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8566431403160095, + "num_tokens": 331213419.0, + "step": 8684 + }, + { + "epoch": 1.1048212695585804, + "ewc_loss": 0.037684790790081024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.884239463834092e-05, + "grad_norm": 26.442031860351562, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8624767065048218, + "num_tokens": 331250102.0, + "step": 8685 + }, + { + "epoch": 1.104948479837171, + "ewc_loss": 0.037599798291921616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8799899407895282e-05, + "grad_norm": 26.41903305053711, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8640618324279785, + "num_tokens": 331284597.0, + "step": 8686 + }, + { + "epoch": 1.1050756901157615, + "ewc_loss": 0.03765856847167015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.882928336272016e-05, + "grad_norm": 26.429100036621094, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8548486232757568, + "num_tokens": 331330688.0, + "step": 8687 + }, + { + "epoch": 1.1052029003943518, + "ewc_loss": 0.03753400593996048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8767002984532155e-05, + "grad_norm": 26.422809600830078, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8659741282463074, + "num_tokens": 331368777.0, + "step": 8688 + }, + { + "epoch": 1.1053301106729423, + "ewc_loss": 0.03767109289765358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.883554614323657e-05, + "grad_norm": 26.486146926879883, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8671371340751648, + "num_tokens": 331414369.0, + "step": 8689 + }, + { + "epoch": 1.1054573209515328, + "ewc_loss": 0.03760550543665886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8802753402269445e-05, + "grad_norm": 26.48406982421875, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8685760498046875, + "num_tokens": 331452788.0, + "step": 8690 + }, + { + "epoch": 1.1055845312301233, + "ewc_loss": 0.03751371055841446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8756854842649773e-05, + "grad_norm": 26.246868133544922, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8517332673072815, + "num_tokens": 331489575.0, + "step": 8691 + }, + { + "epoch": 1.1057117415087139, + "ewc_loss": 0.03758854418992996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.879427145468071e-05, + "grad_norm": 26.509780883789062, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8790469169616699, + "num_tokens": 331529129.0, + "step": 8692 + }, + { + "epoch": 1.1058389517873044, + "ewc_loss": 0.03766264393925667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8831322449841537e-05, + "grad_norm": 26.481184005737305, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8596062660217285, + "num_tokens": 331564989.0, + "step": 8693 + }, + { + "epoch": 1.105966162065895, + "ewc_loss": 0.03756262734532356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.878131297416985e-05, + "grad_norm": 26.42987823486328, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.863699197769165, + "num_tokens": 331602561.0, + "step": 8694 + }, + { + "epoch": 1.1060933723444855, + "ewc_loss": 0.03760010376572609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.880005220300518e-05, + "grad_norm": 26.44291114807129, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8633332252502441, + "num_tokens": 331646902.0, + "step": 8695 + }, + { + "epoch": 1.106220582623076, + "ewc_loss": 0.037602294236421585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8801147234626114e-05, + "grad_norm": 26.4122257232666, + "learning_rate": 1e-06, + "loss": 0.4418, + 
"mean_token_accuracy": 0.8634483814239502, + "num_tokens": 331682347.0, + "step": 8696 + }, + { + "epoch": 1.1063477929016665, + "ewc_loss": 0.03760017082095146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8800084944814444e-05, + "grad_norm": 26.47974967956543, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8549020290374756, + "num_tokens": 331725813.0, + "step": 8697 + }, + { + "epoch": 1.106475003180257, + "ewc_loss": 0.03755750507116318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8778751837089658e-05, + "grad_norm": 26.348344802856445, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8417580723762512, + "num_tokens": 331767856.0, + "step": 8698 + }, + { + "epoch": 1.1066022134588476, + "ewc_loss": 0.0375114381313324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.875571979326196e-05, + "grad_norm": 26.346332550048828, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8716369867324829, + "num_tokens": 331804238.0, + "step": 8699 + }, + { + "epoch": 1.106729423737438, + "ewc_loss": 0.0375945158302784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8797258235281333e-05, + "grad_norm": 26.41522216796875, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8623358607292175, + "num_tokens": 331839963.0, + "step": 8700 + }, + { + "epoch": 1.1068566340160284, + "ewc_loss": 0.03758521005511284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8792605260387063e-05, + "grad_norm": 26.402606964111328, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8664438724517822, + "num_tokens": 331881563.0, + "step": 8701 + }, + { + "epoch": 1.106983844294619, + "ewc_loss": 0.03751351684331894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8756758436211385e-05, + "grad_norm": 26.340089797973633, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8583294153213501, + "num_tokens": 331925540.0, + "step": 8702 + }, + { + "epoch": 
1.1071110545732095, + "ewc_loss": 0.03760794177651405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8803970306180418e-05, + "grad_norm": 26.433626174926758, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8701840043067932, + "num_tokens": 331963171.0, + "step": 8703 + }, + { + "epoch": 1.1072382648518, + "ewc_loss": 0.03758576139807701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8792879927787e-05, + "grad_norm": 26.48467254638672, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8819911479949951, + "num_tokens": 331998310.0, + "step": 8704 + }, + { + "epoch": 1.1073654751303905, + "ewc_loss": 0.037596508860588074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8798255041474476e-05, + "grad_norm": 26.40628433227539, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8691794872283936, + "num_tokens": 332034181.0, + "step": 8705 + }, + { + "epoch": 1.107492685408981, + "ewc_loss": 0.03753054887056351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8765274944598787e-05, + "grad_norm": 26.48690414428711, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.872990608215332, + "num_tokens": 332075368.0, + "step": 8706 + }, + { + "epoch": 1.1076198956875716, + "ewc_loss": 0.037566933780908585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.878346665762365e-05, + "grad_norm": 26.40795135498047, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8726267218589783, + "num_tokens": 332113281.0, + "step": 8707 + }, + { + "epoch": 1.107747105966162, + "ewc_loss": 0.03748467192053795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874233566923067e-05, + "grad_norm": 26.393356323242188, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8546972274780273, + "num_tokens": 332154206.0, + "step": 8708 + }, + { + "epoch": 1.1078743162447526, + "ewc_loss": 0.037534479051828384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8767239453154616e-05, + 
"grad_norm": 26.48102569580078, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8476371169090271, + "num_tokens": 332195409.0, + "step": 8709 + }, + { + "epoch": 1.1080015265233432, + "ewc_loss": 0.03747602552175522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.873801193141844e-05, + "grad_norm": 26.367643356323242, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8712660074234009, + "num_tokens": 332233836.0, + "step": 8710 + }, + { + "epoch": 1.1081287368019337, + "ewc_loss": 0.03745616599917412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8728083887253888e-05, + "grad_norm": 26.465394973754883, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8681447505950928, + "num_tokens": 332274167.0, + "step": 8711 + }, + { + "epoch": 1.108255947080524, + "ewc_loss": 0.03757285326719284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8786426153383218e-05, + "grad_norm": 26.430130004882812, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8587971925735474, + "num_tokens": 332315856.0, + "step": 8712 + }, + { + "epoch": 1.1083831573591145, + "ewc_loss": 0.03748075291514397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.874037661764305e-05, + "grad_norm": 26.375179290771484, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8620473146438599, + "num_tokens": 332366365.0, + "step": 8713 + }, + { + "epoch": 1.108510367637705, + "ewc_loss": 0.037480562925338745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8740282030194066e-05, + "grad_norm": 26.34229278564453, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8601086139678955, + "num_tokens": 332400411.0, + "step": 8714 + }, + { + "epoch": 1.1086375779162956, + "ewc_loss": 0.037489429116249084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8744714907370508e-05, + "grad_norm": 26.34284019470215, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.864763081073761, 
+ "num_tokens": 332441959.0, + "step": 8715 + }, + { + "epoch": 1.108764788194886, + "ewc_loss": 0.0374874472618103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8743723558145575e-05, + "grad_norm": 26.399839401245117, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8494681119918823, + "num_tokens": 332479545.0, + "step": 8716 + }, + { + "epoch": 1.1088919984734766, + "ewc_loss": 0.03756074979901314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.878037437563762e-05, + "grad_norm": 26.262678146362305, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8738994002342224, + "num_tokens": 332516240.0, + "step": 8717 + }, + { + "epoch": 1.1090192087520672, + "ewc_loss": 0.03754149377346039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8770746464724652e-05, + "grad_norm": 26.43096351623535, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8678584694862366, + "num_tokens": 332553488.0, + "step": 8718 + }, + { + "epoch": 1.1091464190306577, + "ewc_loss": 0.03760627657175064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8803139028022997e-05, + "grad_norm": 26.471038818359375, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8539404273033142, + "num_tokens": 332588646.0, + "step": 8719 + }, + { + "epoch": 1.1092736293092482, + "ewc_loss": 0.03755695000290871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.877847535070032e-05, + "grad_norm": 26.37892723083496, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8531057834625244, + "num_tokens": 332629103.0, + "step": 8720 + }, + { + "epoch": 1.1094008395878387, + "ewc_loss": 0.03758211061358452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8791055481415242e-05, + "grad_norm": 26.458837509155273, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8686602115631104, + "num_tokens": 332664820.0, + "step": 8721 + }, + { + "epoch": 1.1095280498664293, + "ewc_loss": 0.03756144270300865, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8780721802613698e-05, + "grad_norm": 26.440975189208984, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8642826080322266, + "num_tokens": 332699772.0, + "step": 8722 + }, + { + "epoch": 1.1096552601450198, + "ewc_loss": 0.03759951516985893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8799757526721805e-05, + "grad_norm": 26.476511001586914, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8718191385269165, + "num_tokens": 332733478.0, + "step": 8723 + }, + { + "epoch": 1.1097824704236103, + "ewc_loss": 0.03755205497145653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8776026990963146e-05, + "grad_norm": 26.410629272460938, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.858392596244812, + "num_tokens": 332774733.0, + "step": 8724 + }, + { + "epoch": 1.1099096807022006, + "ewc_loss": 0.03760699927806854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.88034991879249e-05, + "grad_norm": 26.38288116455078, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.8394885659217834, + "num_tokens": 332812970.0, + "step": 8725 + }, + { + "epoch": 1.1100368909807912, + "ewc_loss": 0.03761141002178192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.88057056220714e-05, + "grad_norm": 26.416378021240234, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8592875003814697, + "num_tokens": 332854974.0, + "step": 8726 + }, + { + "epoch": 1.1101641012593817, + "ewc_loss": 0.0376293919980526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8814696886693127e-05, + "grad_norm": 26.415611267089844, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8491770029067993, + "num_tokens": 332886462.0, + "step": 8727 + }, + { + "epoch": 1.1102913115379722, + "ewc_loss": 0.03769310191273689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8846551029128022e-05, + "grad_norm": 26.46823501586914, + "learning_rate": 
1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8588716983795166, + "num_tokens": 332928614.0, + "step": 8728 + }, + { + "epoch": 1.1104185218165628, + "ewc_loss": 0.03761012479662895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8805061699822545e-05, + "grad_norm": 26.348573684692383, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8633294105529785, + "num_tokens": 332962447.0, + "step": 8729 + }, + { + "epoch": 1.1105457320951533, + "ewc_loss": 0.037622664123773575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8811331756296568e-05, + "grad_norm": 26.3238582611084, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8534349203109741, + "num_tokens": 332991395.0, + "step": 8730 + }, + { + "epoch": 1.1106729423737438, + "ewc_loss": 0.037680480629205704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8840240954887122e-05, + "grad_norm": 26.365432739257812, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8551267385482788, + "num_tokens": 333027852.0, + "step": 8731 + }, + { + "epoch": 1.1108001526523343, + "ewc_loss": 0.037664636969566345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.883231925603468e-05, + "grad_norm": 26.425228118896484, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8654818534851074, + "num_tokens": 333063337.0, + "step": 8732 + }, + { + "epoch": 1.1109273629309249, + "ewc_loss": 0.03780316561460495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.89015827345429e-05, + "grad_norm": 26.481830596923828, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8531697988510132, + "num_tokens": 333094632.0, + "step": 8733 + }, + { + "epoch": 1.1110545732095154, + "ewc_loss": 0.037690822035074234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8845410522772e-05, + "grad_norm": 26.370197296142578, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.877193033695221, + "num_tokens": 333134071.0, + "step": 8734 + }, + 
{ + "epoch": 1.111181783488106, + "ewc_loss": 0.037778209894895554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8889104467234574e-05, + "grad_norm": 26.48468780517578, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8678275346755981, + "num_tokens": 333171288.0, + "step": 8735 + }, + { + "epoch": 1.1113089937666965, + "ewc_loss": 0.03771941736340523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.885970777948387e-05, + "grad_norm": 26.337385177612305, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8696459531784058, + "num_tokens": 333212111.0, + "step": 8736 + }, + { + "epoch": 1.1114362040452868, + "ewc_loss": 0.037769004702568054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8884502424043603e-05, + "grad_norm": 26.473724365234375, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8680299520492554, + "num_tokens": 333247557.0, + "step": 8737 + }, + { + "epoch": 1.1115634143238773, + "ewc_loss": 0.037808142602443695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.890407111204695e-05, + "grad_norm": 26.433279037475586, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8627796173095703, + "num_tokens": 333278705.0, + "step": 8738 + }, + { + "epoch": 1.1116906246024678, + "ewc_loss": 0.037777144461870193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8888571503339335e-05, + "grad_norm": 26.55654525756836, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8649879693984985, + "num_tokens": 333316493.0, + "step": 8739 + }, + { + "epoch": 1.1118178348810583, + "ewc_loss": 0.03770427778363228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8852138964575715e-05, + "grad_norm": 26.418668746948242, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8661230802536011, + "num_tokens": 333360384.0, + "step": 8740 + }, + { + "epoch": 1.1119450451596489, + "ewc_loss": 0.03769349679350853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.884674929897301e-05, + "grad_norm": 26.568511962890625, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8633548021316528, + "num_tokens": 333395568.0, + "step": 8741 + }, + { + "epoch": 1.1120722554382394, + "ewc_loss": 0.037749845534563065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.887492362584453e-05, + "grad_norm": 26.47294044494629, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.866922914981842, + "num_tokens": 333429267.0, + "step": 8742 + }, + { + "epoch": 1.11219946571683, + "ewc_loss": 0.037660904228687286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.883045297290664e-05, + "grad_norm": 26.50570297241211, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.869757890701294, + "num_tokens": 333469908.0, + "step": 8743 + }, + { + "epoch": 1.1123266759954205, + "ewc_loss": 0.03770885244011879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8854425434255973e-05, + "grad_norm": 26.443138122558594, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8669599890708923, + "num_tokens": 333507865.0, + "step": 8744 + }, + { + "epoch": 1.112453886274011, + "ewc_loss": 0.03774530813097954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8872653527068906e-05, + "grad_norm": 26.497798919677734, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8734512329101562, + "num_tokens": 333544011.0, + "step": 8745 + }, + { + "epoch": 1.1125810965526015, + "ewc_loss": 0.037687454372644424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.884372795757372e-05, + "grad_norm": 26.41328239440918, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.863932728767395, + "num_tokens": 333581949.0, + "step": 8746 + }, + { + "epoch": 1.112708306831192, + "ewc_loss": 0.03772619366645813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8863096556742676e-05, + "grad_norm": 26.626550674438477, + "learning_rate": 1e-06, + "loss": 0.4079, + 
"mean_token_accuracy": 0.8700922131538391, + "num_tokens": 333615994.0, + "step": 8747 + }, + { + "epoch": 1.1128355171097826, + "ewc_loss": 0.03773540258407593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.886770041892305e-05, + "grad_norm": 26.41988182067871, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8782326579093933, + "num_tokens": 333654454.0, + "step": 8748 + }, + { + "epoch": 1.112962727388373, + "ewc_loss": 0.03763569891452789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8817849195329472e-05, + "grad_norm": 26.569595336914062, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8561918139457703, + "num_tokens": 333700206.0, + "step": 8749 + }, + { + "epoch": 1.1130899376669634, + "ewc_loss": 0.03772091120481491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8860455384128727e-05, + "grad_norm": 26.348308563232422, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8670550584793091, + "num_tokens": 333737538.0, + "step": 8750 + }, + { + "epoch": 1.113217147945554, + "ewc_loss": 0.03755449131131172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8777245713863522e-05, + "grad_norm": 26.36370086669922, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8650023937225342, + "num_tokens": 333771237.0, + "step": 8751 + }, + { + "epoch": 1.1133443582241445, + "ewc_loss": 0.03768335282802582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.884167613752652e-05, + "grad_norm": 26.36458396911621, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8599231839179993, + "num_tokens": 333806522.0, + "step": 8752 + }, + { + "epoch": 1.113471568502735, + "ewc_loss": 0.03771672770380974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.885836354631465e-05, + "grad_norm": 26.491628646850586, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8624480962753296, + "num_tokens": 333846656.0, + "step": 8753 + }, + { + "epoch": 
1.1135987787813255, + "ewc_loss": 0.03767658770084381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8838294636225328e-05, + "grad_norm": 26.296142578125, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8552933931350708, + "num_tokens": 333886294.0, + "step": 8754 + }, + { + "epoch": 1.113725989059916, + "ewc_loss": 0.03768392279744148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8841961718862876e-05, + "grad_norm": 26.48379898071289, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8605726957321167, + "num_tokens": 333923868.0, + "step": 8755 + }, + { + "epoch": 1.1138531993385066, + "ewc_loss": 0.03779267147183418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.889633495011367e-05, + "grad_norm": 26.418123245239258, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8671342134475708, + "num_tokens": 333957187.0, + "step": 8756 + }, + { + "epoch": 1.113980409617097, + "ewc_loss": 0.03770732879638672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.885366509668529e-05, + "grad_norm": 26.432893753051758, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8769965171813965, + "num_tokens": 333999061.0, + "step": 8757 + }, + { + "epoch": 1.1141076198956876, + "ewc_loss": 0.037728577852249146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.88642898137914e-05, + "grad_norm": 26.529865264892578, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8583592176437378, + "num_tokens": 334043595.0, + "step": 8758 + }, + { + "epoch": 1.1142348301742782, + "ewc_loss": 0.03773993253707886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8869966879719868e-05, + "grad_norm": 26.402984619140625, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.867729902267456, + "num_tokens": 334078487.0, + "step": 8759 + }, + { + "epoch": 1.1143620404528687, + "ewc_loss": 0.03771694749593735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8858474504668266e-05, 
+ "grad_norm": 26.491809844970703, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8719634413719177, + "num_tokens": 334119610.0, + "step": 8760 + }, + { + "epoch": 1.114489250731459, + "ewc_loss": 0.03774017095565796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.88700851140311e-05, + "grad_norm": 26.45502471923828, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8589534163475037, + "num_tokens": 334160149.0, + "step": 8761 + }, + { + "epoch": 1.1146164610100495, + "ewc_loss": 0.03773665800690651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8868329789256677e-05, + "grad_norm": 26.50499153137207, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8641567230224609, + "num_tokens": 334198667.0, + "step": 8762 + }, + { + "epoch": 1.11474367128864, + "ewc_loss": 0.03768192604184151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.884096309368033e-05, + "grad_norm": 26.332740783691406, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.864841103553772, + "num_tokens": 334233838.0, + "step": 8763 + }, + { + "epoch": 1.1148708815672306, + "ewc_loss": 0.0377015694975853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8850785636459477e-05, + "grad_norm": 26.5410099029541, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8550834655761719, + "num_tokens": 334275703.0, + "step": 8764 + }, + { + "epoch": 1.114998091845821, + "ewc_loss": 0.03775157034397125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.887578582682181e-05, + "grad_norm": 26.477489471435547, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8752995729446411, + "num_tokens": 334311119.0, + "step": 8765 + }, + { + "epoch": 1.1151253021244116, + "ewc_loss": 0.037691522389650345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8845761587726884e-05, + "grad_norm": 26.440946578979492, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8562337160110474, + 
"num_tokens": 334348298.0, + "step": 8766 + }, + { + "epoch": 1.1152525124030022, + "ewc_loss": 0.03771708533167839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8858541807276197e-05, + "grad_norm": 26.462635040283203, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8496330380439758, + "num_tokens": 334386986.0, + "step": 8767 + }, + { + "epoch": 1.1153797226815927, + "ewc_loss": 0.037717390805482864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8858694602386095e-05, + "grad_norm": 26.395509719848633, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8674144148826599, + "num_tokens": 334423263.0, + "step": 8768 + }, + { + "epoch": 1.1155069329601832, + "ewc_loss": 0.03776336461305618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8881682990468107e-05, + "grad_norm": 26.4764347076416, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8802264928817749, + "num_tokens": 334460363.0, + "step": 8769 + }, + { + "epoch": 1.1156341432387737, + "ewc_loss": 0.03772847726941109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8864238882088102e-05, + "grad_norm": 26.489803314208984, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8667373657226562, + "num_tokens": 334501883.0, + "step": 8770 + }, + { + "epoch": 1.1157613535173643, + "ewc_loss": 0.03774934262037277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8874670786317438e-05, + "grad_norm": 26.55229949951172, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8625258207321167, + "num_tokens": 334536242.0, + "step": 8771 + }, + { + "epoch": 1.1158885637959548, + "ewc_loss": 0.03768984600901604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.884492303361185e-05, + "grad_norm": 26.46091651916504, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8761881589889526, + "num_tokens": 334569685.0, + "step": 8772 + }, + { + "epoch": 1.1160157740745453, + "ewc_loss": 0.03772631660103798, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8863158402382396e-05, + "grad_norm": 26.50057029724121, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8650306463241577, + "num_tokens": 334604946.0, + "step": 8773 + }, + { + "epoch": 1.1161429843531356, + "ewc_loss": 0.037714097648859024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8857048416975886e-05, + "grad_norm": 26.45860481262207, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8686681985855103, + "num_tokens": 334645495.0, + "step": 8774 + }, + { + "epoch": 1.1162701946317262, + "ewc_loss": 0.037640899419784546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8820450350176543e-05, + "grad_norm": 26.393766403198242, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.857781708240509, + "num_tokens": 334684239.0, + "step": 8775 + }, + { + "epoch": 1.1163974049103167, + "ewc_loss": 0.03777201846241951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888600854726974e-05, + "grad_norm": 26.650405883789062, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8696146011352539, + "num_tokens": 334713799.0, + "step": 8776 + }, + { + "epoch": 1.1165246151889072, + "ewc_loss": 0.03771055117249489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.885527490230743e-05, + "grad_norm": 26.51213264465332, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8832793235778809, + "num_tokens": 334753374.0, + "step": 8777 + }, + { + "epoch": 1.1166518254674977, + "ewc_loss": 0.03765663504600525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.882831747934688e-05, + "grad_norm": 26.434396743774414, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8643573522567749, + "num_tokens": 334792867.0, + "step": 8778 + }, + { + "epoch": 1.1167790357460883, + "ewc_loss": 0.03772557154297829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.886278550955467e-05, + "grad_norm": 26.50322914123535, + "learning_rate": 
1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8609459400177002, + "num_tokens": 334832664.0, + "step": 8779 + }, + { + "epoch": 1.1169062460246788, + "ewc_loss": 0.03770182654261589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8850912965717725e-05, + "grad_norm": 26.48039436340332, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.856553316116333, + "num_tokens": 334876763.0, + "step": 8780 + }, + { + "epoch": 1.1170334563032693, + "ewc_loss": 0.03774183988571167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8870920030167326e-05, + "grad_norm": 26.468351364135742, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8647196292877197, + "num_tokens": 334909431.0, + "step": 8781 + }, + { + "epoch": 1.1171606665818599, + "ewc_loss": 0.03765324130654335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.882662036223337e-05, + "grad_norm": 26.393762588500977, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8626517057418823, + "num_tokens": 334941939.0, + "step": 8782 + }, + { + "epoch": 1.1172878768604504, + "ewc_loss": 0.0377143993973732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.885719939309638e-05, + "grad_norm": 26.467845916748047, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8732762932777405, + "num_tokens": 334981338.0, + "step": 8783 + }, + { + "epoch": 1.117415087139041, + "ewc_loss": 0.037726256996393204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.886312929855194e-05, + "grad_norm": 26.431142807006836, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8625006079673767, + "num_tokens": 335020660.0, + "step": 8784 + }, + { + "epoch": 1.1175422974176314, + "ewc_loss": 0.03774171322584152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8870856365538202e-05, + "grad_norm": 26.431875228881836, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8662567734718323, + "num_tokens": 335057377.0, + "step": 8785 + }, + 
{ + "epoch": 1.1176695076962218, + "ewc_loss": 0.03777821362018585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8889106286223978e-05, + "grad_norm": 26.53502082824707, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8537673950195312, + "num_tokens": 335092912.0, + "step": 8786 + }, + { + "epoch": 1.1177967179748123, + "ewc_loss": 0.03777000680565834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888500264612958e-05, + "grad_norm": 26.402488708496094, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8777327537536621, + "num_tokens": 335129157.0, + "step": 8787 + }, + { + "epoch": 1.1179239282534028, + "ewc_loss": 0.03775973245501518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8879865820053965e-05, + "grad_norm": 26.594152450561523, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.86436927318573, + "num_tokens": 335168065.0, + "step": 8788 + }, + { + "epoch": 1.1180511385319933, + "ewc_loss": 0.03779785335063934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8898927010013722e-05, + "grad_norm": 26.414539337158203, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8622292280197144, + "num_tokens": 335218876.0, + "step": 8789 + }, + { + "epoch": 1.1181783488105839, + "ewc_loss": 0.037669774144887924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8834887669072486e-05, + "grad_norm": 26.495187759399414, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8723264932632446, + "num_tokens": 335256300.0, + "step": 8790 + }, + { + "epoch": 1.1183055590891744, + "ewc_loss": 0.03777175024151802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888587576104328e-05, + "grad_norm": 26.439802169799805, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8735365271568298, + "num_tokens": 335291035.0, + "step": 8791 + }, + { + "epoch": 1.118432769367765, + "ewc_loss": 0.03770042210817337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8850210835807957e-05, + "grad_norm": 26.461618423461914, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8755339980125427, + "num_tokens": 335330092.0, + "step": 8792 + }, + { + "epoch": 1.1185599796463555, + "ewc_loss": 0.03776220977306366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888110455183778e-05, + "grad_norm": 26.440279006958008, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8505904674530029, + "num_tokens": 335367146.0, + "step": 8793 + }, + { + "epoch": 1.118687189924946, + "ewc_loss": 0.03775154799222946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.887577491288539e-05, + "grad_norm": 26.512619018554688, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8638870716094971, + "num_tokens": 335405069.0, + "step": 8794 + }, + { + "epoch": 1.1188144002035365, + "ewc_loss": 0.03774821758270264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8874108718591742e-05, + "grad_norm": 26.39107894897461, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8542488813400269, + "num_tokens": 335443779.0, + "step": 8795 + }, + { + "epoch": 1.118941610482127, + "ewc_loss": 0.0377531535923481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8876577087212354e-05, + "grad_norm": 26.500144958496094, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8553929328918457, + "num_tokens": 335485978.0, + "step": 8796 + }, + { + "epoch": 1.1190688207607176, + "ewc_loss": 0.037789274007081985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.889463783300016e-05, + "grad_norm": 26.60013771057129, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8689388036727905, + "num_tokens": 335530375.0, + "step": 8797 + }, + { + "epoch": 1.119196031039308, + "ewc_loss": 0.03771590068936348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8857950635720044e-05, + "grad_norm": 26.451416015625, + "learning_rate": 1e-06, + "loss": 0.4133, + 
"mean_token_accuracy": 0.8686414957046509, + "num_tokens": 335569109.0, + "step": 8798 + }, + { + "epoch": 1.1193232413178984, + "ewc_loss": 0.03773840144276619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8869201085180975e-05, + "grad_norm": 26.50373649597168, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8504068851470947, + "num_tokens": 335599283.0, + "step": 8799 + }, + { + "epoch": 1.119450451596489, + "ewc_loss": 0.03777032718062401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888516271719709e-05, + "grad_norm": 26.488576889038086, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8712766766548157, + "num_tokens": 335634792.0, + "step": 8800 + }, + { + "epoch": 1.1195776618750795, + "ewc_loss": 0.03773564100265503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8867820472223684e-05, + "grad_norm": 26.44318389892578, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8774376511573792, + "num_tokens": 335672869.0, + "step": 8801 + }, + { + "epoch": 1.11970487215367, + "ewc_loss": 0.0377613790333271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888068982225377e-05, + "grad_norm": 26.493072509765625, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8694707751274109, + "num_tokens": 335714996.0, + "step": 8802 + }, + { + "epoch": 1.1198320824322605, + "ewc_loss": 0.03777467831969261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8887340047513135e-05, + "grad_norm": 26.413307189941406, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8559769988059998, + "num_tokens": 335756884.0, + "step": 8803 + }, + { + "epoch": 1.119959292710851, + "ewc_loss": 0.03777528181672096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888764018076472e-05, + "grad_norm": 26.5889892578125, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8735769987106323, + "num_tokens": 335793935.0, + "step": 8804 + }, + { + "epoch": 1.1200865029894416, 
+ "ewc_loss": 0.03776973858475685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.888486985990312e-05, + "grad_norm": 26.442588806152344, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8759608268737793, + "num_tokens": 335834169.0, + "step": 8805 + }, + { + "epoch": 1.120213713268032, + "ewc_loss": 0.03770988807082176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8854943846235983e-05, + "grad_norm": 26.462810516357422, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.881797194480896, + "num_tokens": 335872302.0, + "step": 8806 + }, + { + "epoch": 1.1203409235466226, + "ewc_loss": 0.03784215450286865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.89210768439807e-05, + "grad_norm": 26.535533905029297, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8648462295532227, + "num_tokens": 335908074.0, + "step": 8807 + }, + { + "epoch": 1.1204681338252132, + "ewc_loss": 0.037796251475811005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8898126654676162e-05, + "grad_norm": 26.503564834594727, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8736820220947266, + "num_tokens": 335946268.0, + "step": 8808 + }, + { + "epoch": 1.1205953441038037, + "ewc_loss": 0.03777310624718666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.88865524251014e-05, + "grad_norm": 26.490863800048828, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8643472194671631, + "num_tokens": 335985364.0, + "step": 8809 + }, + { + "epoch": 1.120722554382394, + "ewc_loss": 0.03777066990733147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8885335521190427e-05, + "grad_norm": 26.46895980834961, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8674959540367126, + "num_tokens": 336025300.0, + "step": 8810 + }, + { + "epoch": 1.1208497646609845, + "ewc_loss": 0.03773678094148636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8868389815906994e-05, + "grad_norm": 
26.352487564086914, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.868452250957489, + "num_tokens": 336062105.0, + "step": 8811 + }, + { + "epoch": 1.120976974939575, + "ewc_loss": 0.03776481747627258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8882408767240122e-05, + "grad_norm": 26.51024627685547, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8524627685546875, + "num_tokens": 336102944.0, + "step": 8812 + }, + { + "epoch": 1.1211041852181656, + "ewc_loss": 0.03778189420700073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8890947103500366e-05, + "grad_norm": 26.48137855529785, + "learning_rate": 1e-06, + "loss": 0.3523, + "mean_token_accuracy": 0.8903816938400269, + "num_tokens": 336142964.0, + "step": 8813 + }, + { + "epoch": 1.121231395496756, + "ewc_loss": 0.03771014139056206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.885507117549423e-05, + "grad_norm": 26.36325454711914, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8661072254180908, + "num_tokens": 336186962.0, + "step": 8814 + }, + { + "epoch": 1.1213586057753466, + "ewc_loss": 0.037766072899103165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8883036318584345e-05, + "grad_norm": 26.535478591918945, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8726764917373657, + "num_tokens": 336220775.0, + "step": 8815 + }, + { + "epoch": 1.1214858160539372, + "ewc_loss": 0.037783607840538025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8891803847509436e-05, + "grad_norm": 26.330421447753906, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.860950231552124, + "num_tokens": 336263067.0, + "step": 8816 + }, + { + "epoch": 1.1216130263325277, + "ewc_loss": 0.03778943791985512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.889471968752332e-05, + "grad_norm": 26.53337287902832, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8678796887397766, + 
"num_tokens": 336308463.0, + "step": 8817 + }, + { + "epoch": 1.1217402366111182, + "ewc_loss": 0.0378325991332531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.891630017780699e-05, + "grad_norm": 26.58596420288086, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8780332803726196, + "num_tokens": 336346965.0, + "step": 8818 + }, + { + "epoch": 1.1218674468897087, + "ewc_loss": 0.03768698498606682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8843493307940662e-05, + "grad_norm": 26.546850204467773, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8659076690673828, + "num_tokens": 336382114.0, + "step": 8819 + }, + { + "epoch": 1.1219946571682993, + "ewc_loss": 0.037711065262556076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8855533198802732e-05, + "grad_norm": 26.47237777709961, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8696768283843994, + "num_tokens": 336414020.0, + "step": 8820 + }, + { + "epoch": 1.1221218674468898, + "ewc_loss": 0.03772923722863197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8864619050873443e-05, + "grad_norm": 26.49556541442871, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8625161051750183, + "num_tokens": 336452522.0, + "step": 8821 + }, + { + "epoch": 1.1222490777254803, + "ewc_loss": 0.03771322965621948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8856615497497842e-05, + "grad_norm": 26.513572692871094, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8759053945541382, + "num_tokens": 336484593.0, + "step": 8822 + }, + { + "epoch": 1.1223762880040706, + "ewc_loss": 0.03772891312837601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8864457160816528e-05, + "grad_norm": 26.450199127197266, + "learning_rate": 1e-06, + "loss": 0.4965, + "mean_token_accuracy": 0.848603367805481, + "num_tokens": 336518964.0, + "step": 8823 + }, + { + "epoch": 1.1225034982826612, + "ewc_loss": 0.037705399096012115, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8852699213312007e-05, + "grad_norm": 26.48139762878418, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8516595363616943, + "num_tokens": 336556320.0, + "step": 8824 + }, + { + "epoch": 1.1226307085612517, + "ewc_loss": 0.037739481776952744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.886974132503383e-05, + "grad_norm": 26.38416290283203, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8546748161315918, + "num_tokens": 336595473.0, + "step": 8825 + }, + { + "epoch": 1.1227579188398422, + "ewc_loss": 0.037783652544021606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.889182567538228e-05, + "grad_norm": 26.55237579345703, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8635590672492981, + "num_tokens": 336631793.0, + "step": 8826 + }, + { + "epoch": 1.1228851291184327, + "ewc_loss": 0.037799425423145294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8899712813436054e-05, + "grad_norm": 26.35224151611328, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8499961495399475, + "num_tokens": 336671230.0, + "step": 8827 + }, + { + "epoch": 1.1230123393970233, + "ewc_loss": 0.03777395188808441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8886976249632426e-05, + "grad_norm": 26.453298568725586, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8656653165817261, + "num_tokens": 336712015.0, + "step": 8828 + }, + { + "epoch": 1.1231395496756138, + "ewc_loss": 0.03785776346921921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8928882127511315e-05, + "grad_norm": 26.60457420349121, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8486604690551758, + "num_tokens": 336741206.0, + "step": 8829 + }, + { + "epoch": 1.1232667599542043, + "ewc_loss": 0.03781786188483238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8908931451733224e-05, + "grad_norm": 26.499019622802734, + 
"learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.856212854385376, + "num_tokens": 336778838.0, + "step": 8830 + }, + { + "epoch": 1.1233939702327949, + "ewc_loss": 0.03771744295954704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8858721887227148e-05, + "grad_norm": 26.451984405517578, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8695069551467896, + "num_tokens": 336815471.0, + "step": 8831 + }, + { + "epoch": 1.1235211805113854, + "ewc_loss": 0.03781472146511078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8907361663877964e-05, + "grad_norm": 26.48250389099121, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8667967915534973, + "num_tokens": 336850870.0, + "step": 8832 + }, + { + "epoch": 1.123648390789976, + "ewc_loss": 0.037820808589458466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8910404833150096e-05, + "grad_norm": 26.4985408782959, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8684527277946472, + "num_tokens": 336891779.0, + "step": 8833 + }, + { + "epoch": 1.1237756010685664, + "ewc_loss": 0.037858329713344574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8929164070868865e-05, + "grad_norm": 26.489721298217773, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8752334117889404, + "num_tokens": 336931171.0, + "step": 8834 + }, + { + "epoch": 1.1239028113471567, + "ewc_loss": 0.037875946611166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8937973436550237e-05, + "grad_norm": 26.604511260986328, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8748078346252441, + "num_tokens": 336963080.0, + "step": 8835 + }, + { + "epoch": 1.1240300216257473, + "ewc_loss": 0.037822067737579346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8911034203483723e-05, + "grad_norm": 26.345701217651367, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8502998948097229, + "num_tokens": 337000817.0, + 
"step": 8836 + }, + { + "epoch": 1.1241572319043378, + "ewc_loss": 0.03786278888583183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8931394151877612e-05, + "grad_norm": 26.562902450561523, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8583757281303406, + "num_tokens": 337037249.0, + "step": 8837 + }, + { + "epoch": 1.1242844421829283, + "ewc_loss": 0.03796470910310745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8982354959007353e-05, + "grad_norm": 26.44086456298828, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8722222447395325, + "num_tokens": 337077752.0, + "step": 8838 + }, + { + "epoch": 1.1244116524615189, + "ewc_loss": 0.03780035674571991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.890017847472336e-05, + "grad_norm": 26.451162338256836, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8618571758270264, + "num_tokens": 337119649.0, + "step": 8839 + }, + { + "epoch": 1.1245388627401094, + "ewc_loss": 0.037935320287942886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8967659343616106e-05, + "grad_norm": 26.495689392089844, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8492705821990967, + "num_tokens": 337154376.0, + "step": 8840 + }, + { + "epoch": 1.1246660730187, + "ewc_loss": 0.03788948059082031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8944740077131428e-05, + "grad_norm": 26.41942024230957, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.873258113861084, + "num_tokens": 337193377.0, + "step": 8841 + }, + { + "epoch": 1.1247932832972904, + "ewc_loss": 0.03791586309671402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8957931388285942e-05, + "grad_norm": 26.428586959838867, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8545735478401184, + "num_tokens": 337233176.0, + "step": 8842 + }, + { + "epoch": 1.124920493575881, + "ewc_loss": 0.037848059087991714, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.8924029063782655e-05, + "grad_norm": 26.49897003173828, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8499814867973328, + "num_tokens": 337262995.0, + "step": 8843 + }, + { + "epoch": 1.1250477038544715, + "ewc_loss": 0.03789440542459488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.894720298878383e-05, + "grad_norm": 26.461036682128906, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8630478978157043, + "num_tokens": 337302466.0, + "step": 8844 + }, + { + "epoch": 1.125174914133062, + "ewc_loss": 0.037855833768844604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8927916244138032e-05, + "grad_norm": 26.396989822387695, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8755972981452942, + "num_tokens": 337341197.0, + "step": 8845 + }, + { + "epoch": 1.1253021244116526, + "ewc_loss": 0.03790517896413803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8952589016407728e-05, + "grad_norm": 26.492023468017578, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8653860092163086, + "num_tokens": 337375602.0, + "step": 8846 + }, + { + "epoch": 1.125429334690243, + "ewc_loss": 0.037935834378004074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.896791764011141e-05, + "grad_norm": 26.49384307861328, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8550040125846863, + "num_tokens": 337406685.0, + "step": 8847 + }, + { + "epoch": 1.1255565449688334, + "ewc_loss": 0.037900660187006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8950329831568524e-05, + "grad_norm": 26.532501220703125, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8590713739395142, + "num_tokens": 337447364.0, + "step": 8848 + }, + { + "epoch": 1.125683755247424, + "ewc_loss": 0.03795399144291878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8976996216224506e-05, + "grad_norm": 26.700687408447266, + "learning_rate": 1e-06, + "loss": 0.435, 
+ "mean_token_accuracy": 0.8640320301055908, + "num_tokens": 337487840.0, + "step": 8849 + }, + { + "epoch": 1.1258109655260145, + "ewc_loss": 0.037844497710466385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8922248273156583e-05, + "grad_norm": 26.384733200073242, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8679201006889343, + "num_tokens": 337525117.0, + "step": 8850 + }, + { + "epoch": 1.125938175804605, + "ewc_loss": 0.03785325959324837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8926630218629725e-05, + "grad_norm": 26.552278518676758, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.857058584690094, + "num_tokens": 337567024.0, + "step": 8851 + }, + { + "epoch": 1.1260653860831955, + "ewc_loss": 0.03793550655245781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.896775393106509e-05, + "grad_norm": 26.646244049072266, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8615678548812866, + "num_tokens": 337600117.0, + "step": 8852 + }, + { + "epoch": 1.126192596361786, + "ewc_loss": 0.03783424571156502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.891712236101739e-05, + "grad_norm": 26.535354614257812, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8485534191131592, + "num_tokens": 337637188.0, + "step": 8853 + }, + { + "epoch": 1.1263198066403766, + "ewc_loss": 0.037831176072359085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8915588952950202e-05, + "grad_norm": 26.50081443786621, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8583998680114746, + "num_tokens": 337676492.0, + "step": 8854 + }, + { + "epoch": 1.126447016918967, + "ewc_loss": 0.03792926296591759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8964630726259202e-05, + "grad_norm": 26.54997444152832, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8769181966781616, + "num_tokens": 337713468.0, + "step": 8855 + }, + { + "epoch": 
1.1265742271975576, + "ewc_loss": 0.0378914475440979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8945724150398746e-05, + "grad_norm": 26.57907485961914, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8505122661590576, + "num_tokens": 337750944.0, + "step": 8856 + }, + { + "epoch": 1.1267014374761481, + "ewc_loss": 0.03785647824406624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.892823820526246e-05, + "grad_norm": 26.381174087524414, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8776118755340576, + "num_tokens": 337793802.0, + "step": 8857 + }, + { + "epoch": 1.1268286477547387, + "ewc_loss": 0.03788641095161438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8943204850074835e-05, + "grad_norm": 26.610124588012695, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8769957423210144, + "num_tokens": 337828550.0, + "step": 8858 + }, + { + "epoch": 1.126955858033329, + "ewc_loss": 0.03794638440012932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8973192709381692e-05, + "grad_norm": 26.4067325592041, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8733362555503845, + "num_tokens": 337862168.0, + "step": 8859 + }, + { + "epoch": 1.1270830683119195, + "ewc_loss": 0.037853967398405075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8926983102574013e-05, + "grad_norm": 26.556041717529297, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8600929975509644, + "num_tokens": 337898837.0, + "step": 8860 + }, + { + "epoch": 1.12721027859051, + "ewc_loss": 0.037961989641189575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.89809943549335e-05, + "grad_norm": 26.41903305053711, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.849859893321991, + "num_tokens": 337939608.0, + "step": 8861 + }, + { + "epoch": 1.1273374888691006, + "ewc_loss": 0.03786643594503403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.893321859824937e-05, 
+ "grad_norm": 26.496944427490234, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8633184432983398, + "num_tokens": 337983412.0, + "step": 8862 + }, + { + "epoch": 1.127464699147691, + "ewc_loss": 0.03800481557846069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9002407498192042e-05, + "grad_norm": 26.604625701904297, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8734065294265747, + "num_tokens": 338022171.0, + "step": 8863 + }, + { + "epoch": 1.1275919094262816, + "ewc_loss": 0.0378556028008461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.892780164780561e-05, + "grad_norm": 26.56859588623047, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8605012893676758, + "num_tokens": 338058838.0, + "step": 8864 + }, + { + "epoch": 1.1277191197048722, + "ewc_loss": 0.03789767622947693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8948838260257617e-05, + "grad_norm": 26.514928817749023, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8594146370887756, + "num_tokens": 338095080.0, + "step": 8865 + }, + { + "epoch": 1.1278463299834627, + "ewc_loss": 0.03788077458739281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8940387235488743e-05, + "grad_norm": 26.494321823120117, + "learning_rate": 1e-06, + "loss": 0.4953, + "mean_token_accuracy": 0.8458852171897888, + "num_tokens": 338134206.0, + "step": 8866 + }, + { + "epoch": 1.1279735402620532, + "ewc_loss": 0.0378829725086689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8941485905088484e-05, + "grad_norm": 26.479534149169922, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8701293468475342, + "num_tokens": 338173794.0, + "step": 8867 + }, + { + "epoch": 1.1281007505406437, + "ewc_loss": 0.03789466246962547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.894733213703148e-05, + "grad_norm": 26.528106689453125, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 
0.8612065315246582, + "num_tokens": 338206591.0, + "step": 8868 + }, + { + "epoch": 1.1282279608192343, + "ewc_loss": 0.03789689019322395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.894844535854645e-05, + "grad_norm": 26.5460147857666, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8765576481819153, + "num_tokens": 338238728.0, + "step": 8869 + }, + { + "epoch": 1.1283551710978248, + "ewc_loss": 0.037869203835725784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8934601030196063e-05, + "grad_norm": 26.537723541259766, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8731801509857178, + "num_tokens": 338282372.0, + "step": 8870 + }, + { + "epoch": 1.1284823813764153, + "ewc_loss": 0.03787627071142197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8938135326607153e-05, + "grad_norm": 26.655803680419922, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8588004112243652, + "num_tokens": 338319175.0, + "step": 8871 + }, + { + "epoch": 1.1286095916550056, + "ewc_loss": 0.03786090016365051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.893045009637717e-05, + "grad_norm": 26.455310821533203, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8602294921875, + "num_tokens": 338357967.0, + "step": 8872 + }, + { + "epoch": 1.1287368019335962, + "ewc_loss": 0.037877753376960754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.89388774742838e-05, + "grad_norm": 26.5267276763916, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8712489604949951, + "num_tokens": 338391431.0, + "step": 8873 + }, + { + "epoch": 1.1288640122121867, + "ewc_loss": 0.03792296350002289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8961482055601664e-05, + "grad_norm": 26.437509536743164, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8726478815078735, + "num_tokens": 338426298.0, + "step": 8874 + }, + { + "epoch": 1.1289912224907772, + "ewc_loss": 
0.03787175193428993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.893587614176795e-05, + "grad_norm": 26.517240524291992, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8466148972511292, + "num_tokens": 338468147.0, + "step": 8875 + }, + { + "epoch": 1.1291184327693677, + "ewc_loss": 0.03793683648109436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8968417862197384e-05, + "grad_norm": 26.480979919433594, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8475463390350342, + "num_tokens": 338517095.0, + "step": 8876 + }, + { + "epoch": 1.1292456430479583, + "ewc_loss": 0.037893589586019516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8946795535157435e-05, + "grad_norm": 26.461097717285156, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8715815544128418, + "num_tokens": 338549925.0, + "step": 8877 + }, + { + "epoch": 1.1293728533265488, + "ewc_loss": 0.03794656693935394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.897328365885187e-05, + "grad_norm": 26.479705810546875, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.867277979850769, + "num_tokens": 338583966.0, + "step": 8878 + }, + { + "epoch": 1.1295000636051393, + "ewc_loss": 0.03791537880897522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.895768946269527e-05, + "grad_norm": 26.50826644897461, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8683845400810242, + "num_tokens": 338622715.0, + "step": 8879 + }, + { + "epoch": 1.1296272738837299, + "ewc_loss": 0.03790436312556267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8952181562781334e-05, + "grad_norm": 26.493410110473633, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8507429361343384, + "num_tokens": 338659920.0, + "step": 8880 + }, + { + "epoch": 1.1297544841623204, + "ewc_loss": 0.03790563344955444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.895281639008317e-05, + "grad_norm": 
26.515289306640625, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8587888479232788, + "num_tokens": 338698183.0, + "step": 8881 + }, + { + "epoch": 1.129881694440911, + "ewc_loss": 0.03790910542011261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8954553524963558e-05, + "grad_norm": 26.47604751586914, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.853073239326477, + "num_tokens": 338735853.0, + "step": 8882 + }, + { + "epoch": 1.1300089047195012, + "ewc_loss": 0.037912141531705856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8956070562126115e-05, + "grad_norm": 26.551055908203125, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8534847497940063, + "num_tokens": 338771714.0, + "step": 8883 + }, + { + "epoch": 1.1301361149980917, + "ewc_loss": 0.037887971848249435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8943985196528956e-05, + "grad_norm": 26.572507858276367, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8817076683044434, + "num_tokens": 338813823.0, + "step": 8884 + }, + { + "epoch": 1.1302633252766823, + "ewc_loss": 0.03790583088994026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8952914615510963e-05, + "grad_norm": 26.45150375366211, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8670110702514648, + "num_tokens": 338858137.0, + "step": 8885 + }, + { + "epoch": 1.1303905355552728, + "ewc_loss": 0.03786531835794449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8932658349513076e-05, + "grad_norm": 26.456636428833008, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8534092903137207, + "num_tokens": 338893653.0, + "step": 8886 + }, + { + "epoch": 1.1305177458338633, + "ewc_loss": 0.037884943187236786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8942471797345206e-05, + "grad_norm": 26.47320556640625, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8461165428161621, + 
"num_tokens": 338931230.0, + "step": 8887 + }, + { + "epoch": 1.1306449561124539, + "ewc_loss": 0.037885259836912155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8942630049423315e-05, + "grad_norm": 26.452449798583984, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8634095191955566, + "num_tokens": 338966921.0, + "step": 8888 + }, + { + "epoch": 1.1307721663910444, + "ewc_loss": 0.03798619657754898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8993097910424694e-05, + "grad_norm": 26.55583953857422, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8433828353881836, + "num_tokens": 339000957.0, + "step": 8889 + }, + { + "epoch": 1.130899376669635, + "ewc_loss": 0.03795958682894707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.897979382192716e-05, + "grad_norm": 26.435428619384766, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8556391000747681, + "num_tokens": 339035876.0, + "step": 8890 + }, + { + "epoch": 1.1310265869482254, + "ewc_loss": 0.037938088178634644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8969043594552204e-05, + "grad_norm": 26.55363655090332, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8646042943000793, + "num_tokens": 339073700.0, + "step": 8891 + }, + { + "epoch": 1.131153797226816, + "ewc_loss": 0.03801671415567398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.900835741253104e-05, + "grad_norm": 26.45364761352539, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8679208755493164, + "num_tokens": 339108222.0, + "step": 8892 + }, + { + "epoch": 1.1312810075054065, + "ewc_loss": 0.03795209899544716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8976050341734663e-05, + "grad_norm": 26.460468292236328, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8758105039596558, + "num_tokens": 339145749.0, + "step": 8893 + }, + { + "epoch": 1.131408217783997, + "ewc_loss": 0.0380609855055809, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.903049269458279e-05, + "grad_norm": 26.572738647460938, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8492969274520874, + "num_tokens": 339183852.0, + "step": 8894 + }, + { + "epoch": 1.1315354280625876, + "ewc_loss": 0.03797170892357826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8985854694619775e-05, + "grad_norm": 26.409086227416992, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8779382109642029, + "num_tokens": 339218766.0, + "step": 8895 + }, + { + "epoch": 1.131662638341178, + "ewc_loss": 0.037989597767591476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.899479866551701e-05, + "grad_norm": 26.54334259033203, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8645579814910889, + "num_tokens": 339259932.0, + "step": 8896 + }, + { + "epoch": 1.1317898486197684, + "ewc_loss": 0.0380370169878006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.901850919239223e-05, + "grad_norm": 26.484813690185547, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8673156499862671, + "num_tokens": 339302201.0, + "step": 8897 + }, + { + "epoch": 1.131917058898359, + "ewc_loss": 0.03799189627170563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.899594826682005e-05, + "grad_norm": 26.477205276489258, + "learning_rate": 1e-06, + "loss": 0.5493, + "mean_token_accuracy": 0.8345797061920166, + "num_tokens": 339342091.0, + "step": 8898 + }, + { + "epoch": 1.1320442691769494, + "ewc_loss": 0.038031913340091705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9015957150259055e-05, + "grad_norm": 26.48332405090332, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8584728837013245, + "num_tokens": 339379951.0, + "step": 8899 + }, + { + "epoch": 1.13217147945554, + "ewc_loss": 0.03806914761662483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9034574506804347e-05, + "grad_norm": 26.566547393798828, + "learning_rate": 
1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8582034707069397, + "num_tokens": 339422243.0, + "step": 8900 + }, + { + "epoch": 1.1322986897341305, + "ewc_loss": 0.038073018193244934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.903650991152972e-05, + "grad_norm": 26.55026626586914, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.877669095993042, + "num_tokens": 339464969.0, + "step": 8901 + }, + { + "epoch": 1.132425900012721, + "ewc_loss": 0.03796829283237457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8984146663569845e-05, + "grad_norm": 26.488420486450195, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.867536187171936, + "num_tokens": 339502382.0, + "step": 8902 + }, + { + "epoch": 1.1325531102913116, + "ewc_loss": 0.03797300159931183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8986500435858034e-05, + "grad_norm": 26.503684997558594, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8487350940704346, + "num_tokens": 339542875.0, + "step": 8903 + }, + { + "epoch": 1.132680320569902, + "ewc_loss": 0.03800204023718834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9001019609277137e-05, + "grad_norm": 26.541719436645508, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8505240678787231, + "num_tokens": 339580776.0, + "step": 8904 + }, + { + "epoch": 1.1328075308484926, + "ewc_loss": 0.03800231218338013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9001156033482403e-05, + "grad_norm": 26.53053855895996, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8770709037780762, + "num_tokens": 339620195.0, + "step": 8905 + }, + { + "epoch": 1.1329347411270831, + "ewc_loss": 0.03801887109875679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9009436073247343e-05, + "grad_norm": 26.65633773803711, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.871721625328064, + "num_tokens": 339659234.0, + "step": 8906 + }, + { + 
"epoch": 1.1330619514056737, + "ewc_loss": 0.037989914417266846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8994956917595118e-05, + "grad_norm": 26.468923568725586, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8547009825706482, + "num_tokens": 339703259.0, + "step": 8907 + }, + { + "epoch": 1.133189161684264, + "ewc_loss": 0.03792666643857956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8963333786814474e-05, + "grad_norm": 26.444580078125, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8706421852111816, + "num_tokens": 339745901.0, + "step": 8908 + }, + { + "epoch": 1.1333163719628545, + "ewc_loss": 0.03799116611480713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8995582649949938e-05, + "grad_norm": 26.45586395263672, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.85440993309021, + "num_tokens": 339790417.0, + "step": 8909 + }, + { + "epoch": 1.133443582241445, + "ewc_loss": 0.03792329505085945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8961647583637387e-05, + "grad_norm": 26.474720001220703, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8548365831375122, + "num_tokens": 339827224.0, + "step": 8910 + }, + { + "epoch": 1.1335707925200356, + "ewc_loss": 0.037972621619701385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8986311260960065e-05, + "grad_norm": 26.496023178100586, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8586714863777161, + "num_tokens": 339859932.0, + "step": 8911 + }, + { + "epoch": 1.133698002798626, + "ewc_loss": 0.03803428262472153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9017141312360764e-05, + "grad_norm": 26.536666870117188, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.851746678352356, + "num_tokens": 339902416.0, + "step": 8912 + }, + { + "epoch": 1.1338252130772166, + "ewc_loss": 0.03799492493271828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.89974616660038e-05, + "grad_norm": 26.496856689453125, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8713421821594238, + "num_tokens": 339940495.0, + "step": 8913 + }, + { + "epoch": 1.1339524233558071, + "ewc_loss": 0.03798380494117737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8991902834386565e-05, + "grad_norm": 26.473146438598633, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8450890183448792, + "num_tokens": 339983010.0, + "step": 8914 + }, + { + "epoch": 1.1340796336343977, + "ewc_loss": 0.037886589765548706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8943295799545012e-05, + "grad_norm": 26.41024398803711, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8635209798812866, + "num_tokens": 340020683.0, + "step": 8915 + }, + { + "epoch": 1.1342068439129882, + "ewc_loss": 0.03794752433896065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8973762053065002e-05, + "grad_norm": 26.441680908203125, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8549904823303223, + "num_tokens": 340063179.0, + "step": 8916 + }, + { + "epoch": 1.1343340541915787, + "ewc_loss": 0.03797806054353714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8989030650118366e-05, + "grad_norm": 26.54066276550293, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8733734488487244, + "num_tokens": 340100166.0, + "step": 8917 + }, + { + "epoch": 1.1344612644701693, + "ewc_loss": 0.037986185401678085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8993092453456484e-05, + "grad_norm": 26.464406967163086, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8597226142883301, + "num_tokens": 340144325.0, + "step": 8918 + }, + { + "epoch": 1.1345884747487598, + "ewc_loss": 0.03797106817364693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.898553455248475e-05, + "grad_norm": 26.583200454711914, + "learning_rate": 1e-06, + "loss": 0.433, + 
"mean_token_accuracy": 0.8628043532371521, + "num_tokens": 340180563.0, + "step": 8919 + }, + { + "epoch": 1.1347156850273503, + "ewc_loss": 0.03801734372973442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9008672097697854e-05, + "grad_norm": 26.52899742126465, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8504722118377686, + "num_tokens": 340224621.0, + "step": 8920 + }, + { + "epoch": 1.1348428953059406, + "ewc_loss": 0.03797614201903343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8988070223713294e-05, + "grad_norm": 26.5311279296875, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.868384599685669, + "num_tokens": 340264810.0, + "step": 8921 + }, + { + "epoch": 1.1349701055845312, + "ewc_loss": 0.038016680628061295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.900834104162641e-05, + "grad_norm": 26.607927322387695, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8756067752838135, + "num_tokens": 340301158.0, + "step": 8922 + }, + { + "epoch": 1.1350973158631217, + "ewc_loss": 0.037959884852170944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.897994297905825e-05, + "grad_norm": 26.545957565307617, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8636334538459778, + "num_tokens": 340339806.0, + "step": 8923 + }, + { + "epoch": 1.1352245261417122, + "ewc_loss": 0.03796250745654106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8981254470418207e-05, + "grad_norm": 26.558773040771484, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8691425323486328, + "num_tokens": 340386412.0, + "step": 8924 + }, + { + "epoch": 1.1353517364203027, + "ewc_loss": 0.03795194998383522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8975975763169117e-05, + "grad_norm": 26.48120880126953, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8732410669326782, + "num_tokens": 340431391.0, + "step": 8925 + }, + { + "epoch": 
1.1354789466988933, + "ewc_loss": 0.03793894499540329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.896947287605144e-05, + "grad_norm": 26.60931968688965, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8674712181091309, + "num_tokens": 340465904.0, + "step": 8926 + }, + { + "epoch": 1.1356061569774838, + "ewc_loss": 0.03789028152823448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8945140254800208e-05, + "grad_norm": 26.437206268310547, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8674274682998657, + "num_tokens": 340504290.0, + "step": 8927 + }, + { + "epoch": 1.1357333672560743, + "ewc_loss": 0.03791781887412071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.895891000458505e-05, + "grad_norm": 26.551424026489258, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8585021495819092, + "num_tokens": 340539100.0, + "step": 8928 + }, + { + "epoch": 1.1358605775346649, + "ewc_loss": 0.03792387619614601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8961938621941954e-05, + "grad_norm": 26.50072479248047, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.878254771232605, + "num_tokens": 340569005.0, + "step": 8929 + }, + { + "epoch": 1.1359877878132554, + "ewc_loss": 0.03795531392097473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8977656509377994e-05, + "grad_norm": 26.561429977416992, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8631646037101746, + "num_tokens": 340613476.0, + "step": 8930 + }, + { + "epoch": 1.136114998091846, + "ewc_loss": 0.03794247657060623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8971239114762284e-05, + "grad_norm": 26.481462478637695, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8558793067932129, + "num_tokens": 340647982.0, + "step": 8931 + }, + { + "epoch": 1.1362422083704362, + "ewc_loss": 0.0379645861685276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.8982293113367632e-05, + "grad_norm": 26.617626190185547, + "learning_rate": 1e-06, + "loss": 0.5162, + "mean_token_accuracy": 0.8410006761550903, + "num_tokens": 340683270.0, + "step": 8932 + }, + { + "epoch": 1.1363694186490267, + "ewc_loss": 0.03794819116592407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.897409492812585e-05, + "grad_norm": 26.58098602294922, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8663004040718079, + "num_tokens": 340719955.0, + "step": 8933 + }, + { + "epoch": 1.1364966289276173, + "ewc_loss": 0.03789447247982025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8947235730593093e-05, + "grad_norm": 26.508424758911133, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8637851476669312, + "num_tokens": 340753934.0, + "step": 8934 + }, + { + "epoch": 1.1366238392062078, + "ewc_loss": 0.0379500538110733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8975026250700466e-05, + "grad_norm": 26.52523422241211, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8699722290039062, + "num_tokens": 340787545.0, + "step": 8935 + }, + { + "epoch": 1.1367510494847983, + "ewc_loss": 0.03797499090433121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8987495423061773e-05, + "grad_norm": 26.560209274291992, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8732101321220398, + "num_tokens": 340826362.0, + "step": 8936 + }, + { + "epoch": 1.1368782597633889, + "ewc_loss": 0.037991978228092194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.899598828458693e-05, + "grad_norm": 26.471521377563477, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8683068752288818, + "num_tokens": 340857871.0, + "step": 8937 + }, + { + "epoch": 1.1370054700419794, + "ewc_loss": 0.038014013320207596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9007005903404206e-05, + "grad_norm": 26.552377700805664, + "learning_rate": 1e-06, + "loss": 0.4183, + 
"mean_token_accuracy": 0.8695254921913147, + "num_tokens": 340897736.0, + "step": 8938 + }, + { + "epoch": 1.13713268032057, + "ewc_loss": 0.03797902911901474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.898951450129971e-05, + "grad_norm": 26.52404022216797, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8593575954437256, + "num_tokens": 340935774.0, + "step": 8939 + }, + { + "epoch": 1.1372598905991604, + "ewc_loss": 0.03805949538946152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9029746908927336e-05, + "grad_norm": 26.60223960876465, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.866677463054657, + "num_tokens": 340971497.0, + "step": 8940 + }, + { + "epoch": 1.137387100877751, + "ewc_loss": 0.03797836974263191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8989185264217667e-05, + "grad_norm": 26.50465202331543, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.864849865436554, + "num_tokens": 341016209.0, + "step": 8941 + }, + { + "epoch": 1.1375143111563415, + "ewc_loss": 0.038010433316230774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9005216017831117e-05, + "grad_norm": 26.50967025756836, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8584405183792114, + "num_tokens": 341052137.0, + "step": 8942 + }, + { + "epoch": 1.137641521434932, + "ewc_loss": 0.0380360446870327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.901802170323208e-05, + "grad_norm": 26.58820152282715, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8663861751556396, + "num_tokens": 341083417.0, + "step": 8943 + }, + { + "epoch": 1.1377687317135226, + "ewc_loss": 0.03800240904092789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9001205146196298e-05, + "grad_norm": 26.48293113708496, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8672515153884888, + "num_tokens": 341127798.0, + "step": 8944 + }, + { + "epoch": 1.137895941992113, + 
"ewc_loss": 0.03801299259066582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9006496586371213e-05, + "grad_norm": 26.50844955444336, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8494499921798706, + "num_tokens": 341173485.0, + "step": 8945 + }, + { + "epoch": 1.1380231522707034, + "ewc_loss": 0.03800560534000397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.900280221889261e-05, + "grad_norm": 26.57134246826172, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8530611991882324, + "num_tokens": 341212834.0, + "step": 8946 + }, + { + "epoch": 1.138150362549294, + "ewc_loss": 0.03805989399552345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9029946997761726e-05, + "grad_norm": 26.603147506713867, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8564313054084778, + "num_tokens": 341246497.0, + "step": 8947 + }, + { + "epoch": 1.1382775728278844, + "ewc_loss": 0.0380038358271122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9001918190042488e-05, + "grad_norm": 26.579715728759766, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8473759889602661, + "num_tokens": 341291498.0, + "step": 8948 + }, + { + "epoch": 1.138404783106475, + "ewc_loss": 0.03802656754851341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9013283235835843e-05, + "grad_norm": 26.598560333251953, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8575425148010254, + "num_tokens": 341328104.0, + "step": 8949 + }, + { + "epoch": 1.1385319933850655, + "ewc_loss": 0.03802217170596123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.901108589663636e-05, + "grad_norm": 26.54375457763672, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.875484049320221, + "num_tokens": 341359042.0, + "step": 8950 + }, + { + "epoch": 1.138659203663656, + "ewc_loss": 0.03801055625081062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9005277863470837e-05, + "grad_norm": 
26.502910614013672, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8627365827560425, + "num_tokens": 341403529.0, + "step": 8951 + }, + { + "epoch": 1.1387864139422466, + "ewc_loss": 0.03803253173828125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9016266378457658e-05, + "grad_norm": 26.478012084960938, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8712509870529175, + "num_tokens": 341442509.0, + "step": 8952 + }, + { + "epoch": 1.138913624220837, + "ewc_loss": 0.03797943517565727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8989718228112906e-05, + "grad_norm": 26.614181518554688, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8609272241592407, + "num_tokens": 341483015.0, + "step": 8953 + }, + { + "epoch": 1.1390408344994276, + "ewc_loss": 0.03806067258119583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9030336261494085e-05, + "grad_norm": 26.557031631469727, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8628070950508118, + "num_tokens": 341519676.0, + "step": 8954 + }, + { + "epoch": 1.1391680447780181, + "ewc_loss": 0.037934791296720505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.896739559015259e-05, + "grad_norm": 26.590770721435547, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8632134795188904, + "num_tokens": 341563483.0, + "step": 8955 + }, + { + "epoch": 1.1392952550566087, + "ewc_loss": 0.037979889661073685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.898994560178835e-05, + "grad_norm": 26.56421661376953, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8541322946548462, + "num_tokens": 341598387.0, + "step": 8956 + }, + { + "epoch": 1.139422465335199, + "ewc_loss": 0.037932656705379486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.896632784337271e-05, + "grad_norm": 26.545202255249023, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8671718239784241, + 
"num_tokens": 341640253.0, + "step": 8957 + }, + { + "epoch": 1.1395496756137895, + "ewc_loss": 0.037987787276506424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8993892808794044e-05, + "grad_norm": 26.55481719970703, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8909069299697876, + "num_tokens": 341675788.0, + "step": 8958 + }, + { + "epoch": 1.13967688589238, + "ewc_loss": 0.037955354899168015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8977678337250836e-05, + "grad_norm": 26.468175888061523, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8636041283607483, + "num_tokens": 341716917.0, + "step": 8959 + }, + { + "epoch": 1.1398040961709706, + "ewc_loss": 0.038059186190366745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9029592294828035e-05, + "grad_norm": 26.498355865478516, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.863238513469696, + "num_tokens": 341758936.0, + "step": 8960 + }, + { + "epoch": 1.139931306449561, + "ewc_loss": 0.03801816329360008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9009081370313652e-05, + "grad_norm": 26.530282974243164, + "learning_rate": 1e-06, + "loss": 0.5094, + "mean_token_accuracy": 0.8396562933921814, + "num_tokens": 341797612.0, + "step": 8961 + }, + { + "epoch": 1.1400585167281516, + "ewc_loss": 0.03802085667848587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9010427422472276e-05, + "grad_norm": 26.473979949951172, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8597429990768433, + "num_tokens": 341836061.0, + "step": 8962 + }, + { + "epoch": 1.1401857270067421, + "ewc_loss": 0.0380360409617424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9018019884242676e-05, + "grad_norm": 26.62995719909668, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8636693954467773, + "num_tokens": 341875465.0, + "step": 8963 + }, + { + "epoch": 1.1403129372853327, + "ewc_loss": 0.038050804287195206, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9025401343242265e-05, + "grad_norm": 26.537555694580078, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8577322959899902, + "num_tokens": 341919795.0, + "step": 8964 + }, + { + "epoch": 1.1404401475639232, + "ewc_loss": 0.03795516490936279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8977581930812448e-05, + "grad_norm": 26.608909606933594, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8596784472465515, + "num_tokens": 341953291.0, + "step": 8965 + }, + { + "epoch": 1.1405673578425137, + "ewc_loss": 0.0379515141248703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8975757484440692e-05, + "grad_norm": 26.415939331054688, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8825063705444336, + "num_tokens": 341988364.0, + "step": 8966 + }, + { + "epoch": 1.1406945681211043, + "ewc_loss": 0.03803735971450806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9018680177396163e-05, + "grad_norm": 26.643840789794922, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8673210144042969, + "num_tokens": 342028478.0, + "step": 8967 + }, + { + "epoch": 1.1408217783996948, + "ewc_loss": 0.038092583417892456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.904629243654199e-05, + "grad_norm": 26.531530380249023, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8703848123550415, + "num_tokens": 342067348.0, + "step": 8968 + }, + { + "epoch": 1.1409489886782853, + "ewc_loss": 0.03793829679489136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.896914909593761e-05, + "grad_norm": 26.47075080871582, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8597049713134766, + "num_tokens": 342105845.0, + "step": 8969 + }, + { + "epoch": 1.1410761989568756, + "ewc_loss": 0.03806278482079506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9031393094337545e-05, + "grad_norm": 26.561925888061523, + 
"learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8633860349655151, + "num_tokens": 342142072.0, + "step": 8970 + }, + { + "epoch": 1.1412034092354661, + "ewc_loss": 0.037992317229509354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8996159269590862e-05, + "grad_norm": 26.462482452392578, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8655048608779907, + "num_tokens": 342176592.0, + "step": 8971 + }, + { + "epoch": 1.1413306195140567, + "ewc_loss": 0.03801698237657547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9008492017746903e-05, + "grad_norm": 26.59614372253418, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8668504953384399, + "num_tokens": 342211896.0, + "step": 8972 + }, + { + "epoch": 1.1414578297926472, + "ewc_loss": 0.0380512997508049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.902565054479055e-05, + "grad_norm": 26.4931583404541, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8672698140144348, + "num_tokens": 342256078.0, + "step": 8973 + }, + { + "epoch": 1.1415850400712377, + "ewc_loss": 0.037982381880283356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.899119160952978e-05, + "grad_norm": 26.531429290771484, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.865994930267334, + "num_tokens": 342296766.0, + "step": 8974 + }, + { + "epoch": 1.1417122503498283, + "ewc_loss": 0.038019269704818726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.900963434309233e-05, + "grad_norm": 26.50054168701172, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.863081693649292, + "num_tokens": 342333689.0, + "step": 8975 + }, + { + "epoch": 1.1418394606284188, + "ewc_loss": 0.03807373717427254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.903686825244222e-05, + "grad_norm": 26.579782485961914, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8736737966537476, + "num_tokens": 342369605.0, + 
"step": 8976 + }, + { + "epoch": 1.1419666709070093, + "ewc_loss": 0.03796045109629631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.89802249224158e-05, + "grad_norm": 26.43937873840332, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8725956678390503, + "num_tokens": 342410806.0, + "step": 8977 + }, + { + "epoch": 1.1420938811855998, + "ewc_loss": 0.03806593269109726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9032966520171613e-05, + "grad_norm": 26.688430786132812, + "learning_rate": 1e-06, + "loss": 0.3584, + "mean_token_accuracy": 0.8897194266319275, + "num_tokens": 342446957.0, + "step": 8978 + }, + { + "epoch": 1.1422210914641904, + "ewc_loss": 0.03803926706314087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.901963332784362e-05, + "grad_norm": 26.575931549072266, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8516976833343506, + "num_tokens": 342480717.0, + "step": 8979 + }, + { + "epoch": 1.142348301742781, + "ewc_loss": 0.03801795095205307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9008975868928246e-05, + "grad_norm": 26.609621047973633, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8713648915290833, + "num_tokens": 342520431.0, + "step": 8980 + }, + { + "epoch": 1.1424755120213712, + "ewc_loss": 0.03795835003256798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8979175365529954e-05, + "grad_norm": 26.558996200561523, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8600757718086243, + "num_tokens": 342561735.0, + "step": 8981 + }, + { + "epoch": 1.1426027222999617, + "ewc_loss": 0.03799985349178314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8999926396645606e-05, + "grad_norm": 26.6076717376709, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.875917911529541, + "num_tokens": 342593899.0, + "step": 8982 + }, + { + "epoch": 1.1427299325785523, + "ewc_loss": 0.03796711936593056, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.89835591299925e-05, + "grad_norm": 26.49965476989746, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8622291088104248, + "num_tokens": 342628044.0, + "step": 8983 + }, + { + "epoch": 1.1428571428571428, + "ewc_loss": 0.037999898195266724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8999948224518448e-05, + "grad_norm": 26.6055850982666, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8612580299377441, + "num_tokens": 342665018.0, + "step": 8984 + }, + { + "epoch": 1.1429843531357333, + "ewc_loss": 0.03802211955189705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.901106043078471e-05, + "grad_norm": 26.523181915283203, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.852984607219696, + "num_tokens": 342704832.0, + "step": 8985 + }, + { + "epoch": 1.1431115634143239, + "ewc_loss": 0.03801405057311058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9007025912287645e-05, + "grad_norm": 26.581663131713867, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8703779578208923, + "num_tokens": 342741598.0, + "step": 8986 + }, + { + "epoch": 1.1432387736929144, + "ewc_loss": 0.037956591695547104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8978294974658638e-05, + "grad_norm": 26.51905059814453, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8590072989463806, + "num_tokens": 342777302.0, + "step": 8987 + }, + { + "epoch": 1.143365983971505, + "ewc_loss": 0.03807922080159187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9039611288462766e-05, + "grad_norm": 26.54022216796875, + "learning_rate": 1e-06, + "loss": 0.5051, + "mean_token_accuracy": 0.8440282344818115, + "num_tokens": 342809578.0, + "step": 8988 + }, + { + "epoch": 1.1434931942500954, + "ewc_loss": 0.0380420945584774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9021046682610177e-05, + "grad_norm": 26.545433044433594, + "learning_rate": 1e-06, + "loss": 0.4285, 
+ "mean_token_accuracy": 0.8655692934989929, + "num_tokens": 342845809.0, + "step": 8989 + }, + { + "epoch": 1.143620404528686, + "ewc_loss": 0.03811486437916756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9057431927649304e-05, + "grad_norm": 26.50497817993164, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8494291305541992, + "num_tokens": 342886118.0, + "step": 8990 + }, + { + "epoch": 1.1437476148072765, + "ewc_loss": 0.03812786191701889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9063931176788174e-05, + "grad_norm": 26.56633186340332, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8585135340690613, + "num_tokens": 342922517.0, + "step": 8991 + }, + { + "epoch": 1.143874825085867, + "ewc_loss": 0.03813715651631355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9068578694714233e-05, + "grad_norm": 26.464170455932617, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8578306436538696, + "num_tokens": 342959741.0, + "step": 8992 + }, + { + "epoch": 1.1440020353644575, + "ewc_loss": 0.03809607774019241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9048038666369393e-05, + "grad_norm": 26.53217124938965, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.876663863658905, + "num_tokens": 342991418.0, + "step": 8993 + }, + { + "epoch": 1.144129245643048, + "ewc_loss": 0.038170233368873596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9085116946371272e-05, + "grad_norm": 26.54633903503418, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.882310152053833, + "num_tokens": 343024263.0, + "step": 8994 + }, + { + "epoch": 1.1442564559216384, + "ewc_loss": 0.03816268593072891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9081342543358915e-05, + "grad_norm": 26.547346115112305, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8642708659172058, + "num_tokens": 343066388.0, + "step": 8995 + }, + { + "epoch": 
1.144383666200229, + "ewc_loss": 0.03815630078315735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907815021695569e-05, + "grad_norm": 26.453880310058594, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8644800186157227, + "num_tokens": 343107631.0, + "step": 8996 + }, + { + "epoch": 1.1445108764788194, + "ewc_loss": 0.03817712143063545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9088560293312185e-05, + "grad_norm": 26.587129592895508, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8698451519012451, + "num_tokens": 343143645.0, + "step": 8997 + }, + { + "epoch": 1.14463808675741, + "ewc_loss": 0.038233138620853424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9116569092147984e-05, + "grad_norm": 26.509899139404297, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8509609699249268, + "num_tokens": 343182076.0, + "step": 8998 + }, + { + "epoch": 1.1447652970360005, + "ewc_loss": 0.03816327825188637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9081639038631693e-05, + "grad_norm": 26.48968505859375, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8822720646858215, + "num_tokens": 343222808.0, + "step": 8999 + }, + { + "epoch": 1.144892507314591, + "ewc_loss": 0.03825130686163902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912565312522929e-05, + "grad_norm": 26.604738235473633, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8599807024002075, + "num_tokens": 343268229.0, + "step": 9000 + }, + { + "epoch": 1.1450197175931816, + "ewc_loss": 0.038195669651031494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9097835320280865e-05, + "grad_norm": 26.603944778442383, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8551723957061768, + "num_tokens": 343304459.0, + "step": 9001 + }, + { + "epoch": 1.145146927871772, + "ewc_loss": 0.038168299943208694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9084149244008586e-05, + "grad_norm": 26.549381256103516, + "learning_rate": 1e-06, + "loss": 0.5656, + "mean_token_accuracy": 0.8264783024787903, + "num_tokens": 343343211.0, + "step": 9002 + }, + { + "epoch": 1.1452741381503626, + "ewc_loss": 0.03816438838839531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9082193830399774e-05, + "grad_norm": 26.54571533203125, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8595008850097656, + "num_tokens": 343388498.0, + "step": 9003 + }, + { + "epoch": 1.1454013484289531, + "ewc_loss": 0.03811504691839218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.905752287711948e-05, + "grad_norm": 26.535863876342773, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8617008924484253, + "num_tokens": 343430101.0, + "step": 9004 + }, + { + "epoch": 1.1455285587075437, + "ewc_loss": 0.038083791732788086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.904189593915362e-05, + "grad_norm": 26.513792037963867, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.852552056312561, + "num_tokens": 343466454.0, + "step": 9005 + }, + { + "epoch": 1.145655768986134, + "ewc_loss": 0.03812752664089203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9063763829763047e-05, + "grad_norm": 26.500272750854492, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8794695734977722, + "num_tokens": 343506960.0, + "step": 9006 + }, + { + "epoch": 1.1457829792647245, + "ewc_loss": 0.038033295422792435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9016648366232403e-05, + "grad_norm": 26.428678512573242, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8718405365943909, + "num_tokens": 343541133.0, + "step": 9007 + }, + { + "epoch": 1.145910189543315, + "ewc_loss": 0.03814004361629486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9070021153311245e-05, + "grad_norm": 26.55093765258789, + "learning_rate": 1e-06, + "loss": 0.4169, + 
"mean_token_accuracy": 0.8717784881591797, + "num_tokens": 343580266.0, + "step": 9008 + }, + { + "epoch": 1.1460373998219056, + "ewc_loss": 0.03818735107779503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9093675291514955e-05, + "grad_norm": 26.69926643371582, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8615908026695251, + "num_tokens": 343611691.0, + "step": 9009 + }, + { + "epoch": 1.146164610100496, + "ewc_loss": 0.03816889226436615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9084445739281364e-05, + "grad_norm": 26.57790184020996, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8694412708282471, + "num_tokens": 343649355.0, + "step": 9010 + }, + { + "epoch": 1.1462918203790866, + "ewc_loss": 0.03810947760939598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9054738004342653e-05, + "grad_norm": 26.695383071899414, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.878022313117981, + "num_tokens": 343681606.0, + "step": 9011 + }, + { + "epoch": 1.1464190306576771, + "ewc_loss": 0.03816152736544609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9080764104728587e-05, + "grad_norm": 26.569032669067383, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8747007250785828, + "num_tokens": 343721592.0, + "step": 9012 + }, + { + "epoch": 1.1465462409362677, + "ewc_loss": 0.03812422603368759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9062112187384628e-05, + "grad_norm": 26.63231086730957, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8603628873825073, + "num_tokens": 343764084.0, + "step": 9013 + }, + { + "epoch": 1.1466734512148582, + "ewc_loss": 0.03813960403203964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.906980287458282e-05, + "grad_norm": 26.579500198364258, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8659999966621399, + "num_tokens": 343801213.0, + "step": 9014 + }, + { + "epoch": 
1.1468006614934487, + "ewc_loss": 0.03804393857717514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9021968910237774e-05, + "grad_norm": 26.552194595336914, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8644146919250488, + "num_tokens": 343839761.0, + "step": 9015 + }, + { + "epoch": 1.1469278717720393, + "ewc_loss": 0.03814540430903435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9072702343692072e-05, + "grad_norm": 26.65325355529785, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8632179498672485, + "num_tokens": 343877792.0, + "step": 9016 + }, + { + "epoch": 1.1470550820506298, + "ewc_loss": 0.03808416798710823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9042083295062184e-05, + "grad_norm": 26.559663772583008, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8703550100326538, + "num_tokens": 343915107.0, + "step": 9017 + }, + { + "epoch": 1.1471822923292203, + "ewc_loss": 0.03803705424070358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9018527382286265e-05, + "grad_norm": 26.55620765686035, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.868098258972168, + "num_tokens": 343949783.0, + "step": 9018 + }, + { + "epoch": 1.1473095026078106, + "ewc_loss": 0.03817245364189148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9086226529907435e-05, + "grad_norm": 26.619876861572266, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8453162312507629, + "num_tokens": 343991058.0, + "step": 9019 + }, + { + "epoch": 1.1474367128864011, + "ewc_loss": 0.03810260072350502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.905130011436995e-05, + "grad_norm": 26.62750816345215, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8642646670341492, + "num_tokens": 344030424.0, + "step": 9020 + }, + { + "epoch": 1.1475639231649917, + "ewc_loss": 0.03814192861318588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9070965208811685e-05, + "grad_norm": 26.57459831237793, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8650186061859131, + "num_tokens": 344071385.0, + "step": 9021 + }, + { + "epoch": 1.1476911334435822, + "ewc_loss": 0.03799488767981529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8997443476109765e-05, + "grad_norm": 26.57691764831543, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8734465837478638, + "num_tokens": 344109725.0, + "step": 9022 + }, + { + "epoch": 1.1478183437221727, + "ewc_loss": 0.038106519728899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.905325916595757e-05, + "grad_norm": 26.7041015625, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8712059259414673, + "num_tokens": 344143947.0, + "step": 9023 + }, + { + "epoch": 1.1479455540007633, + "ewc_loss": 0.038062311708927155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9031156625715084e-05, + "grad_norm": 26.655685424804688, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8552349805831909, + "num_tokens": 344179969.0, + "step": 9024 + }, + { + "epoch": 1.1480727642793538, + "ewc_loss": 0.0380164198577404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9008210074389353e-05, + "grad_norm": 26.715591430664062, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8749768137931824, + "num_tokens": 344212279.0, + "step": 9025 + }, + { + "epoch": 1.1481999745579443, + "ewc_loss": 0.0380251482129097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.901257382996846e-05, + "grad_norm": 26.615747451782227, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.870023250579834, + "num_tokens": 344248402.0, + "step": 9026 + }, + { + "epoch": 1.1483271848365348, + "ewc_loss": 0.03796177729964256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.8980888853548095e-05, + "grad_norm": 26.51427459716797, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 
0.8602541089057922, + "num_tokens": 344286557.0, + "step": 9027 + }, + { + "epoch": 1.1484543951151254, + "ewc_loss": 0.03808220475912094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9041102859773673e-05, + "grad_norm": 26.566476821899414, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8645960688591003, + "num_tokens": 344331014.0, + "step": 9028 + }, + { + "epoch": 1.148581605393716, + "ewc_loss": 0.038089483976364136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.904474265757017e-05, + "grad_norm": 26.58722686767578, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8611739873886108, + "num_tokens": 344373239.0, + "step": 9029 + }, + { + "epoch": 1.1487088156723062, + "ewc_loss": 0.038074567914009094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.903728480101563e-05, + "grad_norm": 26.44746971130371, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8781968355178833, + "num_tokens": 344409871.0, + "step": 9030 + }, + { + "epoch": 1.1488360259508967, + "ewc_loss": 0.03808584809303284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9042923668166623e-05, + "grad_norm": 26.55540657043457, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8534493446350098, + "num_tokens": 344452108.0, + "step": 9031 + }, + { + "epoch": 1.1489632362294873, + "ewc_loss": 0.03813773766160011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.90688697330188e-05, + "grad_norm": 26.55023765563965, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8585324287414551, + "num_tokens": 344489798.0, + "step": 9032 + }, + { + "epoch": 1.1490904465080778, + "ewc_loss": 0.038167908787727356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9083954612142406e-05, + "grad_norm": 26.614402770996094, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8715835809707642, + "num_tokens": 344528194.0, + "step": 9033 + }, + { + "epoch": 1.1492176567866683, + "ewc_loss": 
0.03816394507884979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.908197191369254e-05, + "grad_norm": 26.572107315063477, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8498294949531555, + "num_tokens": 344559699.0, + "step": 9034 + }, + { + "epoch": 1.1493448670652588, + "ewc_loss": 0.03809712082147598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.904856071632821e-05, + "grad_norm": 26.62905502319336, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8568510413169861, + "num_tokens": 344598234.0, + "step": 9035 + }, + { + "epoch": 1.1494720773438494, + "ewc_loss": 0.03817380592226982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9086903193965554e-05, + "grad_norm": 26.57539176940918, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8567976951599121, + "num_tokens": 344634780.0, + "step": 9036 + }, + { + "epoch": 1.14959928762244, + "ewc_loss": 0.0381142757833004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.905713725136593e-05, + "grad_norm": 26.591514587402344, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8682566285133362, + "num_tokens": 344677167.0, + "step": 9037 + }, + { + "epoch": 1.1497264979010304, + "ewc_loss": 0.03819020465016365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.909510319819674e-05, + "grad_norm": 26.59832763671875, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8514531850814819, + "num_tokens": 344713824.0, + "step": 9038 + }, + { + "epoch": 1.149853708179621, + "ewc_loss": 0.03813136741518974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9065682863583788e-05, + "grad_norm": 26.503244400024414, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8506662845611572, + "num_tokens": 344755814.0, + "step": 9039 + }, + { + "epoch": 1.1499809184582115, + "ewc_loss": 0.03817514702677727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9087574401055463e-05, + "grad_norm": 26.65395736694336, 
+ "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8783090114593506, + "num_tokens": 344795022.0, + "step": 9040 + }, + { + "epoch": 1.150108128736802, + "ewc_loss": 0.0381619893014431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9080995116382837e-05, + "grad_norm": 26.48822784423828, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8733800649642944, + "num_tokens": 344833906.0, + "step": 9041 + }, + { + "epoch": 1.1502353390153925, + "ewc_loss": 0.03815878927707672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9079394405707717e-05, + "grad_norm": 26.71064567565918, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8700875043869019, + "num_tokens": 344875975.0, + "step": 9042 + }, + { + "epoch": 1.150362549293983, + "ewc_loss": 0.03814909607172012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907454861793667e-05, + "grad_norm": 26.515336990356445, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.866634726524353, + "num_tokens": 344915433.0, + "step": 9043 + }, + { + "epoch": 1.1504897595725734, + "ewc_loss": 0.03808320686221123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9041603081859648e-05, + "grad_norm": 26.533023834228516, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8689039349555969, + "num_tokens": 344949038.0, + "step": 9044 + }, + { + "epoch": 1.150616969851164, + "ewc_loss": 0.038157761096954346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9078879631706513e-05, + "grad_norm": 26.546091079711914, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8657788038253784, + "num_tokens": 344988260.0, + "step": 9045 + }, + { + "epoch": 1.1507441801297544, + "ewc_loss": 0.03817640244960785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9088201952399686e-05, + "grad_norm": 26.517166137695312, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8762860894203186, + "num_tokens": 345024355.0, + 
"step": 9046 + }, + { + "epoch": 1.150871390408345, + "ewc_loss": 0.03820240870118141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9101204088656232e-05, + "grad_norm": 26.596302032470703, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8736705780029297, + "num_tokens": 345063041.0, + "step": 9047 + }, + { + "epoch": 1.1509986006869355, + "ewc_loss": 0.03819500654935837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9097502445220016e-05, + "grad_norm": 26.68411636352539, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8844483494758606, + "num_tokens": 345097813.0, + "step": 9048 + }, + { + "epoch": 1.151125810965526, + "ewc_loss": 0.038139913231134415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.906995748868212e-05, + "grad_norm": 26.66274070739746, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8601890206336975, + "num_tokens": 345134534.0, + "step": 9049 + }, + { + "epoch": 1.1512530212441165, + "ewc_loss": 0.038130972534418106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9065486412728205e-05, + "grad_norm": 26.515439987182617, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8683279752731323, + "num_tokens": 345171018.0, + "step": 9050 + }, + { + "epoch": 1.151380231522707, + "ewc_loss": 0.03808571398258209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.904285636555869e-05, + "grad_norm": 26.563953399658203, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8757786154747009, + "num_tokens": 345208006.0, + "step": 9051 + }, + { + "epoch": 1.1515074418012976, + "ewc_loss": 0.03818044066429138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9090221030637622e-05, + "grad_norm": 26.585994720458984, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8816630840301514, + "num_tokens": 345246616.0, + "step": 9052 + }, + { + "epoch": 1.1516346520798881, + "ewc_loss": 0.03812611475586891, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.906305806187447e-05, + "grad_norm": 26.581912994384766, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8807482719421387, + "num_tokens": 345288095.0, + "step": 9053 + }, + { + "epoch": 1.1517618623584787, + "ewc_loss": 0.03818564862012863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9092824004474096e-05, + "grad_norm": 26.706266403198242, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8500388860702515, + "num_tokens": 345333291.0, + "step": 9054 + }, + { + "epoch": 1.151889072637069, + "ewc_loss": 0.03806859999895096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.903429983940441e-05, + "grad_norm": 26.481788635253906, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8634837865829468, + "num_tokens": 345375093.0, + "step": 9055 + }, + { + "epoch": 1.1520162829156595, + "ewc_loss": 0.03809274360537529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9046372472075745e-05, + "grad_norm": 26.5786075592041, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8584790825843811, + "num_tokens": 345415547.0, + "step": 9056 + }, + { + "epoch": 1.15214349319425, + "ewc_loss": 0.03819773718714714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.909886850626208e-05, + "grad_norm": 26.65003204345703, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.867531418800354, + "num_tokens": 345457340.0, + "step": 9057 + }, + { + "epoch": 1.1522707034728406, + "ewc_loss": 0.038127075880765915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9063538275077008e-05, + "grad_norm": 26.636516571044922, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8602761626243591, + "num_tokens": 345494711.0, + "step": 9058 + }, + { + "epoch": 1.152397913751431, + "ewc_loss": 0.038099680095911026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9049839465878904e-05, + "grad_norm": 26.55131721496582, + "learning_rate": 1e-06, + "loss": 0.4667, + 
"mean_token_accuracy": 0.8562394380569458, + "num_tokens": 345534249.0, + "step": 9059 + }, + { + "epoch": 1.1525251240300216, + "ewc_loss": 0.038149990141391754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907499427034054e-05, + "grad_norm": 26.651386260986328, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8633214235305786, + "num_tokens": 345570832.0, + "step": 9060 + }, + { + "epoch": 1.1526523343086121, + "ewc_loss": 0.0381045900285244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.905229510157369e-05, + "grad_norm": 26.622779846191406, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8589667081832886, + "num_tokens": 345609007.0, + "step": 9061 + }, + { + "epoch": 1.1527795445872027, + "ewc_loss": 0.038104720413684845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.905236058519222e-05, + "grad_norm": 26.60154914855957, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8759186267852783, + "num_tokens": 345647979.0, + "step": 9062 + }, + { + "epoch": 1.1529067548657932, + "ewc_loss": 0.03801363334059715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9006816728506237e-05, + "grad_norm": 26.57077980041504, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8565225005149841, + "num_tokens": 345681821.0, + "step": 9063 + }, + { + "epoch": 1.1530339651443837, + "ewc_loss": 0.03809424489736557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9047121895710006e-05, + "grad_norm": 26.626506805419922, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.853766918182373, + "num_tokens": 345724274.0, + "step": 9064 + }, + { + "epoch": 1.1531611754229742, + "ewc_loss": 0.03807216137647629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9036080630030483e-05, + "grad_norm": 26.569780349731445, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.879610002040863, + "num_tokens": 345763558.0, + "step": 9065 + }, + { + "epoch": 
1.1532883857015648, + "ewc_loss": 0.038114335387945175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.905716817418579e-05, + "grad_norm": 26.593116760253906, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8594633340835571, + "num_tokens": 345798486.0, + "step": 9066 + }, + { + "epoch": 1.1534155959801553, + "ewc_loss": 0.03810090199112892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9050450646318495e-05, + "grad_norm": 26.568973541259766, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8600695133209229, + "num_tokens": 345839379.0, + "step": 9067 + }, + { + "epoch": 1.1535428062587456, + "ewc_loss": 0.038126833736896515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.906341640278697e-05, + "grad_norm": 26.52920913696289, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8603715300559998, + "num_tokens": 345876675.0, + "step": 9068 + }, + { + "epoch": 1.1536700165373361, + "ewc_loss": 0.03806494548916817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.903247357404325e-05, + "grad_norm": 26.554378509521484, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8832930326461792, + "num_tokens": 345913482.0, + "step": 9069 + }, + { + "epoch": 1.1537972268159267, + "ewc_loss": 0.03816899657249451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9084498489974067e-05, + "grad_norm": 26.569358825683594, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8595235347747803, + "num_tokens": 345947671.0, + "step": 9070 + }, + { + "epoch": 1.1539244370945172, + "ewc_loss": 0.03816620633006096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9083103325101547e-05, + "grad_norm": 26.652780532836914, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8664959073066711, + "num_tokens": 345981520.0, + "step": 9071 + }, + { + "epoch": 1.1540516473731077, + "ewc_loss": 0.03824197128415108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9120985598419793e-05, + "grad_norm": 26.587757110595703, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8473715782165527, + "num_tokens": 346020050.0, + "step": 9072 + }, + { + "epoch": 1.1541788576516983, + "ewc_loss": 0.03813321143388748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9066605091211386e-05, + "grad_norm": 26.524587631225586, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8561501502990723, + "num_tokens": 346050000.0, + "step": 9073 + }, + { + "epoch": 1.1543060679302888, + "ewc_loss": 0.0382050946354866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.910254650283605e-05, + "grad_norm": 26.613422393798828, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8591614961624146, + "num_tokens": 346085038.0, + "step": 9074 + }, + { + "epoch": 1.1544332782088793, + "ewc_loss": 0.038212988525629044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9106493709841743e-05, + "grad_norm": 26.589303970336914, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8742789030075073, + "num_tokens": 346125871.0, + "step": 9075 + }, + { + "epoch": 1.1545604884874698, + "ewc_loss": 0.038267284631729126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9133642126689665e-05, + "grad_norm": 26.625640869140625, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8727308511734009, + "num_tokens": 346160460.0, + "step": 9076 + }, + { + "epoch": 1.1546876987660604, + "ewc_loss": 0.038270093500614166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9135046386509202e-05, + "grad_norm": 26.63850975036621, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8542516231536865, + "num_tokens": 346201621.0, + "step": 9077 + }, + { + "epoch": 1.154814909044651, + "ewc_loss": 0.03822999447584152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911499748530332e-05, + "grad_norm": 26.56085205078125, + "learning_rate": 1e-06, + "loss": 0.4368, + 
"mean_token_accuracy": 0.8671425580978394, + "num_tokens": 346242680.0, + "step": 9078 + }, + { + "epoch": 1.1549421193232412, + "ewc_loss": 0.0382101908326149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9105094906990416e-05, + "grad_norm": 26.680341720581055, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8550501465797424, + "num_tokens": 346275774.0, + "step": 9079 + }, + { + "epoch": 1.1550693296018317, + "ewc_loss": 0.03828751668334007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9143757526762784e-05, + "grad_norm": 26.61005210876465, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8657606840133667, + "num_tokens": 346317648.0, + "step": 9080 + }, + { + "epoch": 1.1551965398804223, + "ewc_loss": 0.03817478567361832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9087392502115108e-05, + "grad_norm": 26.59257698059082, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8678053021430969, + "num_tokens": 346357736.0, + "step": 9081 + }, + { + "epoch": 1.1553237501590128, + "ewc_loss": 0.03823551908135414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9117758711217903e-05, + "grad_norm": 26.612852096557617, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8629909753799438, + "num_tokens": 346397961.0, + "step": 9082 + }, + { + "epoch": 1.1554509604376033, + "ewc_loss": 0.03813682869076729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9068414985667914e-05, + "grad_norm": 26.46978759765625, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8753782510757446, + "num_tokens": 346438268.0, + "step": 9083 + }, + { + "epoch": 1.1555781707161938, + "ewc_loss": 0.03826800361275673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9134002286591567e-05, + "grad_norm": 26.75200080871582, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8547991514205933, + "num_tokens": 346479024.0, + "step": 9084 + }, + { + "epoch": 
1.1557053809947844, + "ewc_loss": 0.038203999400138855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.910199898702558e-05, + "grad_norm": 26.482614517211914, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8715558648109436, + "num_tokens": 346523526.0, + "step": 9085 + }, + { + "epoch": 1.155832591273375, + "ewc_loss": 0.038145240396261215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9072620489168912e-05, + "grad_norm": 26.576059341430664, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8730484247207642, + "num_tokens": 346561985.0, + "step": 9086 + }, + { + "epoch": 1.1559598015519654, + "ewc_loss": 0.03825715184211731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912857624120079e-05, + "grad_norm": 26.58026695251465, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8507169485092163, + "num_tokens": 346605600.0, + "step": 9087 + }, + { + "epoch": 1.156087011830556, + "ewc_loss": 0.03818101808428764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9090508430963382e-05, + "grad_norm": 26.57795524597168, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8772093057632446, + "num_tokens": 346640798.0, + "step": 9088 + }, + { + "epoch": 1.1562142221091465, + "ewc_loss": 0.038262926042079926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9131462977384217e-05, + "grad_norm": 26.565393447875977, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8600865602493286, + "num_tokens": 346678483.0, + "step": 9089 + }, + { + "epoch": 1.156341432387737, + "ewc_loss": 0.03815840184688568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907920159283094e-05, + "grad_norm": 26.592296600341797, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8793349266052246, + "num_tokens": 346714604.0, + "step": 9090 + }, + { + "epoch": 1.1564686426663275, + "ewc_loss": 0.03821594640612602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9107972548226826e-05, + "grad_norm": 26.67778205871582, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.874879002571106, + "num_tokens": 346747203.0, + "step": 9091 + }, + { + "epoch": 1.156595852944918, + "ewc_loss": 0.03812452405691147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.906226134451572e-05, + "grad_norm": 26.570051193237305, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8627906441688538, + "num_tokens": 346790650.0, + "step": 9092 + }, + { + "epoch": 1.1567230632235084, + "ewc_loss": 0.03817880526185036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9089402485406026e-05, + "grad_norm": 26.569547653198242, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8871393203735352, + "num_tokens": 346822872.0, + "step": 9093 + }, + { + "epoch": 1.156850273502099, + "ewc_loss": 0.038171347230672836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9085673557128757e-05, + "grad_norm": 26.681352615356445, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8709332942962646, + "num_tokens": 346857999.0, + "step": 9094 + }, + { + "epoch": 1.1569774837806894, + "ewc_loss": 0.038193538784980774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9096769392490387e-05, + "grad_norm": 26.681293487548828, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8718859553337097, + "num_tokens": 346891804.0, + "step": 9095 + }, + { + "epoch": 1.15710469405928, + "ewc_loss": 0.038134798407554626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9067399989580736e-05, + "grad_norm": 26.62171173095703, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8632932305335999, + "num_tokens": 346931554.0, + "step": 9096 + }, + { + "epoch": 1.1572319043378705, + "ewc_loss": 0.038161952048540115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9080975107499398e-05, + "grad_norm": 26.59428596496582, + "learning_rate": 1e-06, + "loss": 0.4757, + 
"mean_token_accuracy": 0.8513368964195251, + "num_tokens": 346965214.0, + "step": 9097 + }, + { + "epoch": 1.157359114616461, + "ewc_loss": 0.038208555430173874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9104278180748224e-05, + "grad_norm": 26.783367156982422, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8753130435943604, + "num_tokens": 347001219.0, + "step": 9098 + }, + { + "epoch": 1.1574863248950515, + "ewc_loss": 0.03823167085647583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9115836039418355e-05, + "grad_norm": 26.673927307128906, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8467739820480347, + "num_tokens": 347043212.0, + "step": 9099 + }, + { + "epoch": 1.157613535173642, + "ewc_loss": 0.03813347592949867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9066737877437845e-05, + "grad_norm": 26.654804229736328, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8772053718566895, + "num_tokens": 347082249.0, + "step": 9100 + }, + { + "epoch": 1.1577407454522326, + "ewc_loss": 0.03825997933745384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9129989595967345e-05, + "grad_norm": 26.67245101928711, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8617919683456421, + "num_tokens": 347119286.0, + "step": 9101 + }, + { + "epoch": 1.1578679557308231, + "ewc_loss": 0.03817594796419144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9087974578724243e-05, + "grad_norm": 26.553300857543945, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8774118423461914, + "num_tokens": 347155549.0, + "step": 9102 + }, + { + "epoch": 1.1579951660094137, + "ewc_loss": 0.038191549479961395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9095774405286647e-05, + "grad_norm": 26.59998321533203, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8673354983329773, + "num_tokens": 347196846.0, + "step": 9103 + }, + { + "epoch": 
1.158122376288004, + "ewc_loss": 0.03818679228425026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9093396986136213e-05, + "grad_norm": 26.49557876586914, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8784955739974976, + "num_tokens": 347234028.0, + "step": 9104 + }, + { + "epoch": 1.1582495865665945, + "ewc_loss": 0.03828151896595955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914075983222574e-05, + "grad_norm": 26.691926956176758, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8759177923202515, + "num_tokens": 347276931.0, + "step": 9105 + }, + { + "epoch": 1.158376796845185, + "ewc_loss": 0.038306474685668945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9153238099534065e-05, + "grad_norm": 26.516258239746094, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8612339496612549, + "num_tokens": 347317139.0, + "step": 9106 + }, + { + "epoch": 1.1585040071237755, + "ewc_loss": 0.03818407282233238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.909203638206236e-05, + "grad_norm": 26.748979568481445, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8672602772712708, + "num_tokens": 347355718.0, + "step": 9107 + }, + { + "epoch": 1.158631217402366, + "ewc_loss": 0.038286663591861725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9143331883242354e-05, + "grad_norm": 26.625490188598633, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8555924892425537, + "num_tokens": 347393403.0, + "step": 9108 + }, + { + "epoch": 1.1587584276809566, + "ewc_loss": 0.0381791815161705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.908958984131459e-05, + "grad_norm": 26.758207321166992, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8730642795562744, + "num_tokens": 347433736.0, + "step": 9109 + }, + { + "epoch": 1.1588856379595471, + "ewc_loss": 0.038287051022052765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.914352469611913e-05, + "grad_norm": 26.591812133789062, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8667532205581665, + "num_tokens": 347474574.0, + "step": 9110 + }, + { + "epoch": 1.1590128482381377, + "ewc_loss": 0.038147978484630585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907398836920038e-05, + "grad_norm": 26.57782554626465, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8712269067764282, + "num_tokens": 347518785.0, + "step": 9111 + }, + { + "epoch": 1.1591400585167282, + "ewc_loss": 0.0381963737308979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.909818638523575e-05, + "grad_norm": 26.684141159057617, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8524467945098877, + "num_tokens": 347557518.0, + "step": 9112 + }, + { + "epoch": 1.1592672687953187, + "ewc_loss": 0.038147199898958206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907359910546802e-05, + "grad_norm": 26.667339324951172, + "learning_rate": 1e-06, + "loss": 0.5223, + "mean_token_accuracy": 0.8362581729888916, + "num_tokens": 347600094.0, + "step": 9113 + }, + { + "epoch": 1.1593944790739092, + "ewc_loss": 0.03812345862388611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9061730199609883e-05, + "grad_norm": 26.603731155395508, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8641570806503296, + "num_tokens": 347633709.0, + "step": 9114 + }, + { + "epoch": 1.1595216893524998, + "ewc_loss": 0.03814080357551575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9070401322096586e-05, + "grad_norm": 26.5477294921875, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8718459010124207, + "num_tokens": 347670027.0, + "step": 9115 + }, + { + "epoch": 1.1596488996310903, + "ewc_loss": 0.038219161331653595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.910958053485956e-05, + "grad_norm": 26.563560485839844, + "learning_rate": 1e-06, + "loss": 0.4183, + 
"mean_token_accuracy": 0.8695315718650818, + "num_tokens": 347707488.0, + "step": 9116 + }, + { + "epoch": 1.1597761099096806, + "ewc_loss": 0.03821861743927002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9109309505438432e-05, + "grad_norm": 26.69101333618164, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8594087362289429, + "num_tokens": 347746030.0, + "step": 9117 + }, + { + "epoch": 1.1599033201882711, + "ewc_loss": 0.03821282088756561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.910641003632918e-05, + "grad_norm": 26.5456600189209, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8567814826965332, + "num_tokens": 347783409.0, + "step": 9118 + }, + { + "epoch": 1.1600305304668617, + "ewc_loss": 0.038119636476039886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9059818441746756e-05, + "grad_norm": 26.552825927734375, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8720182180404663, + "num_tokens": 347821335.0, + "step": 9119 + }, + { + "epoch": 1.1601577407454522, + "ewc_loss": 0.038179218769073486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.908960985019803e-05, + "grad_norm": 26.611127853393555, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.874952495098114, + "num_tokens": 347857590.0, + "step": 9120 + }, + { + "epoch": 1.1602849510240427, + "ewc_loss": 0.03824804723262787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9124023310723715e-05, + "grad_norm": 26.642641067504883, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8771874308586121, + "num_tokens": 347897000.0, + "step": 9121 + }, + { + "epoch": 1.1604121613026332, + "ewc_loss": 0.03819127753376961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.909563798108138e-05, + "grad_norm": 26.55072021484375, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8642824292182922, + "num_tokens": 347928947.0, + "step": 9122 + }, + { + "epoch": 
1.1605393715812238, + "ewc_loss": 0.038273610174655914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.913680534926243e-05, + "grad_norm": 26.704927444458008, + "learning_rate": 1e-06, + "loss": 0.5496, + "mean_token_accuracy": 0.8352077007293701, + "num_tokens": 347959294.0, + "step": 9123 + }, + { + "epoch": 1.1606665818598143, + "ewc_loss": 0.038232579827308655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911629078676924e-05, + "grad_norm": 26.532018661499023, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.870642364025116, + "num_tokens": 347993103.0, + "step": 9124 + }, + { + "epoch": 1.1607937921384048, + "ewc_loss": 0.03820323571562767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9101616999250837e-05, + "grad_norm": 26.614402770996094, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8782555460929871, + "num_tokens": 348028412.0, + "step": 9125 + }, + { + "epoch": 1.1609210024169954, + "ewc_loss": 0.03836004063487053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9180020899511874e-05, + "grad_norm": 26.630292892456055, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8490031957626343, + "num_tokens": 348071429.0, + "step": 9126 + }, + { + "epoch": 1.161048212695586, + "ewc_loss": 0.03824206814169884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912103471113369e-05, + "grad_norm": 26.573022842407227, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8706461787223816, + "num_tokens": 348113222.0, + "step": 9127 + }, + { + "epoch": 1.1611754229741762, + "ewc_loss": 0.038309771567583084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9154886103933677e-05, + "grad_norm": 26.68335723876953, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8637694716453552, + "num_tokens": 348150188.0, + "step": 9128 + }, + { + "epoch": 1.1613026332527667, + "ewc_loss": 0.03830293193459511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.915146640385501e-05, + "grad_norm": 26.678186416625977, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8681078553199768, + "num_tokens": 348188547.0, + "step": 9129 + }, + { + "epoch": 1.1614298435313573, + "ewc_loss": 0.03827976435422897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9139881260343827e-05, + "grad_norm": 26.66590690612793, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8601279854774475, + "num_tokens": 348228624.0, + "step": 9130 + }, + { + "epoch": 1.1615570538099478, + "ewc_loss": 0.03832099959254265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.916049950523302e-05, + "grad_norm": 26.604198455810547, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8619357943534851, + "num_tokens": 348264932.0, + "step": 9131 + }, + { + "epoch": 1.1616842640885383, + "ewc_loss": 0.03824644163250923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912322113639675e-05, + "grad_norm": 26.584266662597656, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8670568466186523, + "num_tokens": 348307680.0, + "step": 9132 + }, + { + "epoch": 1.1618114743671288, + "ewc_loss": 0.038318391889333725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9159195289830677e-05, + "grad_norm": 26.67630958557129, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8787155747413635, + "num_tokens": 348346449.0, + "step": 9133 + }, + { + "epoch": 1.1619386846457194, + "ewc_loss": 0.038318533450365067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9159266230417415e-05, + "grad_norm": 26.61769676208496, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8751134872436523, + "num_tokens": 348386393.0, + "step": 9134 + }, + { + "epoch": 1.16206589492431, + "ewc_loss": 0.03827379271388054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.913689629873261e-05, + "grad_norm": 26.721376419067383, + "learning_rate": 1e-06, + "loss": 0.4532, + 
"mean_token_accuracy": 0.8593846559524536, + "num_tokens": 348429458.0, + "step": 9135 + }, + { + "epoch": 1.1621931052029004, + "ewc_loss": 0.0383133627474308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9156681446474977e-05, + "grad_norm": 26.693361282348633, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8528159260749817, + "num_tokens": 348471365.0, + "step": 9136 + }, + { + "epoch": 1.162320315481491, + "ewc_loss": 0.03819916024804115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9099579731118865e-05, + "grad_norm": 26.628625869750977, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8496682643890381, + "num_tokens": 348513425.0, + "step": 9137 + }, + { + "epoch": 1.1624475257600815, + "ewc_loss": 0.03823728859424591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911864455905743e-05, + "grad_norm": 26.642610549926758, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8626010417938232, + "num_tokens": 348551181.0, + "step": 9138 + }, + { + "epoch": 1.162574736038672, + "ewc_loss": 0.03816423937678337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.908211925183423e-05, + "grad_norm": 26.512548446655273, + "learning_rate": 1e-06, + "loss": 0.3577, + "mean_token_accuracy": 0.8858124017715454, + "num_tokens": 348583611.0, + "step": 9139 + }, + { + "epoch": 1.1627019463172625, + "ewc_loss": 0.03829813003540039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914906533784233e-05, + "grad_norm": 26.701993942260742, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8632495999336243, + "num_tokens": 348624281.0, + "step": 9140 + }, + { + "epoch": 1.162829156595853, + "ewc_loss": 0.038228876888751984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911443905555643e-05, + "grad_norm": 26.573823928833008, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8635212779045105, + "num_tokens": 348661416.0, + "step": 9141 + }, + { + "epoch": 
1.1629563668744434, + "ewc_loss": 0.03825409710407257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912704829010181e-05, + "grad_norm": 26.736650466918945, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8448488116264343, + "num_tokens": 348703907.0, + "step": 9142 + }, + { + "epoch": 1.163083577153034, + "ewc_loss": 0.03825400397181511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9127002815366723e-05, + "grad_norm": 26.59980010986328, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8768003582954407, + "num_tokens": 348735617.0, + "step": 9143 + }, + { + "epoch": 1.1632107874316244, + "ewc_loss": 0.038227494806051254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9113747839583084e-05, + "grad_norm": 26.704078674316406, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8624018430709839, + "num_tokens": 348773552.0, + "step": 9144 + }, + { + "epoch": 1.163337997710215, + "ewc_loss": 0.038287870585918427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914393578772433e-05, + "grad_norm": 26.698999404907227, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8664726614952087, + "num_tokens": 348811274.0, + "step": 9145 + }, + { + "epoch": 1.1634652079888055, + "ewc_loss": 0.03824777528643608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912388688651845e-05, + "grad_norm": 26.7150936126709, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.853461503982544, + "num_tokens": 348847773.0, + "step": 9146 + }, + { + "epoch": 1.163592418267396, + "ewc_loss": 0.03825393691658974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9126968254568055e-05, + "grad_norm": 26.646623611450195, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.856252908706665, + "num_tokens": 348886356.0, + "step": 9147 + }, + { + "epoch": 1.1637196285459865, + "ewc_loss": 0.03819826990365982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9099134078714997e-05, + "grad_norm": 26.66400146484375, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8762621879577637, + "num_tokens": 348922892.0, + "step": 9148 + }, + { + "epoch": 1.163846838824577, + "ewc_loss": 0.03816226124763489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.90811297215987e-05, + "grad_norm": 26.598880767822266, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.883922815322876, + "num_tokens": 348954924.0, + "step": 9149 + }, + { + "epoch": 1.1639740491031676, + "ewc_loss": 0.038225363940000534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9112681911792606e-05, + "grad_norm": 26.567596435546875, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8706820607185364, + "num_tokens": 348997335.0, + "step": 9150 + }, + { + "epoch": 1.1641012593817581, + "ewc_loss": 0.03823426365852356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911713115987368e-05, + "grad_norm": 26.789470672607422, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8538423180580139, + "num_tokens": 349038221.0, + "step": 9151 + }, + { + "epoch": 1.1642284696603487, + "ewc_loss": 0.038283079862594604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914154017867986e-05, + "grad_norm": 26.667680740356445, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8605533838272095, + "num_tokens": 349076559.0, + "step": 9152 + }, + { + "epoch": 1.164355679938939, + "ewc_loss": 0.038217976689338684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9108987544314004e-05, + "grad_norm": 26.701457977294922, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8432315587997437, + "num_tokens": 349112264.0, + "step": 9153 + }, + { + "epoch": 1.1644828902175295, + "ewc_loss": 0.03822983056306839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911491563078016e-05, + "grad_norm": 26.774017333984375, + "learning_rate": 1e-06, + "loss": 0.3901, + 
"mean_token_accuracy": 0.8779382705688477, + "num_tokens": 349146571.0, + "step": 9154 + }, + { + "epoch": 1.16461010049612, + "ewc_loss": 0.038238558918237686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9119279386359267e-05, + "grad_norm": 26.676897048950195, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8717552423477173, + "num_tokens": 349183674.0, + "step": 9155 + }, + { + "epoch": 1.1647373107747105, + "ewc_loss": 0.03818180039525032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9090899513685144e-05, + "grad_norm": 26.756853103637695, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8693839311599731, + "num_tokens": 349227042.0, + "step": 9156 + }, + { + "epoch": 1.164864521053301, + "ewc_loss": 0.03827659785747528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9138298739562742e-05, + "grad_norm": 26.72923469543457, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.859668493270874, + "num_tokens": 349264072.0, + "step": 9157 + }, + { + "epoch": 1.1649917313318916, + "ewc_loss": 0.03815716132521629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9078581317444332e-05, + "grad_norm": 26.628278732299805, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8660291433334351, + "num_tokens": 349304427.0, + "step": 9158 + }, + { + "epoch": 1.1651189416104821, + "ewc_loss": 0.03823715075850487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9118575437460095e-05, + "grad_norm": 26.805587768554688, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8647006750106812, + "num_tokens": 349346708.0, + "step": 9159 + }, + { + "epoch": 1.1652461518890727, + "ewc_loss": 0.038170360028743744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9085180611000396e-05, + "grad_norm": 26.64434051513672, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8638547658920288, + "num_tokens": 349387716.0, + "step": 9160 + }, + { + "epoch": 
1.1653733621676632, + "ewc_loss": 0.03812592476606369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9062961655436084e-05, + "grad_norm": 26.723167419433594, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8614534139633179, + "num_tokens": 349425757.0, + "step": 9161 + }, + { + "epoch": 1.1655005724462537, + "ewc_loss": 0.03827916458249092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9139582946081646e-05, + "grad_norm": 26.76234245300293, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8717325329780579, + "num_tokens": 349472097.0, + "step": 9162 + }, + { + "epoch": 1.1656277827248442, + "ewc_loss": 0.038150738924741745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907536898215767e-05, + "grad_norm": 26.673063278198242, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8618986010551453, + "num_tokens": 349513809.0, + "step": 9163 + }, + { + "epoch": 1.1657549930034348, + "ewc_loss": 0.03814316913485527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907158366520889e-05, + "grad_norm": 26.66399574279785, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8802639245986938, + "num_tokens": 349544906.0, + "step": 9164 + }, + { + "epoch": 1.1658822032820253, + "ewc_loss": 0.03819455951452255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9097280528512783e-05, + "grad_norm": 26.706401824951172, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8595625758171082, + "num_tokens": 349582529.0, + "step": 9165 + }, + { + "epoch": 1.1660094135606156, + "ewc_loss": 0.038155291229486465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.907764635689091e-05, + "grad_norm": 26.64835548400879, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8603160381317139, + "num_tokens": 349624871.0, + "step": 9166 + }, + { + "epoch": 1.1661366238392061, + "ewc_loss": 0.03816049173474312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9080245692748576e-05, + "grad_norm": 26.763242721557617, + "learning_rate": 1e-06, + "loss": 0.537, + "mean_token_accuracy": 0.8297886848449707, + "num_tokens": 349660783.0, + "step": 9167 + }, + { + "epoch": 1.1662638341177967, + "ewc_loss": 0.038165364414453506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9082681319559924e-05, + "grad_norm": 26.640411376953125, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.874384343624115, + "num_tokens": 349698430.0, + "step": 9168 + }, + { + "epoch": 1.1663910443963872, + "ewc_loss": 0.03823864087462425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911932122311555e-05, + "grad_norm": 26.72810935974121, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8574855327606201, + "num_tokens": 349734981.0, + "step": 9169 + }, + { + "epoch": 1.1665182546749777, + "ewc_loss": 0.03826388716697693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9131943190586753e-05, + "grad_norm": 26.765172958374023, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8592087030410767, + "num_tokens": 349774267.0, + "step": 9170 + }, + { + "epoch": 1.1666454649535682, + "ewc_loss": 0.03816309571266174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9081548089161515e-05, + "grad_norm": 26.668540954589844, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8608701229095459, + "num_tokens": 349811400.0, + "step": 9171 + }, + { + "epoch": 1.1667726752321588, + "ewc_loss": 0.03813724219799042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9068620531470515e-05, + "grad_norm": 26.608060836791992, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8558551669120789, + "num_tokens": 349850031.0, + "step": 9172 + }, + { + "epoch": 1.1668998855107493, + "ewc_loss": 0.03824148699641228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912074367282912e-05, + "grad_norm": 26.69480323791504, + "learning_rate": 1e-06, + "loss": 0.4139, + 
"mean_token_accuracy": 0.8676126599311829, + "num_tokens": 349889967.0, + "step": 9173 + }, + { + "epoch": 1.1670270957893398, + "ewc_loss": 0.03824257850646973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9121289369650185e-05, + "grad_norm": 26.73413848876953, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8589689135551453, + "num_tokens": 349930276.0, + "step": 9174 + }, + { + "epoch": 1.1671543060679304, + "ewc_loss": 0.03830639272928238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9153196262777783e-05, + "grad_norm": 26.758880615234375, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8597193956375122, + "num_tokens": 349969889.0, + "step": 9175 + }, + { + "epoch": 1.1672815163465209, + "ewc_loss": 0.03818302974104881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9091514332103543e-05, + "grad_norm": 26.65068244934082, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8627346754074097, + "num_tokens": 350005729.0, + "step": 9176 + }, + { + "epoch": 1.1674087266251112, + "ewc_loss": 0.03824914991855621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912457446451299e-05, + "grad_norm": 26.779504776000977, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8718288540840149, + "num_tokens": 350043130.0, + "step": 9177 + }, + { + "epoch": 1.1675359369037017, + "ewc_loss": 0.038194891065359116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9097446056548506e-05, + "grad_norm": 26.614253997802734, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8826406002044678, + "num_tokens": 350081233.0, + "step": 9178 + }, + { + "epoch": 1.1676631471822922, + "ewc_loss": 0.038250282406806946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9125141989206895e-05, + "grad_norm": 26.77444839477539, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8526260852813721, + "num_tokens": 350119403.0, + "step": 9179 + }, + { + "epoch": 
1.1677903574608828, + "ewc_loss": 0.03824087977409363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912043990159873e-05, + "grad_norm": 26.727874755859375, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8523861765861511, + "num_tokens": 350159099.0, + "step": 9180 + }, + { + "epoch": 1.1679175677394733, + "ewc_loss": 0.03816897049546242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9084485757048242e-05, + "grad_norm": 26.71398162841797, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8639243841171265, + "num_tokens": 350195642.0, + "step": 9181 + }, + { + "epoch": 1.1680447780180638, + "ewc_loss": 0.03835112601518631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9175562556483783e-05, + "grad_norm": 26.82949447631836, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8723581433296204, + "num_tokens": 350236721.0, + "step": 9182 + }, + { + "epoch": 1.1681719882966544, + "ewc_loss": 0.03812454268336296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.906227225845214e-05, + "grad_norm": 26.677326202392578, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8602501153945923, + "num_tokens": 350272691.0, + "step": 9183 + }, + { + "epoch": 1.168299198575245, + "ewc_loss": 0.03820808604359627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9104043531115167e-05, + "grad_norm": 26.630048751831055, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8636363744735718, + "num_tokens": 350318299.0, + "step": 9184 + }, + { + "epoch": 1.1684264088538354, + "ewc_loss": 0.03822698816657066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911349500005599e-05, + "grad_norm": 26.643112182617188, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8646570444107056, + "num_tokens": 350360973.0, + "step": 9185 + }, + { + "epoch": 1.168553619132426, + "ewc_loss": 0.038258641958236694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.912932020786684e-05, + "grad_norm": 26.582406997680664, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8725847005844116, + "num_tokens": 350400516.0, + "step": 9186 + }, + { + "epoch": 1.1686808294110165, + "ewc_loss": 0.03822698816657066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9113493181066588e-05, + "grad_norm": 26.72259521484375, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8603026866912842, + "num_tokens": 350438147.0, + "step": 9187 + }, + { + "epoch": 1.168808039689607, + "ewc_loss": 0.038316741585731506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.915837128763087e-05, + "grad_norm": 26.60689926147461, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8696575164794922, + "num_tokens": 350481604.0, + "step": 9188 + }, + { + "epoch": 1.1689352499681975, + "ewc_loss": 0.03829440847039223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9147204511682503e-05, + "grad_norm": 26.67550277709961, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.884710967540741, + "num_tokens": 350518225.0, + "step": 9189 + }, + { + "epoch": 1.169062460246788, + "ewc_loss": 0.03831617534160614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9158087525283918e-05, + "grad_norm": 26.675865173339844, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8639073371887207, + "num_tokens": 350553752.0, + "step": 9190 + }, + { + "epoch": 1.1691896705253784, + "ewc_loss": 0.03830772638320923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9153863831888884e-05, + "grad_norm": 26.666919708251953, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8688485622406006, + "num_tokens": 350593676.0, + "step": 9191 + }, + { + "epoch": 1.169316880803969, + "ewc_loss": 0.03829844668507576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914922358992044e-05, + "grad_norm": 26.6922664642334, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 
0.8630240559577942, + "num_tokens": 350633618.0, + "step": 9192 + }, + { + "epoch": 1.1694440910825594, + "ewc_loss": 0.038369596004486084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9184797565685585e-05, + "grad_norm": 26.738224029541016, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8629092574119568, + "num_tokens": 350678221.0, + "step": 9193 + }, + { + "epoch": 1.16957130136115, + "ewc_loss": 0.038213979452848434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.910699029394891e-05, + "grad_norm": 26.497772216796875, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8752106428146362, + "num_tokens": 350718258.0, + "step": 9194 + }, + { + "epoch": 1.1696985116397405, + "ewc_loss": 0.03823060542345047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9115303075523116e-05, + "grad_norm": 26.58271598815918, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8747174143791199, + "num_tokens": 350757370.0, + "step": 9195 + }, + { + "epoch": 1.169825721918331, + "ewc_loss": 0.038386452943086624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9193226762581617e-05, + "grad_norm": 26.75479507446289, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8653874397277832, + "num_tokens": 350801394.0, + "step": 9196 + }, + { + "epoch": 1.1699529321969215, + "ewc_loss": 0.03834255039691925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9171275198459625e-05, + "grad_norm": 26.658004760742188, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8564551472663879, + "num_tokens": 350840635.0, + "step": 9197 + }, + { + "epoch": 1.170080142475512, + "ewc_loss": 0.03828210011124611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9141050870530307e-05, + "grad_norm": 26.798913955688477, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8801424503326416, + "num_tokens": 350879616.0, + "step": 9198 + }, + { + "epoch": 1.1702073527541026, + "ewc_loss": 
0.03827999159693718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.913999585667625e-05, + "grad_norm": 26.663127899169922, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8645691275596619, + "num_tokens": 350921710.0, + "step": 9199 + }, + { + "epoch": 1.1703345630326931, + "ewc_loss": 0.038183655589818954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9091827198280953e-05, + "grad_norm": 26.73100471496582, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8528873920440674, + "num_tokens": 350964218.0, + "step": 9200 + }, + { + "epoch": 1.1704617733112836, + "ewc_loss": 0.0383153073489666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.915765278681647e-05, + "grad_norm": 26.66663932800293, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8745922446250916, + "num_tokens": 351004511.0, + "step": 9201 + }, + { + "epoch": 1.170588983589874, + "ewc_loss": 0.03821822255849838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9109111235593446e-05, + "grad_norm": 26.666519165039062, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8599337339401245, + "num_tokens": 351044583.0, + "step": 9202 + }, + { + "epoch": 1.1707161938684645, + "ewc_loss": 0.03829844668507576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914922358992044e-05, + "grad_norm": 26.670297622680664, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8478755950927734, + "num_tokens": 351083613.0, + "step": 9203 + }, + { + "epoch": 1.170843404147055, + "ewc_loss": 0.03829050064086914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9145250917063095e-05, + "grad_norm": 26.680831909179688, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8569130897521973, + "num_tokens": 351117412.0, + "step": 9204 + }, + { + "epoch": 1.1709706144256455, + "ewc_loss": 0.0382908470928669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914542372105643e-05, + "grad_norm": 
26.725006103515625, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8505170345306396, + "num_tokens": 351154779.0, + "step": 9205 + }, + { + "epoch": 1.171097824704236, + "ewc_loss": 0.03826669976115227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9133349269395694e-05, + "grad_norm": 26.705549240112305, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8745832443237305, + "num_tokens": 351191412.0, + "step": 9206 + }, + { + "epoch": 1.1712250349828266, + "ewc_loss": 0.03832516819238663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9162584067089483e-05, + "grad_norm": 26.788862228393555, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.867648184299469, + "num_tokens": 351235465.0, + "step": 9207 + }, + { + "epoch": 1.1713522452614171, + "ewc_loss": 0.03820410743355751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9102053556707688e-05, + "grad_norm": 26.538284301757812, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8695729374885559, + "num_tokens": 351279326.0, + "step": 9208 + }, + { + "epoch": 1.1714794555400077, + "ewc_loss": 0.0382947213947773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9147360944771208e-05, + "grad_norm": 26.7172794342041, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8810190558433533, + "num_tokens": 351316678.0, + "step": 9209 + }, + { + "epoch": 1.1716066658185982, + "ewc_loss": 0.038314174860715866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9157087081111968e-05, + "grad_norm": 26.67859649658203, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8615276217460632, + "num_tokens": 351352641.0, + "step": 9210 + }, + { + "epoch": 1.1717338760971887, + "ewc_loss": 0.03829463943839073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9147319108014926e-05, + "grad_norm": 26.801315307617188, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8554865121841431, + 
"num_tokens": 351385845.0, + "step": 9211 + }, + { + "epoch": 1.1718610863757792, + "ewc_loss": 0.038319122046232224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.915956090670079e-05, + "grad_norm": 26.67808723449707, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8710457682609558, + "num_tokens": 351425220.0, + "step": 9212 + }, + { + "epoch": 1.1719882966543698, + "ewc_loss": 0.038332533091306686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.916626570164226e-05, + "grad_norm": 26.767318725585938, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8600184917449951, + "num_tokens": 351467515.0, + "step": 9213 + }, + { + "epoch": 1.1721155069329603, + "ewc_loss": 0.038312021642923355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.915601023938507e-05, + "grad_norm": 26.77716827392578, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.862335205078125, + "num_tokens": 351507571.0, + "step": 9214 + }, + { + "epoch": 1.1722427172115506, + "ewc_loss": 0.038284726440906525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9142362361890264e-05, + "grad_norm": 26.784215927124023, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8614288568496704, + "num_tokens": 351546150.0, + "step": 9215 + }, + { + "epoch": 1.1723699274901411, + "ewc_loss": 0.038268081843853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.913404048536904e-05, + "grad_norm": 26.72500228881836, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8577618598937988, + "num_tokens": 351588574.0, + "step": 9216 + }, + { + "epoch": 1.1724971377687317, + "ewc_loss": 0.03821643441915512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9108218111796305e-05, + "grad_norm": 26.6649112701416, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8656343221664429, + "num_tokens": 351626914.0, + "step": 9217 + }, + { + "epoch": 1.1726243480473222, + "ewc_loss": 0.03829464316368103, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914732092700433e-05, + "grad_norm": 26.784587860107422, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8548303246498108, + "num_tokens": 351663574.0, + "step": 9218 + }, + { + "epoch": 1.1727515583259127, + "ewc_loss": 0.03825702890753746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.912851439556107e-05, + "grad_norm": 26.654462814331055, + "learning_rate": 1e-06, + "loss": 0.4885, + "mean_token_accuracy": 0.8483552932739258, + "num_tokens": 351703275.0, + "step": 9219 + }, + { + "epoch": 1.1728787686045032, + "ewc_loss": 0.03829532861709595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.91476647160016e-05, + "grad_norm": 26.79456329345703, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.8556860685348511, + "num_tokens": 351745713.0, + "step": 9220 + }, + { + "epoch": 1.1730059788830938, + "ewc_loss": 0.03835470229387283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.917735062306747e-05, + "grad_norm": 26.793603897094727, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8493683934211731, + "num_tokens": 351780789.0, + "step": 9221 + }, + { + "epoch": 1.1731331891616843, + "ewc_loss": 0.038255661725997925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9127830455545336e-05, + "grad_norm": 26.618619918823242, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8744778037071228, + "num_tokens": 351820352.0, + "step": 9222 + }, + { + "epoch": 1.1732603994402748, + "ewc_loss": 0.038318928331136703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9159464500262402e-05, + "grad_norm": 26.73567771911621, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8718289136886597, + "num_tokens": 351860011.0, + "step": 9223 + }, + { + "epoch": 1.1733876097188654, + "ewc_loss": 0.03834666311740875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9173332475475036e-05, + "grad_norm": 26.768726348876953, + 
"learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8590225577354431, + "num_tokens": 351898742.0, + "step": 9224 + }, + { + "epoch": 1.1735148199974559, + "ewc_loss": 0.03830205276608467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9151026208419353e-05, + "grad_norm": 26.67738151550293, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8525620102882385, + "num_tokens": 351940201.0, + "step": 9225 + }, + { + "epoch": 1.1736420302760462, + "ewc_loss": 0.038312867283821106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9156434063916095e-05, + "grad_norm": 26.729970932006836, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8623917102813721, + "num_tokens": 351971041.0, + "step": 9226 + }, + { + "epoch": 1.1737692405546367, + "ewc_loss": 0.03835108131170273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.917554072861094e-05, + "grad_norm": 26.68408966064453, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8467621207237244, + "num_tokens": 352013002.0, + "step": 9227 + }, + { + "epoch": 1.1738964508332272, + "ewc_loss": 0.03835365176200867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9176826754119247e-05, + "grad_norm": 26.818288803100586, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8633909225463867, + "num_tokens": 352051435.0, + "step": 9228 + }, + { + "epoch": 1.1740236611118178, + "ewc_loss": 0.03833566606044769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9167833670508116e-05, + "grad_norm": 26.708818435668945, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8701298832893372, + "num_tokens": 352093295.0, + "step": 9229 + }, + { + "epoch": 1.1741508713904083, + "ewc_loss": 0.03836287930607796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.918143971124664e-05, + "grad_norm": 26.729812622070312, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8606444597244263, + "num_tokens": 352136401.0, + 
"step": 9230 + }, + { + "epoch": 1.1742780816689988, + "ewc_loss": 0.038424890488386154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.921244438563008e-05, + "grad_norm": 26.817583084106445, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8754410147666931, + "num_tokens": 352178696.0, + "step": 9231 + }, + { + "epoch": 1.1744052919475894, + "ewc_loss": 0.038369499146938324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9184750271961093e-05, + "grad_norm": 26.7825870513916, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8457955718040466, + "num_tokens": 352214759.0, + "step": 9232 + }, + { + "epoch": 1.1745325022261799, + "ewc_loss": 0.03833325207233429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9166625861544162e-05, + "grad_norm": 26.702693939208984, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8779206871986389, + "num_tokens": 352250520.0, + "step": 9233 + }, + { + "epoch": 1.1746597125047704, + "ewc_loss": 0.038406554609537125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9203276679036207e-05, + "grad_norm": 26.94339942932129, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.862040638923645, + "num_tokens": 352288773.0, + "step": 9234 + }, + { + "epoch": 1.174786922783361, + "ewc_loss": 0.03838016465306282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9190081729902886e-05, + "grad_norm": 26.84383773803711, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8717505931854248, + "num_tokens": 352326548.0, + "step": 9235 + }, + { + "epoch": 1.1749141330619515, + "ewc_loss": 0.03823414817452431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.911707477120217e-05, + "grad_norm": 26.674184799194336, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8740936517715454, + "num_tokens": 352366537.0, + "step": 9236 + }, + { + "epoch": 1.175041343340542, + "ewc_loss": 0.038333695381879807, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.9166847778251395e-05, + "grad_norm": 26.8665828704834, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8468717336654663, + "num_tokens": 352402390.0, + "step": 9237 + }, + { + "epoch": 1.1751685536191325, + "ewc_loss": 0.03829525411128998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9147626517224126e-05, + "grad_norm": 26.644899368286133, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.861404538154602, + "num_tokens": 352442357.0, + "step": 9238 + }, + { + "epoch": 1.175295763897723, + "ewc_loss": 0.03836916759610176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9184582924935967e-05, + "grad_norm": 26.867998123168945, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.861562967300415, + "num_tokens": 352476884.0, + "step": 9239 + }, + { + "epoch": 1.1754229741763134, + "ewc_loss": 0.038322027772665024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9161014279234223e-05, + "grad_norm": 26.6524600982666, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8639775514602661, + "num_tokens": 352513448.0, + "step": 9240 + }, + { + "epoch": 1.175550184454904, + "ewc_loss": 0.038318730890750885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.915936627483461e-05, + "grad_norm": 26.779401779174805, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8826190233230591, + "num_tokens": 352548420.0, + "step": 9241 + }, + { + "epoch": 1.1756773947334944, + "ewc_loss": 0.03839169815182686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.919584974530153e-05, + "grad_norm": 26.728776931762695, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8671379089355469, + "num_tokens": 352592157.0, + "step": 9242 + }, + { + "epoch": 1.175804605012085, + "ewc_loss": 0.03832803666591644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9164017430739477e-05, + "grad_norm": 26.76218605041504, + "learning_rate": 1e-06, + "loss": 0.4614, + 
"mean_token_accuracy": 0.8556350469589233, + "num_tokens": 352626427.0, + "step": 9243 + }, + { + "epoch": 1.1759318152906755, + "ewc_loss": 0.03838011994957924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9190059902030043e-05, + "grad_norm": 26.74933624267578, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8740271329879761, + "num_tokens": 352667075.0, + "step": 9244 + }, + { + "epoch": 1.176059025569266, + "ewc_loss": 0.03829944133758545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9149720174027607e-05, + "grad_norm": 26.665048599243164, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8624755144119263, + "num_tokens": 352707114.0, + "step": 9245 + }, + { + "epoch": 1.1761862358478565, + "ewc_loss": 0.03834957629442215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9174787666997872e-05, + "grad_norm": 26.778409957885742, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8696484565734863, + "num_tokens": 352738064.0, + "step": 9246 + }, + { + "epoch": 1.176313446126447, + "ewc_loss": 0.03842185065150261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9210925529478118e-05, + "grad_norm": 26.7784423828125, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8493496775627136, + "num_tokens": 352779964.0, + "step": 9247 + }, + { + "epoch": 1.1764406564050376, + "ewc_loss": 0.03832937031984329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.916468499985058e-05, + "grad_norm": 26.658184051513672, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8725708723068237, + "num_tokens": 352819574.0, + "step": 9248 + }, + { + "epoch": 1.1765678666836281, + "ewc_loss": 0.0383511483669281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9175573470420204e-05, + "grad_norm": 26.772239685058594, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8727907538414001, + "num_tokens": 352859151.0, + "step": 9249 + }, + { + "epoch": 
1.1766950769622184, + "ewc_loss": 0.03840593621134758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9202967450837605e-05, + "grad_norm": 26.68706703186035, + "learning_rate": 1e-06, + "loss": 0.5313, + "mean_token_accuracy": 0.84343421459198, + "num_tokens": 352901119.0, + "step": 9250 + }, + { + "epoch": 1.176822287240809, + "ewc_loss": 0.03829367831349373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914683889481239e-05, + "grad_norm": 26.75812339782715, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8633689880371094, + "num_tokens": 352945878.0, + "step": 9251 + }, + { + "epoch": 1.1769494975193995, + "ewc_loss": 0.03839026018977165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9195129425497726e-05, + "grad_norm": 26.73834228515625, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8585717082023621, + "num_tokens": 352979819.0, + "step": 9252 + }, + { + "epoch": 1.17707670779799, + "ewc_loss": 0.038367222994565964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9183611584594473e-05, + "grad_norm": 26.802831649780273, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8537538051605225, + "num_tokens": 353023552.0, + "step": 9253 + }, + { + "epoch": 1.1772039180765805, + "ewc_loss": 0.03838202357292175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9191011233488098e-05, + "grad_norm": 26.547391891479492, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8650857210159302, + "num_tokens": 353058178.0, + "step": 9254 + }, + { + "epoch": 1.177331128355171, + "ewc_loss": 0.03839004039764404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9195020286133513e-05, + "grad_norm": 26.822357177734375, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8734257221221924, + "num_tokens": 353091428.0, + "step": 9255 + }, + { + "epoch": 1.1774583386337616, + "ewc_loss": 0.03846036270260811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9230181351304054e-05, 
+ "grad_norm": 26.577600479125977, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8676692247390747, + "num_tokens": 353131943.0, + "step": 9256 + }, + { + "epoch": 1.1775855489123521, + "ewc_loss": 0.03833390399813652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9166951460647397e-05, + "grad_norm": 26.684831619262695, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8574262857437134, + "num_tokens": 353171213.0, + "step": 9257 + }, + { + "epoch": 1.1777127591909426, + "ewc_loss": 0.03846937417984009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9234686988056637e-05, + "grad_norm": 26.691118240356445, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8723970651626587, + "num_tokens": 353205057.0, + "step": 9258 + }, + { + "epoch": 1.1778399694695332, + "ewc_loss": 0.03845527768135071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9227638404117897e-05, + "grad_norm": 26.644620895385742, + "learning_rate": 1e-06, + "loss": 0.5003, + "mean_token_accuracy": 0.8495070934295654, + "num_tokens": 353251164.0, + "step": 9259 + }, + { + "epoch": 1.1779671797481237, + "ewc_loss": 0.03845888003706932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9229439203627408e-05, + "grad_norm": 26.69552230834961, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.869010329246521, + "num_tokens": 353288202.0, + "step": 9260 + }, + { + "epoch": 1.1780943900267142, + "ewc_loss": 0.03844890743494034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.922445335367229e-05, + "grad_norm": 26.604103088378906, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8626732230186462, + "num_tokens": 353327396.0, + "step": 9261 + }, + { + "epoch": 1.1782216003053048, + "ewc_loss": 0.03840000554919243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9200002498109825e-05, + "grad_norm": 26.712556838989258, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 
0.8521611094474792, + "num_tokens": 353366512.0, + "step": 9262 + }, + { + "epoch": 1.1783488105838953, + "ewc_loss": 0.038485631346702576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9242816051701084e-05, + "grad_norm": 26.640687942504883, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.856113851070404, + "num_tokens": 353408546.0, + "step": 9263 + }, + { + "epoch": 1.1784760208624856, + "ewc_loss": 0.0384817011654377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924084972415585e-05, + "grad_norm": 26.893709182739258, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8728943467140198, + "num_tokens": 353444950.0, + "step": 9264 + }, + { + "epoch": 1.1786032311410761, + "ewc_loss": 0.03848734125494957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924367097672075e-05, + "grad_norm": 26.691387176513672, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8715018033981323, + "num_tokens": 353483306.0, + "step": 9265 + }, + { + "epoch": 1.1787304414196667, + "ewc_loss": 0.038375116884708405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9187558791600168e-05, + "grad_norm": 26.656206130981445, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8633043766021729, + "num_tokens": 353523038.0, + "step": 9266 + }, + { + "epoch": 1.1788576516982572, + "ewc_loss": 0.03844224661588669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9221122784074396e-05, + "grad_norm": 26.701738357543945, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8736194968223572, + "num_tokens": 353564266.0, + "step": 9267 + }, + { + "epoch": 1.1789848619768477, + "ewc_loss": 0.03846840187907219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.923420131788589e-05, + "grad_norm": 26.67053985595703, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8683129549026489, + "num_tokens": 353603558.0, + "step": 9268 + }, + { + "epoch": 1.1791120722554382, + "ewc_loss": 
0.03845395892858505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9226979929953814e-05, + "grad_norm": 26.677583694458008, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8710468411445618, + "num_tokens": 353648097.0, + "step": 9269 + }, + { + "epoch": 1.1792392825340288, + "ewc_loss": 0.038458019495010376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.922900992212817e-05, + "grad_norm": 26.659818649291992, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8489898443222046, + "num_tokens": 353688970.0, + "step": 9270 + }, + { + "epoch": 1.1793664928126193, + "ewc_loss": 0.03838150203227997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.919075111800339e-05, + "grad_norm": 26.763540267944336, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8534181118011475, + "num_tokens": 353732505.0, + "step": 9271 + }, + { + "epoch": 1.1794937030912098, + "ewc_loss": 0.03849470242857933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9247350792284124e-05, + "grad_norm": 26.60199546813965, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8722850680351257, + "num_tokens": 353767460.0, + "step": 9272 + }, + { + "epoch": 1.1796209133698004, + "ewc_loss": 0.038366347551345825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.918317320814822e-05, + "grad_norm": 26.729841232299805, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8732821941375732, + "num_tokens": 353801951.0, + "step": 9273 + }, + { + "epoch": 1.1797481236483909, + "ewc_loss": 0.03850208967924118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9251045159762725e-05, + "grad_norm": 26.69679832458496, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8718428611755371, + "num_tokens": 353836815.0, + "step": 9274 + }, + { + "epoch": 1.1798753339269812, + "ewc_loss": 0.038420211523771286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.921010516525712e-05, + "grad_norm": 
26.714702606201172, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.856606662273407, + "num_tokens": 353876296.0, + "step": 9275 + }, + { + "epoch": 1.1800025442055717, + "ewc_loss": 0.038482051342725754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9241026166127995e-05, + "grad_norm": 26.78081512451172, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8470985293388367, + "num_tokens": 353911792.0, + "step": 9276 + }, + { + "epoch": 1.1801297544841622, + "ewc_loss": 0.03845013678073883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9225068172090687e-05, + "grad_norm": 26.733640670776367, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8531071543693542, + "num_tokens": 353949532.0, + "step": 9277 + }, + { + "epoch": 1.1802569647627528, + "ewc_loss": 0.038506146520376205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.925307333294768e-05, + "grad_norm": 26.80778694152832, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8514033555984497, + "num_tokens": 353988622.0, + "step": 9278 + }, + { + "epoch": 1.1803841750413433, + "ewc_loss": 0.03836528956890106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.918264570122119e-05, + "grad_norm": 26.675567626953125, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8675384521484375, + "num_tokens": 354028136.0, + "step": 9279 + }, + { + "epoch": 1.1805113853199338, + "ewc_loss": 0.03843219578266144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9216098735341802e-05, + "grad_norm": 26.708271026611328, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8664790391921997, + "num_tokens": 354065029.0, + "step": 9280 + }, + { + "epoch": 1.1806385955985244, + "ewc_loss": 0.03848093003034592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.92404640984023e-05, + "grad_norm": 26.82518768310547, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8755316734313965, + 
"num_tokens": 354099279.0, + "step": 9281 + }, + { + "epoch": 1.1807658058771149, + "ewc_loss": 0.03841589391231537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9207947843824513e-05, + "grad_norm": 26.701366424560547, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8593283891677856, + "num_tokens": 354138870.0, + "step": 9282 + }, + { + "epoch": 1.1808930161557054, + "ewc_loss": 0.03849531337618828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924765638250392e-05, + "grad_norm": 26.824201583862305, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8513888716697693, + "num_tokens": 354176498.0, + "step": 9283 + }, + { + "epoch": 1.181020226434296, + "ewc_loss": 0.03836911544203758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9184557459084317e-05, + "grad_norm": 26.69057273864746, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.8460729122161865, + "num_tokens": 354210063.0, + "step": 9284 + }, + { + "epoch": 1.1811474367128865, + "ewc_loss": 0.03841451555490494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.920725844684057e-05, + "grad_norm": 26.804536819458008, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8563390374183655, + "num_tokens": 354248095.0, + "step": 9285 + }, + { + "epoch": 1.181274646991477, + "ewc_loss": 0.03847062960267067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.923531453940086e-05, + "grad_norm": 26.718284606933594, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8609756827354431, + "num_tokens": 354286897.0, + "step": 9286 + }, + { + "epoch": 1.1814018572700675, + "ewc_loss": 0.038384389132261276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9192195395589806e-05, + "grad_norm": 26.733909606933594, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8723257780075073, + "num_tokens": 354330685.0, + "step": 9287 + }, + { + "epoch": 1.181529067548658, + "ewc_loss": 0.03852535039186478, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9262675778008997e-05, + "grad_norm": 26.841567993164062, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8547850847244263, + "num_tokens": 354371120.0, + "step": 9288 + }, + { + "epoch": 1.1816562778272484, + "ewc_loss": 0.03837462514638901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.918731322803069e-05, + "grad_norm": 26.703630447387695, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8613276481628418, + "num_tokens": 354410825.0, + "step": 9289 + }, + { + "epoch": 1.1817834881058389, + "ewc_loss": 0.0383714996278286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9185750716133043e-05, + "grad_norm": 26.693359375, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8601822853088379, + "num_tokens": 354453623.0, + "step": 9290 + }, + { + "epoch": 1.1819106983844294, + "ewc_loss": 0.03847396373748779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9236982552683912e-05, + "grad_norm": 26.690954208374023, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8634792566299438, + "num_tokens": 354491854.0, + "step": 9291 + }, + { + "epoch": 1.18203790866302, + "ewc_loss": 0.038422539830207825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9211269318475388e-05, + "grad_norm": 26.697555541992188, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.850140392780304, + "num_tokens": 354530788.0, + "step": 9292 + }, + { + "epoch": 1.1821651189416105, + "ewc_loss": 0.03848399221897125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9241995687480085e-05, + "grad_norm": 26.714374542236328, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8832027316093445, + "num_tokens": 354568576.0, + "step": 9293 + }, + { + "epoch": 1.182292329220201, + "ewc_loss": 0.03846152499318123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9230761608923785e-05, + "grad_norm": 26.72706413269043, + "learning_rate": 
1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8638754487037659, + "num_tokens": 354611307.0, + "step": 9294 + }, + { + "epoch": 1.1824195394987915, + "ewc_loss": 0.038481488823890686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9240744222770445e-05, + "grad_norm": 26.76796531677246, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.870577335357666, + "num_tokens": 354648810.0, + "step": 9295 + }, + { + "epoch": 1.182546749777382, + "ewc_loss": 0.038443673402071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9221835827920586e-05, + "grad_norm": 26.74701499938965, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8659512996673584, + "num_tokens": 354688411.0, + "step": 9296 + }, + { + "epoch": 1.1826739600559726, + "ewc_loss": 0.03838096186518669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9190480088582262e-05, + "grad_norm": 26.718210220336914, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8624638319015503, + "num_tokens": 354734700.0, + "step": 9297 + }, + { + "epoch": 1.1828011703345631, + "ewc_loss": 0.03842725604772568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9213628547731787e-05, + "grad_norm": 26.65253448486328, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8604821562767029, + "num_tokens": 354765517.0, + "step": 9298 + }, + { + "epoch": 1.1829283806131534, + "ewc_loss": 0.03847379982471466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9236900698160753e-05, + "grad_norm": 26.682653427124023, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8686320185661316, + "num_tokens": 354808563.0, + "step": 9299 + }, + { + "epoch": 1.183055590891744, + "ewc_loss": 0.03854404389858246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9272021745564416e-05, + "grad_norm": 26.786670684814453, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8454076051712036, + "num_tokens": 354846618.0, + "step": 9300 + }, + { 
+ "epoch": 1.1831828011703345, + "ewc_loss": 0.03842103108763695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.921051625686232e-05, + "grad_norm": 26.80832862854004, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8668171167373657, + "num_tokens": 354881293.0, + "step": 9301 + }, + { + "epoch": 1.183310011448925, + "ewc_loss": 0.03845420852303505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9227103621233255e-05, + "grad_norm": 26.719127655029297, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8613142967224121, + "num_tokens": 354925239.0, + "step": 9302 + }, + { + "epoch": 1.1834372217275155, + "ewc_loss": 0.03845621272921562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.922810588439461e-05, + "grad_norm": 26.759233474731445, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.850813627243042, + "num_tokens": 354967836.0, + "step": 9303 + }, + { + "epoch": 1.183564432006106, + "ewc_loss": 0.038437698036432266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9218849047319964e-05, + "grad_norm": 26.740663528442383, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8476427793502808, + "num_tokens": 355003386.0, + "step": 9304 + }, + { + "epoch": 1.1836916422846966, + "ewc_loss": 0.038537394255399704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9268696632934734e-05, + "grad_norm": 26.85708236694336, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8416863679885864, + "num_tokens": 355043391.0, + "step": 9305 + }, + { + "epoch": 1.1838188525632871, + "ewc_loss": 0.03840276971459389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.920138493005652e-05, + "grad_norm": 26.75973892211914, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8652961254119873, + "num_tokens": 355082820.0, + "step": 9306 + }, + { + "epoch": 1.1839460628418776, + "ewc_loss": 0.03845106065273285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9225530195399188e-05, + "grad_norm": 26.785791397094727, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8839719295501709, + "num_tokens": 355117531.0, + "step": 9307 + }, + { + "epoch": 1.1840732731204682, + "ewc_loss": 0.03845301270484924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.922650699270889e-05, + "grad_norm": 26.674318313598633, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8505678176879883, + "num_tokens": 355159485.0, + "step": 9308 + }, + { + "epoch": 1.1842004833990587, + "ewc_loss": 0.03849194943904877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924597381730564e-05, + "grad_norm": 26.86112403869629, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8589729070663452, + "num_tokens": 355206051.0, + "step": 9309 + }, + { + "epoch": 1.1843276936776492, + "ewc_loss": 0.03855500742793083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9277504179626703e-05, + "grad_norm": 26.78683090209961, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8541738986968994, + "num_tokens": 355242933.0, + "step": 9310 + }, + { + "epoch": 1.1844549039562398, + "ewc_loss": 0.038409315049648285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.92046572919935e-05, + "grad_norm": 26.623470306396484, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8754264712333679, + "num_tokens": 355286311.0, + "step": 9311 + }, + { + "epoch": 1.1845821142348303, + "ewc_loss": 0.03843691200017929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9218456145608798e-05, + "grad_norm": 26.697595596313477, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8633179664611816, + "num_tokens": 355327844.0, + "step": 9312 + }, + { + "epoch": 1.1847093245134206, + "ewc_loss": 0.03853146359324455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9265731680206954e-05, + "grad_norm": 26.851119995117188, + "learning_rate": 1e-06, + "loss": 0.4077, + 
"mean_token_accuracy": 0.8726499080657959, + "num_tokens": 355368977.0, + "step": 9313 + }, + { + "epoch": 1.1848365347920111, + "ewc_loss": 0.03841720521450043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.920860268000979e-05, + "grad_norm": 26.690704345703125, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8535575270652771, + "num_tokens": 355411479.0, + "step": 9314 + }, + { + "epoch": 1.1849637450706016, + "ewc_loss": 0.03847666457295418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9238332242821343e-05, + "grad_norm": 26.76692771911621, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8816876411437988, + "num_tokens": 355447370.0, + "step": 9315 + }, + { + "epoch": 1.1850909553491922, + "ewc_loss": 0.03848212957382202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924106436490547e-05, + "grad_norm": 26.621612548828125, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8601984977722168, + "num_tokens": 355490335.0, + "step": 9316 + }, + { + "epoch": 1.1852181656277827, + "ewc_loss": 0.03845718875527382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9228595192544162e-05, + "grad_norm": 26.683504104614258, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8642361164093018, + "num_tokens": 355527843.0, + "step": 9317 + }, + { + "epoch": 1.1853453759063732, + "ewc_loss": 0.03851161152124405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9255805455031805e-05, + "grad_norm": 26.704830169677734, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8612985014915466, + "num_tokens": 355566094.0, + "step": 9318 + }, + { + "epoch": 1.1854725861849638, + "ewc_loss": 0.03846539929509163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9232698832638562e-05, + "grad_norm": 26.716951370239258, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.871849536895752, + "num_tokens": 355609505.0, + "step": 9319 + }, + { + "epoch": 
1.1855997964635543, + "ewc_loss": 0.03851671889424324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9258359316154383e-05, + "grad_norm": 26.835268020629883, + "learning_rate": 1e-06, + "loss": 0.5158, + "mean_token_accuracy": 0.8385667204856873, + "num_tokens": 355650930.0, + "step": 9320 + }, + { + "epoch": 1.1857270067421448, + "ewc_loss": 0.03848855197429657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924427670019213e-05, + "grad_norm": 26.763206481933594, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8734509944915771, + "num_tokens": 355684075.0, + "step": 9321 + }, + { + "epoch": 1.1858542170207353, + "ewc_loss": 0.03840070590376854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.920035356306471e-05, + "grad_norm": 26.736282348632812, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8694936633110046, + "num_tokens": 355723567.0, + "step": 9322 + }, + { + "epoch": 1.1859814272993259, + "ewc_loss": 0.03845997899770737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9229990357416682e-05, + "grad_norm": 26.941091537475586, + "learning_rate": 1e-06, + "loss": 0.4889, + "mean_token_accuracy": 0.8504540324211121, + "num_tokens": 355764915.0, + "step": 9323 + }, + { + "epoch": 1.1861086375779162, + "ewc_loss": 0.03840749338269234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9203745978302322e-05, + "grad_norm": 26.651830673217773, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8644195795059204, + "num_tokens": 355804950.0, + "step": 9324 + }, + { + "epoch": 1.1862358478565067, + "ewc_loss": 0.03833916038274765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.916957990033552e-05, + "grad_norm": 26.92796516418457, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8706845045089722, + "num_tokens": 355844155.0, + "step": 9325 + }, + { + "epoch": 1.1863630581350972, + "ewc_loss": 0.038481052964925766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.924052594404202e-05, + "grad_norm": 26.733484268188477, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8799389600753784, + "num_tokens": 355883971.0, + "step": 9326 + }, + { + "epoch": 1.1864902684136878, + "ewc_loss": 0.03829026222229004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.914513086376246e-05, + "grad_norm": 26.725568771362305, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8585200309753418, + "num_tokens": 355922271.0, + "step": 9327 + }, + { + "epoch": 1.1866174786922783, + "ewc_loss": 0.038379743695259094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9189872546121478e-05, + "grad_norm": 26.784931182861328, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8625869750976562, + "num_tokens": 355961087.0, + "step": 9328 + }, + { + "epoch": 1.1867446889708688, + "ewc_loss": 0.038418594747781754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9209297533961944e-05, + "grad_norm": 26.80890655517578, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8678686022758484, + "num_tokens": 355999553.0, + "step": 9329 + }, + { + "epoch": 1.1868718992494594, + "ewc_loss": 0.03830563649535179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9152817912981845e-05, + "grad_norm": 26.66526222229004, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8689912557601929, + "num_tokens": 356038503.0, + "step": 9330 + }, + { + "epoch": 1.1869991095280499, + "ewc_loss": 0.038410402834415436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.920520116982516e-05, + "grad_norm": 26.852684020996094, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8734201192855835, + "num_tokens": 356078538.0, + "step": 9331 + }, + { + "epoch": 1.1871263198066404, + "ewc_loss": 0.03850043565034866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9250217519584112e-05, + "grad_norm": 26.810258865356445, + "learning_rate": 1e-06, + "loss": 0.3983, + 
"mean_token_accuracy": 0.8752850294113159, + "num_tokens": 356115031.0, + "step": 9332 + }, + { + "epoch": 1.187253530085231, + "ewc_loss": 0.03833039849996567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9165199773851782e-05, + "grad_norm": 26.745708465576172, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8778479099273682, + "num_tokens": 356154751.0, + "step": 9333 + }, + { + "epoch": 1.1873807403638215, + "ewc_loss": 0.038395076990127563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.919753776746802e-05, + "grad_norm": 26.8426570892334, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8486320972442627, + "num_tokens": 356192105.0, + "step": 9334 + }, + { + "epoch": 1.187507950642412, + "ewc_loss": 0.03840489685535431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9202449038857594e-05, + "grad_norm": 26.694015502929688, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8529959321022034, + "num_tokens": 356224964.0, + "step": 9335 + }, + { + "epoch": 1.1876351609210025, + "ewc_loss": 0.03835165128111839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9175826309947297e-05, + "grad_norm": 27.05884552001953, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8688732385635376, + "num_tokens": 356261565.0, + "step": 9336 + }, + { + "epoch": 1.187762371199593, + "ewc_loss": 0.038464780896902084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.923238960443996e-05, + "grad_norm": 26.863882064819336, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8651958703994751, + "num_tokens": 356301886.0, + "step": 9337 + }, + { + "epoch": 1.1878895814781834, + "ewc_loss": 0.03820887207984924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9104436432826333e-05, + "grad_norm": 26.62610626220703, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8714163303375244, + "num_tokens": 356339481.0, + "step": 9338 + }, + { + "epoch": 
1.1880167917567739, + "ewc_loss": 0.03841494396328926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9207471268600784e-05, + "grad_norm": 26.94996452331543, + "learning_rate": 1e-06, + "loss": 0.5105, + "mean_token_accuracy": 0.8415382504463196, + "num_tokens": 356387038.0, + "step": 9339 + }, + { + "epoch": 1.1881440020353644, + "ewc_loss": 0.03847038000822067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.923519084812142e-05, + "grad_norm": 26.707660675048828, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8596397638320923, + "num_tokens": 356433256.0, + "step": 9340 + }, + { + "epoch": 1.188271212313955, + "ewc_loss": 0.03834107518196106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9170536688761786e-05, + "grad_norm": 26.752206802368164, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8525667190551758, + "num_tokens": 356471282.0, + "step": 9341 + }, + { + "epoch": 1.1883984225925455, + "ewc_loss": 0.038456641137599945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9228320525144227e-05, + "grad_norm": 26.796688079833984, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8696122765541077, + "num_tokens": 356508951.0, + "step": 9342 + }, + { + "epoch": 1.188525632871136, + "ewc_loss": 0.038430698215961456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.921534931170754e-05, + "grad_norm": 26.88815689086914, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8667445182800293, + "num_tokens": 356544487.0, + "step": 9343 + }, + { + "epoch": 1.1886528431497265, + "ewc_loss": 0.03845972940325737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9229864847147837e-05, + "grad_norm": 26.945226669311523, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8709800243377686, + "num_tokens": 356580873.0, + "step": 9344 + }, + { + "epoch": 1.188780053428317, + "ewc_loss": 0.038393162190914154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9196580979041755e-05, + "grad_norm": 26.793062210083008, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8607438206672668, + "num_tokens": 356613622.0, + "step": 9345 + }, + { + "epoch": 1.1889072637069076, + "ewc_loss": 0.03846682608127594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9233413695474155e-05, + "grad_norm": 26.95524787902832, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8407551050186157, + "num_tokens": 356660058.0, + "step": 9346 + }, + { + "epoch": 1.189034473985498, + "ewc_loss": 0.03844098746776581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.922049341374077e-05, + "grad_norm": 26.794626235961914, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8637863993644714, + "num_tokens": 356701614.0, + "step": 9347 + }, + { + "epoch": 1.1891616842640884, + "ewc_loss": 0.03835828974843025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9179144146619365e-05, + "grad_norm": 27.004716873168945, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8783233165740967, + "num_tokens": 356737105.0, + "step": 9348 + }, + { + "epoch": 1.189288894542679, + "ewc_loss": 0.03848471865057945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9242359485360794e-05, + "grad_norm": 27.031333923339844, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8550996780395508, + "num_tokens": 356779382.0, + "step": 9349 + }, + { + "epoch": 1.1894161048212695, + "ewc_loss": 0.0382922887802124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9146144040860236e-05, + "grad_norm": 26.69301986694336, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8574947118759155, + "num_tokens": 356813836.0, + "step": 9350 + }, + { + "epoch": 1.18954331509986, + "ewc_loss": 0.038392048329114914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.919602436828427e-05, + "grad_norm": 27.20167350769043, + "learning_rate": 1e-06, + "loss": 0.4651, + 
"mean_token_accuracy": 0.8547231554985046, + "num_tokens": 356852591.0, + "step": 9351 + }, + { + "epoch": 1.1896705253784505, + "ewc_loss": 0.03841615840792656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.920807881106157e-05, + "grad_norm": 26.634845733642578, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8612241744995117, + "num_tokens": 356889940.0, + "step": 9352 + }, + { + "epoch": 1.189797735657041, + "ewc_loss": 0.03823062777519226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9115313989459537e-05, + "grad_norm": 26.785306930541992, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8672561645507812, + "num_tokens": 356927878.0, + "step": 9353 + }, + { + "epoch": 1.1899249459356316, + "ewc_loss": 0.038529470562934875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.926473487401381e-05, + "grad_norm": 26.821977615356445, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8726115226745605, + "num_tokens": 356963339.0, + "step": 9354 + }, + { + "epoch": 1.1900521562142221, + "ewc_loss": 0.038343772292137146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9171886378899217e-05, + "grad_norm": 26.894817352294922, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.866742730140686, + "num_tokens": 357002999.0, + "step": 9355 + }, + { + "epoch": 1.1901793664928126, + "ewc_loss": 0.038512203842401505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9256101950304583e-05, + "grad_norm": 26.80406379699707, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8665357232093811, + "num_tokens": 357044305.0, + "step": 9356 + }, + { + "epoch": 1.1903065767714032, + "ewc_loss": 0.038370344787836075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9185172277502716e-05, + "grad_norm": 26.871307373046875, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.86632239818573, + "num_tokens": 357082197.0, + "step": 9357 + }, + { + "epoch": 
1.1904337870499937, + "ewc_loss": 0.038506053388118744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9253026039223187e-05, + "grad_norm": 26.88026237487793, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8437883853912354, + "num_tokens": 357119541.0, + "step": 9358 + }, + { + "epoch": 1.1905609973285842, + "ewc_loss": 0.03838275372982025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.919137685035821e-05, + "grad_norm": 26.68079376220703, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8853382468223572, + "num_tokens": 357159176.0, + "step": 9359 + }, + { + "epoch": 1.1906882076071748, + "ewc_loss": 0.03846214711666107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9231072656111792e-05, + "grad_norm": 26.972536087036133, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8634659051895142, + "num_tokens": 357199204.0, + "step": 9360 + }, + { + "epoch": 1.1908154178857653, + "ewc_loss": 0.03846379742026329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9231898477301e-05, + "grad_norm": 26.847299575805664, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8779053092002869, + "num_tokens": 357236900.0, + "step": 9361 + }, + { + "epoch": 1.1909426281643556, + "ewc_loss": 0.038405705243349075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.920285285450518e-05, + "grad_norm": 26.866683959960938, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8636751174926758, + "num_tokens": 357271622.0, + "step": 9362 + }, + { + "epoch": 1.1910698384429461, + "ewc_loss": 0.03843718767166138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9218594388803467e-05, + "grad_norm": 26.74175453186035, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8746476769447327, + "num_tokens": 357311145.0, + "step": 9363 + }, + { + "epoch": 1.1911970487215366, + "ewc_loss": 0.0383971743285656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9198587324353866e-05, + "grad_norm": 26.881103515625, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8812428712844849, + "num_tokens": 357344609.0, + "step": 9364 + }, + { + "epoch": 1.1913242590001272, + "ewc_loss": 0.03843843191862106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.921921648317948e-05, + "grad_norm": 26.697017669677734, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8612583875656128, + "num_tokens": 357378896.0, + "step": 9365 + }, + { + "epoch": 1.1914514692787177, + "ewc_loss": 0.03842068463563919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.921034163387958e-05, + "grad_norm": 26.86930274963379, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8681989908218384, + "num_tokens": 357424730.0, + "step": 9366 + }, + { + "epoch": 1.1915786795573082, + "ewc_loss": 0.03847535699605942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.923767922562547e-05, + "grad_norm": 26.829204559326172, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8455632328987122, + "num_tokens": 357461152.0, + "step": 9367 + }, + { + "epoch": 1.1917058898358988, + "ewc_loss": 0.0384320430457592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9216022337786853e-05, + "grad_norm": 26.734895706176758, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.857428789138794, + "num_tokens": 357502305.0, + "step": 9368 + }, + { + "epoch": 1.1918331001144893, + "ewc_loss": 0.03850574046373367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9252869606134482e-05, + "grad_norm": 26.854291915893555, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8582529425621033, + "num_tokens": 357539187.0, + "step": 9369 + }, + { + "epoch": 1.1919603103930798, + "ewc_loss": 0.03849121183156967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9245606381446123e-05, + "grad_norm": 26.817825317382812, + "learning_rate": 1e-06, + "loss": 0.4627, + 
"mean_token_accuracy": 0.8530075550079346, + "num_tokens": 357569520.0, + "step": 9370 + }, + { + "epoch": 1.1920875206716703, + "ewc_loss": 0.03849645331501961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9248227545176633e-05, + "grad_norm": 26.80850601196289, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8635852336883545, + "num_tokens": 357605253.0, + "step": 9371 + }, + { + "epoch": 1.1922147309502609, + "ewc_loss": 0.038445934653282166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.922296723932959e-05, + "grad_norm": 26.674535751342773, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8738448023796082, + "num_tokens": 357639701.0, + "step": 9372 + }, + { + "epoch": 1.1923419412288512, + "ewc_loss": 0.0385250449180603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.92625229828991e-05, + "grad_norm": 26.806413650512695, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8721154928207397, + "num_tokens": 357675611.0, + "step": 9373 + }, + { + "epoch": 1.1924691515074417, + "ewc_loss": 0.03861396759748459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.930698454088997e-05, + "grad_norm": 26.764856338500977, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.875407338142395, + "num_tokens": 357708521.0, + "step": 9374 + }, + { + "epoch": 1.1925963617860322, + "ewc_loss": 0.038510143756866455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9255072402302176e-05, + "grad_norm": 26.78860855102539, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8543634414672852, + "num_tokens": 357749970.0, + "step": 9375 + }, + { + "epoch": 1.1927235720646228, + "ewc_loss": 0.03862806037068367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9314029486849904e-05, + "grad_norm": 26.819679260253906, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8732579350471497, + "num_tokens": 357783287.0, + "step": 9376 + }, + { + "epoch": 
1.1928507823432133, + "ewc_loss": 0.03854969143867493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.927484481711872e-05, + "grad_norm": 26.71445083618164, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8649424910545349, + "num_tokens": 357818883.0, + "step": 9377 + }, + { + "epoch": 1.1929779926218038, + "ewc_loss": 0.0386270210146904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9313511074869893e-05, + "grad_norm": 26.831857681274414, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8765291571617126, + "num_tokens": 357862862.0, + "step": 9378 + }, + { + "epoch": 1.1931052029003943, + "ewc_loss": 0.038608331233263016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9304165107314475e-05, + "grad_norm": 26.793590545654297, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8525111675262451, + "num_tokens": 357901574.0, + "step": 9379 + }, + { + "epoch": 1.1932324131789849, + "ewc_loss": 0.03860940784215927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9304703528177924e-05, + "grad_norm": 26.840721130371094, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8627563714981079, + "num_tokens": 357941822.0, + "step": 9380 + }, + { + "epoch": 1.1933596234575754, + "ewc_loss": 0.038570716977119446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9285358575871214e-05, + "grad_norm": 26.715158462524414, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8573679327964783, + "num_tokens": 357982820.0, + "step": 9381 + }, + { + "epoch": 1.193486833736166, + "ewc_loss": 0.038640331476926804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9320164938108064e-05, + "grad_norm": 26.938077926635742, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8621646761894226, + "num_tokens": 358020491.0, + "step": 9382 + }, + { + "epoch": 1.1936140440147565, + "ewc_loss": 0.038565754890441895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9282877474324778e-05, + "grad_norm": 26.738597869873047, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8653290271759033, + "num_tokens": 358059775.0, + "step": 9383 + }, + { + "epoch": 1.193741254293347, + "ewc_loss": 0.038557350635528564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9278675608802587e-05, + "grad_norm": 26.832738876342773, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8714622855186462, + "num_tokens": 358098672.0, + "step": 9384 + }, + { + "epoch": 1.1938684645719375, + "ewc_loss": 0.03858760744333267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9293804143671878e-05, + "grad_norm": 26.712574005126953, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8560369610786438, + "num_tokens": 358140850.0, + "step": 9385 + }, + { + "epoch": 1.193995674850528, + "ewc_loss": 0.03855772316455841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.927886114572175e-05, + "grad_norm": 26.768571853637695, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.870945394039154, + "num_tokens": 358175503.0, + "step": 9386 + }, + { + "epoch": 1.1941228851291183, + "ewc_loss": 0.038615792989730835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9307895854581147e-05, + "grad_norm": 26.834632873535156, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.866912305355072, + "num_tokens": 358210615.0, + "step": 9387 + }, + { + "epoch": 1.1942500954077089, + "ewc_loss": 0.03859955817461014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9299779523862526e-05, + "grad_norm": 26.83540153503418, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8666480779647827, + "num_tokens": 358246605.0, + "step": 9388 + }, + { + "epoch": 1.1943773056862994, + "ewc_loss": 0.03853568062186241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9267839888925664e-05, + "grad_norm": 26.737043380737305, + "learning_rate": 1e-06, + "loss": 0.4372, + 
"mean_token_accuracy": 0.8612202405929565, + "num_tokens": 358283136.0, + "step": 9389 + }, + { + "epoch": 1.19450451596489, + "ewc_loss": 0.038551751524209976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.927587618411053e-05, + "grad_norm": 26.87175750732422, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8727940320968628, + "num_tokens": 358325437.0, + "step": 9390 + }, + { + "epoch": 1.1946317262434805, + "ewc_loss": 0.03864100202918053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.932050145114772e-05, + "grad_norm": 26.809123992919922, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8636417984962463, + "num_tokens": 358365360.0, + "step": 9391 + }, + { + "epoch": 1.194758936522071, + "ewc_loss": 0.03863589093089104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9317945771035738e-05, + "grad_norm": 27.02584457397461, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8805835247039795, + "num_tokens": 358406502.0, + "step": 9392 + }, + { + "epoch": 1.1948861468006615, + "ewc_loss": 0.03858564421534538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9292821889393963e-05, + "grad_norm": 26.813615798950195, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8593637943267822, + "num_tokens": 358437220.0, + "step": 9393 + }, + { + "epoch": 1.195013357079252, + "ewc_loss": 0.03853096812963486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9265484297648072e-05, + "grad_norm": 27.05841827392578, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.851826548576355, + "num_tokens": 358475314.0, + "step": 9394 + }, + { + "epoch": 1.1951405673578426, + "ewc_loss": 0.03860253468155861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9301267457194626e-05, + "grad_norm": 26.848875045776367, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8743876218795776, + "num_tokens": 358520777.0, + "step": 9395 + }, + { + "epoch": 
1.195267777636433, + "ewc_loss": 0.038459740579128265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9229870304116048e-05, + "grad_norm": 26.891807556152344, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.884362518787384, + "num_tokens": 358557973.0, + "step": 9396 + }, + { + "epoch": 1.1953949879150234, + "ewc_loss": 0.03859065845608711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.929532845679205e-05, + "grad_norm": 26.946537017822266, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8719625473022461, + "num_tokens": 358596795.0, + "step": 9397 + }, + { + "epoch": 1.195522198193614, + "ewc_loss": 0.03851519525051117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9257597159594297e-05, + "grad_norm": 26.804685592651367, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8659626245498657, + "num_tokens": 358632146.0, + "step": 9398 + }, + { + "epoch": 1.1956494084722045, + "ewc_loss": 0.03847946971654892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9239734683651477e-05, + "grad_norm": 26.764081954956055, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8722825646400452, + "num_tokens": 358672194.0, + "step": 9399 + }, + { + "epoch": 1.195776618750795, + "ewc_loss": 0.03857952356338501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.92897623492172e-05, + "grad_norm": 26.962295532226562, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8722673654556274, + "num_tokens": 358710192.0, + "step": 9400 + }, + { + "epoch": 1.1959038290293855, + "ewc_loss": 0.038543928414583206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9271963537903503e-05, + "grad_norm": 27.0535945892334, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8504691123962402, + "num_tokens": 358749284.0, + "step": 9401 + }, + { + "epoch": 1.196031039307976, + "ewc_loss": 0.03843607008457184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9218034140067175e-05, + "grad_norm": 26.82268714904785, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8604923486709595, + "num_tokens": 358782830.0, + "step": 9402 + }, + { + "epoch": 1.1961582495865666, + "ewc_loss": 0.03843509405851364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9217546650907025e-05, + "grad_norm": 27.081830978393555, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8629896640777588, + "num_tokens": 358820696.0, + "step": 9403 + }, + { + "epoch": 1.196285459865157, + "ewc_loss": 0.038529884070158005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9264942238805816e-05, + "grad_norm": 26.81819725036621, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8710763454437256, + "num_tokens": 358853616.0, + "step": 9404 + }, + { + "epoch": 1.1964126701437476, + "ewc_loss": 0.03838082402944565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.919041278597433e-05, + "grad_norm": 26.813894271850586, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8735511302947998, + "num_tokens": 358893420.0, + "step": 9405 + }, + { + "epoch": 1.1965398804223382, + "ewc_loss": 0.03857159614562988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9285798771306872e-05, + "grad_norm": 26.944211959838867, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8666847348213196, + "num_tokens": 358930704.0, + "step": 9406 + }, + { + "epoch": 1.1966670907009287, + "ewc_loss": 0.03846311569213867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.923155832628254e-05, + "grad_norm": 26.853317260742188, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8780557513237, + "num_tokens": 358967730.0, + "step": 9407 + }, + { + "epoch": 1.1967943009795192, + "ewc_loss": 0.03845740109682083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9228700693929568e-05, + "grad_norm": 26.739957809448242, + "learning_rate": 1e-06, + "loss": 0.4065, + 
"mean_token_accuracy": 0.8774651288986206, + "num_tokens": 359007241.0, + "step": 9408 + }, + { + "epoch": 1.1969215112581097, + "ewc_loss": 0.038529735058546066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.926486766024027e-05, + "grad_norm": 26.949993133544922, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8508868217468262, + "num_tokens": 359045573.0, + "step": 9409 + }, + { + "epoch": 1.1970487215367003, + "ewc_loss": 0.03857249766588211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9286248061689548e-05, + "grad_norm": 26.803617477416992, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8677990436553955, + "num_tokens": 359083420.0, + "step": 9410 + }, + { + "epoch": 1.1971759318152906, + "ewc_loss": 0.0385013148188591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.925065771501977e-05, + "grad_norm": 26.9024715423584, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8774083852767944, + "num_tokens": 359117555.0, + "step": 9411 + }, + { + "epoch": 1.1973031420938811, + "ewc_loss": 0.03852250799536705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9261253328295425e-05, + "grad_norm": 26.827482223510742, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8615785837173462, + "num_tokens": 359154597.0, + "step": 9412 + }, + { + "epoch": 1.1974303523724716, + "ewc_loss": 0.03849146515130997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9245731891714968e-05, + "grad_norm": 26.78166389465332, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8592222929000854, + "num_tokens": 359190074.0, + "step": 9413 + }, + { + "epoch": 1.1975575626510622, + "ewc_loss": 0.03851340711116791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9256704035797156e-05, + "grad_norm": 26.866483688354492, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8534747362136841, + "num_tokens": 359232711.0, + "step": 9414 + }, + { + "epoch": 
1.1976847729296527, + "ewc_loss": 0.03849706053733826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924852949741762e-05, + "grad_norm": 26.724702835083008, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8643802404403687, + "num_tokens": 359265582.0, + "step": 9415 + }, + { + "epoch": 1.1978119832082432, + "ewc_loss": 0.03849921375513077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924960633914452e-05, + "grad_norm": 26.86798667907715, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8724158406257629, + "num_tokens": 359303177.0, + "step": 9416 + }, + { + "epoch": 1.1979391934868338, + "ewc_loss": 0.03855886310338974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9279432308394462e-05, + "grad_norm": 26.680395126342773, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8739863634109497, + "num_tokens": 359338712.0, + "step": 9417 + }, + { + "epoch": 1.1980664037654243, + "ewc_loss": 0.038609717041254044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9304858142277226e-05, + "grad_norm": 27.069904327392578, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8731715679168701, + "num_tokens": 359372849.0, + "step": 9418 + }, + { + "epoch": 1.1981936140440148, + "ewc_loss": 0.03860986977815628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9304934539832175e-05, + "grad_norm": 26.705398559570312, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8698059320449829, + "num_tokens": 359407981.0, + "step": 9419 + }, + { + "epoch": 1.1983208243226053, + "ewc_loss": 0.0384635329246521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9231765691074543e-05, + "grad_norm": 26.747650146484375, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.843823254108429, + "num_tokens": 359450180.0, + "step": 9420 + }, + { + "epoch": 1.1984480346011959, + "ewc_loss": 0.03866375982761383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.93318792298669e-05, + "grad_norm": 26.90522003173828, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8637280464172363, + "num_tokens": 359487909.0, + "step": 9421 + }, + { + "epoch": 1.1985752448797862, + "ewc_loss": 0.03859002888202667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9295013771625236e-05, + "grad_norm": 26.898664474487305, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8765859603881836, + "num_tokens": 359522609.0, + "step": 9422 + }, + { + "epoch": 1.1987024551583767, + "ewc_loss": 0.03863789513707161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9318948034197092e-05, + "grad_norm": 26.90313720703125, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8620673418045044, + "num_tokens": 359560735.0, + "step": 9423 + }, + { + "epoch": 1.1988296654369672, + "ewc_loss": 0.038571182638406754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9285591406514868e-05, + "grad_norm": 26.728105545043945, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.858423113822937, + "num_tokens": 359602994.0, + "step": 9424 + }, + { + "epoch": 1.1989568757155578, + "ewc_loss": 0.03865267336368561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9326336769154295e-05, + "grad_norm": 26.998477935791016, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8641634583473206, + "num_tokens": 359638864.0, + "step": 9425 + }, + { + "epoch": 1.1990840859941483, + "ewc_loss": 0.03865969926118851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9329849237692542e-05, + "grad_norm": 26.796449661254883, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8729390501976013, + "num_tokens": 359674202.0, + "step": 9426 + }, + { + "epoch": 1.1992112962727388, + "ewc_loss": 0.038516685366630554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.925834294524975e-05, + "grad_norm": 26.86261558532715, + "learning_rate": 1e-06, + "loss": 0.4129, + 
"mean_token_accuracy": 0.8713171482086182, + "num_tokens": 359710408.0, + "step": 9427 + }, + { + "epoch": 1.1993385065513293, + "ewc_loss": 0.03864392265677452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9321962099638768e-05, + "grad_norm": 26.80842399597168, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8720519542694092, + "num_tokens": 359749068.0, + "step": 9428 + }, + { + "epoch": 1.1994657168299199, + "ewc_loss": 0.038576528429985046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9288263501948677e-05, + "grad_norm": 27.02022361755371, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8666225075721741, + "num_tokens": 359788449.0, + "step": 9429 + }, + { + "epoch": 1.1995929271085104, + "ewc_loss": 0.03856983780860901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9284918380435556e-05, + "grad_norm": 26.70315170288086, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.858939528465271, + "num_tokens": 359824167.0, + "step": 9430 + }, + { + "epoch": 1.199720137387101, + "ewc_loss": 0.038558777421712875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9279388652648777e-05, + "grad_norm": 27.006534576416016, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8593894243240356, + "num_tokens": 359861476.0, + "step": 9431 + }, + { + "epoch": 1.1998473476656915, + "ewc_loss": 0.038696158677339554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.934807914949488e-05, + "grad_norm": 26.83966827392578, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8639272451400757, + "num_tokens": 359906463.0, + "step": 9432 + }, + { + "epoch": 1.199974557944282, + "ewc_loss": 0.0384909063577652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9245453586336225e-05, + "grad_norm": 26.894746780395508, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8687068223953247, + "num_tokens": 359940576.0, + "step": 9433 + }, + { + "epoch": 
1.2001017682228725, + "ewc_loss": 0.03860880807042122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.930440339492634e-05, + "grad_norm": 26.86180877685547, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8742719888687134, + "num_tokens": 359973616.0, + "step": 9434 + }, + { + "epoch": 1.200228978501463, + "ewc_loss": 0.03860199823975563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.93009982467629e-05, + "grad_norm": 26.902507781982422, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8768330216407776, + "num_tokens": 360011059.0, + "step": 9435 + }, + { + "epoch": 1.2003561887800533, + "ewc_loss": 0.038629308342933655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.931465340021532e-05, + "grad_norm": 26.837413787841797, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8737612962722778, + "num_tokens": 360048307.0, + "step": 9436 + }, + { + "epoch": 1.2004833990586439, + "ewc_loss": 0.038535889238119125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.926794539031107e-05, + "grad_norm": 26.99635887145996, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8649580478668213, + "num_tokens": 360086946.0, + "step": 9437 + }, + { + "epoch": 1.2006106093372344, + "ewc_loss": 0.03866966441273689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9334831449668854e-05, + "grad_norm": 26.834665298461914, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8697760105133057, + "num_tokens": 360121908.0, + "step": 9438 + }, + { + "epoch": 1.200737819615825, + "ewc_loss": 0.03846852853894234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9234264982515015e-05, + "grad_norm": 26.848033905029297, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8556827306747437, + "num_tokens": 360164349.0, + "step": 9439 + }, + { + "epoch": 1.2008650298944155, + "ewc_loss": 0.038589462637901306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9294731828267686e-05, + "grad_norm": 26.865825653076172, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8799870014190674, + "num_tokens": 360197660.0, + "step": 9440 + }, + { + "epoch": 1.200992240173006, + "ewc_loss": 0.03858942538499832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9294711819384247e-05, + "grad_norm": 27.02215576171875, + "learning_rate": 1e-06, + "loss": 0.5086, + "mean_token_accuracy": 0.8434864282608032, + "num_tokens": 360240297.0, + "step": 9441 + }, + { + "epoch": 1.2011194504515965, + "ewc_loss": 0.03861182928085327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9305914975120686e-05, + "grad_norm": 26.821308135986328, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8553793430328369, + "num_tokens": 360281566.0, + "step": 9442 + }, + { + "epoch": 1.201246660730187, + "ewc_loss": 0.03853190690279007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9265953596914187e-05, + "grad_norm": 27.025217056274414, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8717150688171387, + "num_tokens": 360313927.0, + "step": 9443 + }, + { + "epoch": 1.2013738710087776, + "ewc_loss": 0.03865097835659981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9325489120092243e-05, + "grad_norm": 26.9545841217041, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8684923648834229, + "num_tokens": 360349369.0, + "step": 9444 + }, + { + "epoch": 1.201501081287368, + "ewc_loss": 0.03858299180865288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9291495846118778e-05, + "grad_norm": 26.91161346435547, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8641825914382935, + "num_tokens": 360386996.0, + "step": 9445 + }, + { + "epoch": 1.2016282915659584, + "ewc_loss": 0.038580652326345444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9290326235932298e-05, + "grad_norm": 26.937503814697266, + "learning_rate": 1e-06, + "loss": 0.4225, + 
"mean_token_accuracy": 0.8692434430122375, + "num_tokens": 360424883.0, + "step": 9446 + }, + { + "epoch": 1.201755501844549, + "ewc_loss": 0.038638874888420105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9319437342346646e-05, + "grad_norm": 26.93367576599121, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8702449798583984, + "num_tokens": 360457992.0, + "step": 9447 + }, + { + "epoch": 1.2018827121231395, + "ewc_loss": 0.03860258683562279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9301292923046276e-05, + "grad_norm": 26.87666893005371, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8505327105522156, + "num_tokens": 360497934.0, + "step": 9448 + }, + { + "epoch": 1.20200992240173, + "ewc_loss": 0.038575753569602966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9287876057205722e-05, + "grad_norm": 27.03037452697754, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8624773621559143, + "num_tokens": 360533928.0, + "step": 9449 + }, + { + "epoch": 1.2021371326803205, + "ewc_loss": 0.03861411288380623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9307057300466113e-05, + "grad_norm": 26.89354133605957, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8679522275924683, + "num_tokens": 360571076.0, + "step": 9450 + }, + { + "epoch": 1.202264342958911, + "ewc_loss": 0.038567159324884415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9283579604234546e-05, + "grad_norm": 26.939329147338867, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8577135801315308, + "num_tokens": 360611957.0, + "step": 9451 + }, + { + "epoch": 1.2023915532375016, + "ewc_loss": 0.03859935328364372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9299675841466524e-05, + "grad_norm": 26.87761878967285, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.858837366104126, + "num_tokens": 360647823.0, + "step": 9452 + }, + { + "epoch": 
1.202518763516092, + "ewc_loss": 0.03857235237956047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9286175302113406e-05, + "grad_norm": 26.971328735351562, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8565386533737183, + "num_tokens": 360688341.0, + "step": 9453 + }, + { + "epoch": 1.2026459737946826, + "ewc_loss": 0.03858942165970802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9294710000394844e-05, + "grad_norm": 26.846298217773438, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8573538661003113, + "num_tokens": 360730417.0, + "step": 9454 + }, + { + "epoch": 1.2027731840732732, + "ewc_loss": 0.03852322697639465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9261613488197327e-05, + "grad_norm": 26.928699493408203, + "learning_rate": 1e-06, + "loss": 0.4963, + "mean_token_accuracy": 0.8452709913253784, + "num_tokens": 360771931.0, + "step": 9455 + }, + { + "epoch": 1.2029003943518637, + "ewc_loss": 0.03859113156795502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.929556492541451e-05, + "grad_norm": 27.007017135620117, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8637815713882446, + "num_tokens": 360811460.0, + "step": 9456 + }, + { + "epoch": 1.2030276046304542, + "ewc_loss": 0.038527294993400574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.926364711835049e-05, + "grad_norm": 26.874469757080078, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8776718378067017, + "num_tokens": 360849476.0, + "step": 9457 + }, + { + "epoch": 1.2031548149090447, + "ewc_loss": 0.03846742957830429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9233715647715144e-05, + "grad_norm": 26.98748779296875, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8552407026290894, + "num_tokens": 360891178.0, + "step": 9458 + }, + { + "epoch": 1.2032820251876353, + "ewc_loss": 0.038602299988269806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9301149222883396e-05, + "grad_norm": 26.885595321655273, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.861221969127655, + "num_tokens": 360930503.0, + "step": 9459 + }, + { + "epoch": 1.2034092354662256, + "ewc_loss": 0.03848261386156082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.924130629049614e-05, + "grad_norm": 26.94400978088379, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8621270656585693, + "num_tokens": 360963730.0, + "step": 9460 + }, + { + "epoch": 1.203536445744816, + "ewc_loss": 0.03860520198941231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9302600776427425e-05, + "grad_norm": 26.879966735839844, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8504703640937805, + "num_tokens": 361003796.0, + "step": 9461 + }, + { + "epoch": 1.2036636560234066, + "ewc_loss": 0.038561079651117325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.928054007294122e-05, + "grad_norm": 26.892610549926758, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8647313714027405, + "num_tokens": 361044818.0, + "step": 9462 + }, + { + "epoch": 1.2037908663019972, + "ewc_loss": 0.03851770609617233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9258852262282744e-05, + "grad_norm": 26.716903686523438, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8754024505615234, + "num_tokens": 361080692.0, + "step": 9463 + }, + { + "epoch": 1.2039180765805877, + "ewc_loss": 0.03864070028066635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9320350475027226e-05, + "grad_norm": 26.948379516601562, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8556791543960571, + "num_tokens": 361116609.0, + "step": 9464 + }, + { + "epoch": 1.2040452868591782, + "ewc_loss": 0.038666993379592896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9333496311446652e-05, + "grad_norm": 26.905954360961914, + "learning_rate": 1e-06, + "loss": 0.4413, + 
"mean_token_accuracy": 0.8574157953262329, + "num_tokens": 361154559.0, + "step": 9465 + }, + { + "epoch": 1.2041724971377687, + "ewc_loss": 0.03854596987366676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9272985809948295e-05, + "grad_norm": 26.947471618652344, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8743546009063721, + "num_tokens": 361191005.0, + "step": 9466 + }, + { + "epoch": 1.2042997074163593, + "ewc_loss": 0.03865637630224228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9328188500367105e-05, + "grad_norm": 27.06337547302246, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8622794151306152, + "num_tokens": 361230191.0, + "step": 9467 + }, + { + "epoch": 1.2044269176949498, + "ewc_loss": 0.038554828613996506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.927741504914593e-05, + "grad_norm": 26.922592163085938, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8582209348678589, + "num_tokens": 361269132.0, + "step": 9468 + }, + { + "epoch": 1.2045541279735403, + "ewc_loss": 0.03858614340424538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.929307109094225e-05, + "grad_norm": 26.85601043701172, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8576006889343262, + "num_tokens": 361306674.0, + "step": 9469 + }, + { + "epoch": 1.2046813382521309, + "ewc_loss": 0.0385780930519104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9289045667392202e-05, + "grad_norm": 26.948225021362305, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8663917779922485, + "num_tokens": 361348309.0, + "step": 9470 + }, + { + "epoch": 1.2048085485307212, + "ewc_loss": 0.038552071899175644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9276036255178042e-05, + "grad_norm": 26.67763328552246, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8601773977279663, + "num_tokens": 361386824.0, + "step": 9471 + }, + { + "epoch": 
1.2049357588093117, + "ewc_loss": 0.03857959806919098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.928979872900527e-05, + "grad_norm": 26.952238082885742, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8454275131225586, + "num_tokens": 361426590.0, + "step": 9472 + }, + { + "epoch": 1.2050629690879022, + "ewc_loss": 0.03872813284397125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9364066247362643e-05, + "grad_norm": 26.780445098876953, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8498293161392212, + "num_tokens": 361463259.0, + "step": 9473 + }, + { + "epoch": 1.2051901793664928, + "ewc_loss": 0.03861038386821747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9305191017338075e-05, + "grad_norm": 26.9533748626709, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8738669157028198, + "num_tokens": 361498750.0, + "step": 9474 + }, + { + "epoch": 1.2053173896450833, + "ewc_loss": 0.03873591870069504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.936795888468623e-05, + "grad_norm": 27.010299682617188, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8546647429466248, + "num_tokens": 361537101.0, + "step": 9475 + }, + { + "epoch": 1.2054445999236738, + "ewc_loss": 0.038611385971307755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9305693058413453e-05, + "grad_norm": 26.897371292114258, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8671963214874268, + "num_tokens": 361572130.0, + "step": 9476 + }, + { + "epoch": 1.2055718102022643, + "ewc_loss": 0.03865635395050049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9328177586430684e-05, + "grad_norm": 26.942224502563477, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8798742294311523, + "num_tokens": 361608484.0, + "step": 9477 + }, + { + "epoch": 1.2056990204808549, + "ewc_loss": 0.03867605701088905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9338029233040288e-05, + "grad_norm": 26.844358444213867, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8829742074012756, + "num_tokens": 361650361.0, + "step": 9478 + }, + { + "epoch": 1.2058262307594454, + "ewc_loss": 0.03861318901181221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9306595277157612e-05, + "grad_norm": 27.006799697875977, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.871605396270752, + "num_tokens": 361680670.0, + "step": 9479 + }, + { + "epoch": 1.205953441038036, + "ewc_loss": 0.038640450686216354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.932022496475838e-05, + "grad_norm": 26.732341766357422, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8705108165740967, + "num_tokens": 361723504.0, + "step": 9480 + }, + { + "epoch": 1.2060806513166265, + "ewc_loss": 0.03867349773645401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9336748664500192e-05, + "grad_norm": 26.907697677612305, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8701292872428894, + "num_tokens": 361765850.0, + "step": 9481 + }, + { + "epoch": 1.206207861595217, + "ewc_loss": 0.038721852004528046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9360926671652123e-05, + "grad_norm": 26.809844970703125, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8557946681976318, + "num_tokens": 361804375.0, + "step": 9482 + }, + { + "epoch": 1.2063350718738075, + "ewc_loss": 0.03872830420732498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.936415173986461e-05, + "grad_norm": 26.96082878112793, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8611882925033569, + "num_tokens": 361834714.0, + "step": 9483 + }, + { + "epoch": 1.206462282152398, + "ewc_loss": 0.038703449070453644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9351724404259585e-05, + "grad_norm": 26.84002685546875, + "learning_rate": 1e-06, + "loss": 0.4547, + 
"mean_token_accuracy": 0.8644829392433167, + "num_tokens": 361877568.0, + "step": 9484 + }, + { + "epoch": 1.2065894924309883, + "ewc_loss": 0.03870971128344536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9354854885023087e-05, + "grad_norm": 26.98922348022461, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8605300188064575, + "num_tokens": 361914785.0, + "step": 9485 + }, + { + "epoch": 1.2067167027095789, + "ewc_loss": 0.038685694336891174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.934284773597028e-05, + "grad_norm": 26.82501983642578, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8722862005233765, + "num_tokens": 361956781.0, + "step": 9486 + }, + { + "epoch": 1.2068439129881694, + "ewc_loss": 0.03862485662102699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9312428776174784e-05, + "grad_norm": 26.9111270904541, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8748732209205627, + "num_tokens": 361997408.0, + "step": 9487 + }, + { + "epoch": 1.20697112326676, + "ewc_loss": 0.03878072649240494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9390363377169706e-05, + "grad_norm": 26.874181747436523, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8636226654052734, + "num_tokens": 362030226.0, + "step": 9488 + }, + { + "epoch": 1.2070983335453505, + "ewc_loss": 0.038667019456624985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9333509044372477e-05, + "grad_norm": 26.856773376464844, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8754681944847107, + "num_tokens": 362068370.0, + "step": 9489 + }, + { + "epoch": 1.207225543823941, + "ewc_loss": 0.0387096181511879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9354809410288e-05, + "grad_norm": 27.059017181396484, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8731657266616821, + "num_tokens": 362108569.0, + "step": 9490 + }, + { + "epoch": 1.2073527541025315, 
+ "ewc_loss": 0.038713254034519196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.935662658070214e-05, + "grad_norm": 26.946529388427734, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8566248416900635, + "num_tokens": 362140476.0, + "step": 9491 + }, + { + "epoch": 1.207479964381122, + "ewc_loss": 0.038661278784275055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.933063867909368e-05, + "grad_norm": 26.891088485717773, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8724748492240906, + "num_tokens": 362171132.0, + "step": 9492 + }, + { + "epoch": 1.2076071746597126, + "ewc_loss": 0.03866640850901604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.933320345415268e-05, + "grad_norm": 27.169008255004883, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8633778095245361, + "num_tokens": 362204791.0, + "step": 9493 + }, + { + "epoch": 1.207734384938303, + "ewc_loss": 0.03870006278157234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9350030925124884e-05, + "grad_norm": 26.88232421875, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8681995868682861, + "num_tokens": 362242360.0, + "step": 9494 + }, + { + "epoch": 1.2078615952168934, + "ewc_loss": 0.03853540122509003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.926769982674159e-05, + "grad_norm": 26.90640640258789, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8673440217971802, + "num_tokens": 362276558.0, + "step": 9495 + }, + { + "epoch": 1.207988805495484, + "ewc_loss": 0.03866473585367203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9332368538016453e-05, + "grad_norm": 27.02328872680664, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8498802185058594, + "num_tokens": 362307055.0, + "step": 9496 + }, + { + "epoch": 1.2081160157740745, + "ewc_loss": 0.03868255019187927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9341274310136214e-05, + "grad_norm": 
26.89322280883789, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8756090402603149, + "num_tokens": 362342433.0, + "step": 9497 + }, + { + "epoch": 1.208243226052665, + "ewc_loss": 0.03859269246459007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9296347090858035e-05, + "grad_norm": 26.77740478515625, + "learning_rate": 1e-06, + "loss": 0.3593, + "mean_token_accuracy": 0.8857319355010986, + "num_tokens": 362377008.0, + "step": 9498 + }, + { + "epoch": 1.2083704363312555, + "ewc_loss": 0.03868848457932472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.93442429008428e-05, + "grad_norm": 27.035573959350586, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8792110085487366, + "num_tokens": 362412089.0, + "step": 9499 + }, + { + "epoch": 1.208497646609846, + "ewc_loss": 0.03875502571463585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9377512217033654e-05, + "grad_norm": 26.770021438598633, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8554403781890869, + "num_tokens": 362455817.0, + "step": 9500 + }, + { + "epoch": 1.2086248568884366, + "ewc_loss": 0.03869080916047096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9345405235071667e-05, + "grad_norm": 27.01953887939453, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8648974895477295, + "num_tokens": 362498859.0, + "step": 9501 + }, + { + "epoch": 1.208752067167027, + "ewc_loss": 0.03884818032383919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.942408925970085e-05, + "grad_norm": 26.892200469970703, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8639715909957886, + "num_tokens": 362542346.0, + "step": 9502 + }, + { + "epoch": 1.2088792774456176, + "ewc_loss": 0.03861748427152634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9308741684653796e-05, + "grad_norm": 26.78123664855957, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8833658695220947, + "num_tokens": 
362575360.0, + "step": 9503 + }, + { + "epoch": 1.2090064877242082, + "ewc_loss": 0.03879687935113907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.939843969012145e-05, + "grad_norm": 26.938432693481445, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8653392791748047, + "num_tokens": 362614737.0, + "step": 9504 + }, + { + "epoch": 1.2091336980027987, + "ewc_loss": 0.03872986137866974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9364930267329328e-05, + "grad_norm": 26.777376174926758, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8598373532295227, + "num_tokens": 362653756.0, + "step": 9505 + }, + { + "epoch": 1.2092609082813892, + "ewc_loss": 0.03871800750494003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9359003999852575e-05, + "grad_norm": 26.83318519592285, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8671028017997742, + "num_tokens": 362690745.0, + "step": 9506 + }, + { + "epoch": 1.2093881185599797, + "ewc_loss": 0.038833532482385635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9416766008362174e-05, + "grad_norm": 26.854610443115234, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8743389844894409, + "num_tokens": 362726328.0, + "step": 9507 + }, + { + "epoch": 1.2095153288385703, + "ewc_loss": 0.038792818784713745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9396409697947092e-05, + "grad_norm": 26.782581329345703, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8665184378623962, + "num_tokens": 362764285.0, + "step": 9508 + }, + { + "epoch": 1.2096425391171606, + "ewc_loss": 0.03879449516534805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9397248252062127e-05, + "grad_norm": 26.862417221069336, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.845581591129303, + "num_tokens": 362794697.0, + "step": 9509 + }, + { + "epoch": 1.209769749395751, + "ewc_loss": 0.03884065896272659, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.942032940860372e-05, + "grad_norm": 26.9110050201416, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8644640445709229, + "num_tokens": 362831618.0, + "step": 9510 + }, + { + "epoch": 1.2098969596743416, + "ewc_loss": 0.03882024437189102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9410123059060425e-05, + "grad_norm": 26.88164520263672, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8733945488929749, + "num_tokens": 362866404.0, + "step": 9511 + }, + { + "epoch": 1.2100241699529322, + "ewc_loss": 0.03881855681538582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.940927904797718e-05, + "grad_norm": 26.976654052734375, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8804917335510254, + "num_tokens": 362902554.0, + "step": 9512 + }, + { + "epoch": 1.2101513802315227, + "ewc_loss": 0.038758739829063416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9379369405214675e-05, + "grad_norm": 26.926050186157227, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8592855930328369, + "num_tokens": 362944169.0, + "step": 9513 + }, + { + "epoch": 1.2102785905101132, + "ewc_loss": 0.03874419629573822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9372098904568702e-05, + "grad_norm": 26.974163055419922, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8451111316680908, + "num_tokens": 362983789.0, + "step": 9514 + }, + { + "epoch": 1.2104058007887037, + "ewc_loss": 0.03878360614180565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.939180219778791e-05, + "grad_norm": 26.801403045654297, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8712202310562134, + "num_tokens": 363021551.0, + "step": 9515 + }, + { + "epoch": 1.2105330110672943, + "ewc_loss": 0.03877648338675499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.938824243552517e-05, + "grad_norm": 26.978788375854492, + "learning_rate": 
1e-06, + "loss": 0.4943, + "mean_token_accuracy": 0.8479005098342896, + "num_tokens": 363064951.0, + "step": 9516 + }, + { + "epoch": 1.2106602213458848, + "ewc_loss": 0.03879604861140251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.939802496053744e-05, + "grad_norm": 26.81837272644043, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8653864860534668, + "num_tokens": 363110876.0, + "step": 9517 + }, + { + "epoch": 1.2107874316244753, + "ewc_loss": 0.03878096491098404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9390481611480936e-05, + "grad_norm": 26.89166831970215, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8624996542930603, + "num_tokens": 363155310.0, + "step": 9518 + }, + { + "epoch": 1.2109146419030659, + "ewc_loss": 0.03881268575787544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9406343199079856e-05, + "grad_norm": 26.73244857788086, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8751981258392334, + "num_tokens": 363193166.0, + "step": 9519 + }, + { + "epoch": 1.2110418521816562, + "ewc_loss": 0.03878852725028992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9394263290450908e-05, + "grad_norm": 26.90680503845215, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8464804291725159, + "num_tokens": 363225163.0, + "step": 9520 + }, + { + "epoch": 1.2111690624602467, + "ewc_loss": 0.038828715682029724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.941435766639188e-05, + "grad_norm": 26.786420822143555, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8637350797653198, + "num_tokens": 363264720.0, + "step": 9521 + }, + { + "epoch": 1.2112962727388372, + "ewc_loss": 0.038823164999485016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.941158188856207e-05, + "grad_norm": 26.821216583251953, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8491804599761963, + "num_tokens": 363305029.0, + "step": 9522 + }, + 
{ + "epoch": 1.2114234830174277, + "ewc_loss": 0.03885602951049805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.94280146388337e-05, + "grad_norm": 26.828838348388672, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8726149797439575, + "num_tokens": 363344128.0, + "step": 9523 + }, + { + "epoch": 1.2115506932960183, + "ewc_loss": 0.038857147097587585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.942857306858059e-05, + "grad_norm": 26.802288055419922, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8645079135894775, + "num_tokens": 363385903.0, + "step": 9524 + }, + { + "epoch": 1.2116779035746088, + "ewc_loss": 0.038863956928253174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9431978216744028e-05, + "grad_norm": 26.827167510986328, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8664584159851074, + "num_tokens": 363425216.0, + "step": 9525 + }, + { + "epoch": 1.2118051138531993, + "ewc_loss": 0.03888586536049843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.944293217093218e-05, + "grad_norm": 26.82240104675293, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.875279426574707, + "num_tokens": 363462691.0, + "step": 9526 + }, + { + "epoch": 1.2119323241317899, + "ewc_loss": 0.0388096384704113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9404818885959685e-05, + "grad_norm": 26.907846450805664, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8503257632255554, + "num_tokens": 363495849.0, + "step": 9527 + }, + { + "epoch": 1.2120595344103804, + "ewc_loss": 0.03884835168719292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.942417657119222e-05, + "grad_norm": 26.737489700317383, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8643392324447632, + "num_tokens": 363535268.0, + "step": 9528 + }, + { + "epoch": 1.212186744688971, + "ewc_loss": 0.03884430229663849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.942215021699667e-05, + "grad_norm": 26.852630615234375, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8480101823806763, + "num_tokens": 363573523.0, + "step": 9529 + }, + { + "epoch": 1.2123139549675614, + "ewc_loss": 0.038926996290683746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946349766512867e-05, + "grad_norm": 26.887964248657227, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8639203310012817, + "num_tokens": 363607545.0, + "step": 9530 + }, + { + "epoch": 1.212441165246152, + "ewc_loss": 0.0387834832072258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9391742171137594e-05, + "grad_norm": 26.831220626831055, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8675492405891418, + "num_tokens": 363648456.0, + "step": 9531 + }, + { + "epoch": 1.2125683755247425, + "ewc_loss": 0.038822367787361145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9411183529882692e-05, + "grad_norm": 26.81511688232422, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8742206692695618, + "num_tokens": 363688726.0, + "step": 9532 + }, + { + "epoch": 1.212695585803333, + "ewc_loss": 0.0388147197663784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.940736001415644e-05, + "grad_norm": 26.81382179260254, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8572725057601929, + "num_tokens": 363728557.0, + "step": 9533 + }, + { + "epoch": 1.2128227960819233, + "ewc_loss": 0.03886881098151207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9434404748608358e-05, + "grad_norm": 26.924156188964844, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8646959066390991, + "num_tokens": 363764191.0, + "step": 9534 + }, + { + "epoch": 1.2129500063605139, + "ewc_loss": 0.038882143795490265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9441071344772354e-05, + "grad_norm": 26.804712295532227, + "learning_rate": 1e-06, + "loss": 0.4642, + 
"mean_token_accuracy": 0.8527336120605469, + "num_tokens": 363798126.0, + "step": 9535 + }, + { + "epoch": 1.2130772166391044, + "ewc_loss": 0.03882549703121185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.941274786076974e-05, + "grad_norm": 26.865520477294922, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.848692774772644, + "num_tokens": 363836037.0, + "step": 9536 + }, + { + "epoch": 1.213204426917695, + "ewc_loss": 0.038893382996320724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.944669202202931e-05, + "grad_norm": 26.92087173461914, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8856328129768372, + "num_tokens": 363872359.0, + "step": 9537 + }, + { + "epoch": 1.2133316371962855, + "ewc_loss": 0.03883828595280647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.941914342751261e-05, + "grad_norm": 26.906810760498047, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8595457077026367, + "num_tokens": 363914206.0, + "step": 9538 + }, + { + "epoch": 1.213458847474876, + "ewc_loss": 0.03881840780377388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9409204469411634e-05, + "grad_norm": 26.903514862060547, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.865707516670227, + "num_tokens": 363956147.0, + "step": 9539 + }, + { + "epoch": 1.2135860577534665, + "ewc_loss": 0.03883267939090729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9416340364841744e-05, + "grad_norm": 26.929956436157227, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8825246095657349, + "num_tokens": 363986328.0, + "step": 9540 + }, + { + "epoch": 1.213713268032057, + "ewc_loss": 0.0388316735625267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9415836504776962e-05, + "grad_norm": 26.864147186279297, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8619540929794312, + "num_tokens": 364024905.0, + "step": 9541 + }, + { + "epoch": 
1.2138404783106476, + "ewc_loss": 0.03879905119538307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9399525626795366e-05, + "grad_norm": 26.926942825317383, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8669002056121826, + "num_tokens": 364068628.0, + "step": 9542 + }, + { + "epoch": 1.213967688589238, + "ewc_loss": 0.03878101333975792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9390507077332586e-05, + "grad_norm": 26.822866439819336, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8608620166778564, + "num_tokens": 364104261.0, + "step": 9543 + }, + { + "epoch": 1.2140948988678284, + "ewc_loss": 0.03880435600876808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9402177713345736e-05, + "grad_norm": 26.974384307861328, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8627340197563171, + "num_tokens": 364147524.0, + "step": 9544 + }, + { + "epoch": 1.214222109146419, + "ewc_loss": 0.03887585923075676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943792995007243e-05, + "grad_norm": 26.904438018798828, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8669764399528503, + "num_tokens": 364187715.0, + "step": 9545 + }, + { + "epoch": 1.2143493194250095, + "ewc_loss": 0.03870305046439171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.93515261344146e-05, + "grad_norm": 27.02421760559082, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.875529944896698, + "num_tokens": 364226081.0, + "step": 9546 + }, + { + "epoch": 1.2144765297036, + "ewc_loss": 0.03880215436220169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.940107722475659e-05, + "grad_norm": 26.693134307861328, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8645817041397095, + "num_tokens": 364268390.0, + "step": 9547 + }, + { + "epoch": 1.2146037399821905, + "ewc_loss": 0.03877655044198036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9388275177334435e-05, + 
"grad_norm": 27.04758071899414, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8679767847061157, + "num_tokens": 364310176.0, + "step": 9548 + }, + { + "epoch": 1.214730950260781, + "ewc_loss": 0.0387808233499527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.93904124898836e-05, + "grad_norm": 26.814067840576172, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8758794069290161, + "num_tokens": 364345652.0, + "step": 9549 + }, + { + "epoch": 1.2148581605393716, + "ewc_loss": 0.038704995065927505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9352497474756092e-05, + "grad_norm": 26.936824798583984, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.860339343547821, + "num_tokens": 364383810.0, + "step": 9550 + }, + { + "epoch": 1.214985370817962, + "ewc_loss": 0.0387815497815609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9390774468774907e-05, + "grad_norm": 26.839906692504883, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8692255020141602, + "num_tokens": 364421169.0, + "step": 9551 + }, + { + "epoch": 1.2151125810965526, + "ewc_loss": 0.03867875039577484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9339375285198912e-05, + "grad_norm": 26.882862091064453, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8583928346633911, + "num_tokens": 364452552.0, + "step": 9552 + }, + { + "epoch": 1.2152397913751432, + "ewc_loss": 0.03877219557762146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.938609784701839e-05, + "grad_norm": 26.99557876586914, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.862870454788208, + "num_tokens": 364486779.0, + "step": 9553 + }, + { + "epoch": 1.2153670016537337, + "ewc_loss": 0.03874173387885094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.93708674487425e-05, + "grad_norm": 26.818740844726562, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.874674916267395, + 
"num_tokens": 364526237.0, + "step": 9554 + }, + { + "epoch": 1.2154942119323242, + "ewc_loss": 0.038772180676460266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9386090571060777e-05, + "grad_norm": 27.00999641418457, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8710087537765503, + "num_tokens": 364568446.0, + "step": 9555 + }, + { + "epoch": 1.2156214222109147, + "ewc_loss": 0.038740627467632294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9370314475963823e-05, + "grad_norm": 26.875795364379883, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8725208044052124, + "num_tokens": 364604448.0, + "step": 9556 + }, + { + "epoch": 1.2157486324895053, + "ewc_loss": 0.03865988180041313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.932994018716272e-05, + "grad_norm": 26.796031951904297, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8624955415725708, + "num_tokens": 364642707.0, + "step": 9557 + }, + { + "epoch": 1.2158758427680956, + "ewc_loss": 0.03881041333079338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.940520633070264e-05, + "grad_norm": 26.9735107421875, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8597152233123779, + "num_tokens": 364678428.0, + "step": 9558 + }, + { + "epoch": 1.216003053046686, + "ewc_loss": 0.03878478333353996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.939239155035466e-05, + "grad_norm": 26.84964370727539, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8789435625076294, + "num_tokens": 364718915.0, + "step": 9559 + }, + { + "epoch": 1.2161302633252766, + "ewc_loss": 0.0387248657643795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.936243279487826e-05, + "grad_norm": 26.83347511291504, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8816103339195251, + "num_tokens": 364756030.0, + "step": 9560 + }, + { + "epoch": 1.2162574736038672, + "ewc_loss": 0.03882049769163132, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.941024856932927e-05, + "grad_norm": 26.917776107788086, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8624842166900635, + "num_tokens": 364796158.0, + "step": 9561 + }, + { + "epoch": 1.2163846838824577, + "ewc_loss": 0.03877762332558632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.938881177920848e-05, + "grad_norm": 26.82794952392578, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8734809160232544, + "num_tokens": 364832722.0, + "step": 9562 + }, + { + "epoch": 1.2165118941610482, + "ewc_loss": 0.038870032876729965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943501592904795e-05, + "grad_norm": 26.88105583190918, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8639140725135803, + "num_tokens": 364874120.0, + "step": 9563 + }, + { + "epoch": 1.2166391044396387, + "ewc_loss": 0.03880203142762184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.940101537911687e-05, + "grad_norm": 26.821889877319336, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8672569394111633, + "num_tokens": 364913166.0, + "step": 9564 + }, + { + "epoch": 1.2167663147182293, + "ewc_loss": 0.0388181135058403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9409057131269947e-05, + "grad_norm": 26.8581485748291, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8473749160766602, + "num_tokens": 364949586.0, + "step": 9565 + }, + { + "epoch": 1.2168935249968198, + "ewc_loss": 0.03884802758693695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9424014681135304e-05, + "grad_norm": 26.81489372253418, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8468799591064453, + "num_tokens": 364989070.0, + "step": 9566 + }, + { + "epoch": 1.2170207352754103, + "ewc_loss": 0.03887341916561127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943670940818265e-05, + "grad_norm": 26.96261978149414, + "learning_rate": 
1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8683290481567383, + "num_tokens": 365027050.0, + "step": 9567 + }, + { + "epoch": 1.2171479455540009, + "ewc_loss": 0.03884056583046913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9420282114879228e-05, + "grad_norm": 26.805803298950195, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8650042414665222, + "num_tokens": 365060050.0, + "step": 9568 + }, + { + "epoch": 1.2172751558325912, + "ewc_loss": 0.0388265885412693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9413293557590805e-05, + "grad_norm": 26.98664665222168, + "learning_rate": 1e-06, + "loss": 0.5273, + "mean_token_accuracy": 0.8348015546798706, + "num_tokens": 365098030.0, + "step": 9569 + }, + { + "epoch": 1.2174023661111817, + "ewc_loss": 0.038901638239622116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9450819308985956e-05, + "grad_norm": 26.8498477935791, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8662163019180298, + "num_tokens": 365141143.0, + "step": 9570 + }, + { + "epoch": 1.2175295763897722, + "ewc_loss": 0.038787323981523514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9393661204958335e-05, + "grad_norm": 26.817625045776367, + "learning_rate": 1e-06, + "loss": 0.524, + "mean_token_accuracy": 0.8413399457931519, + "num_tokens": 365174204.0, + "step": 9571 + }, + { + "epoch": 1.2176567866683627, + "ewc_loss": 0.038841430097818375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9420715034357272e-05, + "grad_norm": 26.79139518737793, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.864741325378418, + "num_tokens": 365207170.0, + "step": 9572 + }, + { + "epoch": 1.2177839969469533, + "ewc_loss": 0.03886083886027336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943041934282519e-05, + "grad_norm": 27.011985778808594, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8678921461105347, + "num_tokens": 365248926.0, + "step": 9573 + }, + 
{ + "epoch": 1.2179112072255438, + "ewc_loss": 0.038866184651851654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.94330932572484e-05, + "grad_norm": 26.85093116760254, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8675969839096069, + "num_tokens": 365283588.0, + "step": 9574 + }, + { + "epoch": 1.2180384175041343, + "ewc_loss": 0.03883838281035423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.94191907212371e-05, + "grad_norm": 26.951871871948242, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8625940084457397, + "num_tokens": 365324869.0, + "step": 9575 + }, + { + "epoch": 1.2181656277827249, + "ewc_loss": 0.038914620876312256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945731128216721e-05, + "grad_norm": 26.861677169799805, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8621878623962402, + "num_tokens": 365366832.0, + "step": 9576 + }, + { + "epoch": 1.2182928380613154, + "ewc_loss": 0.03888997063040733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9444985809968784e-05, + "grad_norm": 26.996517181396484, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.863050639629364, + "num_tokens": 365401148.0, + "step": 9577 + }, + { + "epoch": 1.218420048339906, + "ewc_loss": 0.03891933336853981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9459666873444803e-05, + "grad_norm": 26.853988647460938, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8666732311248779, + "num_tokens": 365442303.0, + "step": 9578 + }, + { + "epoch": 1.2185472586184964, + "ewc_loss": 0.03885994479060173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9429971871431917e-05, + "grad_norm": 26.84376335144043, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8771514892578125, + "num_tokens": 365477877.0, + "step": 9579 + }, + { + "epoch": 1.218674468897087, + "ewc_loss": 0.0389365591108799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9468279788270593e-05, + "grad_norm": 27.00278663635254, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8530255556106567, + "num_tokens": 365520648.0, + "step": 9580 + }, + { + "epoch": 1.2188016791756775, + "ewc_loss": 0.0388508141040802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9425406208029017e-05, + "grad_norm": 26.804855346679688, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8476817011833191, + "num_tokens": 365555129.0, + "step": 9581 + }, + { + "epoch": 1.218928889454268, + "ewc_loss": 0.03881869465112686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9409348169574514e-05, + "grad_norm": 26.868698120117188, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8699414730072021, + "num_tokens": 365594368.0, + "step": 9582 + }, + { + "epoch": 1.2190560997328583, + "ewc_loss": 0.03890455514192581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.94522781384876e-05, + "grad_norm": 26.868976593017578, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.863585889339447, + "num_tokens": 365634763.0, + "step": 9583 + }, + { + "epoch": 1.2191833100114489, + "ewc_loss": 0.03883510082960129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9417549992795102e-05, + "grad_norm": 26.838056564331055, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8808470964431763, + "num_tokens": 365675951.0, + "step": 9584 + }, + { + "epoch": 1.2193105202900394, + "ewc_loss": 0.03890777379274368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9453886125120334e-05, + "grad_norm": 26.877649307250977, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.87293541431427, + "num_tokens": 365710348.0, + "step": 9585 + }, + { + "epoch": 1.21943773056863, + "ewc_loss": 0.038923777639865875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9461889678495936e-05, + "grad_norm": 26.900808334350586, + "learning_rate": 1e-06, + "loss": 0.4288, + 
"mean_token_accuracy": 0.8658822774887085, + "num_tokens": 365745166.0, + "step": 9586 + }, + { + "epoch": 1.2195649408472204, + "ewc_loss": 0.03893555328249931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946777592820581e-05, + "grad_norm": 26.76299285888672, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8782898187637329, + "num_tokens": 365780122.0, + "step": 9587 + }, + { + "epoch": 1.219692151125811, + "ewc_loss": 0.0388900451362133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9445022189756855e-05, + "grad_norm": 26.92705535888672, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8650352954864502, + "num_tokens": 365816326.0, + "step": 9588 + }, + { + "epoch": 1.2198193614044015, + "ewc_loss": 0.038899071514606476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9449535102467053e-05, + "grad_norm": 26.85907554626465, + "learning_rate": 1e-06, + "loss": 0.5132, + "mean_token_accuracy": 0.8392798900604248, + "num_tokens": 365858938.0, + "step": 9589 + }, + { + "epoch": 1.219946571682992, + "ewc_loss": 0.03895501047372818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9477505702525377e-05, + "grad_norm": 26.891674041748047, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8541209101676941, + "num_tokens": 365893892.0, + "step": 9590 + }, + { + "epoch": 1.2200737819615826, + "ewc_loss": 0.038942284882068634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9471142877591774e-05, + "grad_norm": 26.864831924438477, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.854797899723053, + "num_tokens": 365938787.0, + "step": 9591 + }, + { + "epoch": 1.220200992240173, + "ewc_loss": 0.03889358043670654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9446790247457102e-05, + "grad_norm": 26.83784294128418, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8639814853668213, + "num_tokens": 365972664.0, + "step": 9592 + }, + { + "epoch": 
1.2203282025187634, + "ewc_loss": 0.03891441971063614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9457209418760613e-05, + "grad_norm": 26.865671157836914, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8711413145065308, + "num_tokens": 366013367.0, + "step": 9593 + }, + { + "epoch": 1.220455412797354, + "ewc_loss": 0.03890446200966835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9452230844763108e-05, + "grad_norm": 26.836910247802734, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8719558119773865, + "num_tokens": 366051637.0, + "step": 9594 + }, + { + "epoch": 1.2205826230759445, + "ewc_loss": 0.038985755294561386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9492877981974743e-05, + "grad_norm": 26.980878829956055, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8614473342895508, + "num_tokens": 366092713.0, + "step": 9595 + }, + { + "epoch": 1.220709833354535, + "ewc_loss": 0.038926370441913605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946318479895126e-05, + "grad_norm": 26.8712215423584, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8652793169021606, + "num_tokens": 366131594.0, + "step": 9596 + }, + { + "epoch": 1.2208370436331255, + "ewc_loss": 0.038870446383953094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9435223293839954e-05, + "grad_norm": 26.93472671508789, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8591265678405762, + "num_tokens": 366170871.0, + "step": 9597 + }, + { + "epoch": 1.220964253911716, + "ewc_loss": 0.03892933949828148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9464669094304554e-05, + "grad_norm": 26.908000946044922, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.85467529296875, + "num_tokens": 366212974.0, + "step": 9598 + }, + { + "epoch": 1.2210914641903066, + "ewc_loss": 0.03881171718239784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9405859347898513e-05, + "grad_norm": 26.823549270629883, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8567818403244019, + "num_tokens": 366255904.0, + "step": 9599 + }, + { + "epoch": 1.221218674468897, + "ewc_loss": 0.03883342072367668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9416709619690664e-05, + "grad_norm": 26.746212005615234, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8704738020896912, + "num_tokens": 366288887.0, + "step": 9600 + }, + { + "epoch": 1.2213458847474876, + "ewc_loss": 0.038916200399398804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945810072356835e-05, + "grad_norm": 26.883607864379883, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8780922293663025, + "num_tokens": 366328796.0, + "step": 9601 + }, + { + "epoch": 1.2214730950260781, + "ewc_loss": 0.03894082084298134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9470409824862145e-05, + "grad_norm": 26.80133628845215, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8617714643478394, + "num_tokens": 366373093.0, + "step": 9602 + }, + { + "epoch": 1.2216003053046687, + "ewc_loss": 0.0388466902077198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.94233452930348e-05, + "grad_norm": 26.767499923706055, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8733322620391846, + "num_tokens": 366414353.0, + "step": 9603 + }, + { + "epoch": 1.2217275155832592, + "ewc_loss": 0.038929883390665054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9464941942715086e-05, + "grad_norm": 26.935096740722656, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8779659867286682, + "num_tokens": 366450683.0, + "step": 9604 + }, + { + "epoch": 1.2218547258618497, + "ewc_loss": 0.03894093260169029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9470466213533655e-05, + "grad_norm": 26.811485290527344, + "learning_rate": 1e-06, + "loss": 0.4219, + 
"mean_token_accuracy": 0.8701193332672119, + "num_tokens": 366487158.0, + "step": 9605 + }, + { + "epoch": 1.2219819361404403, + "ewc_loss": 0.038979049772024155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9489525584504008e-05, + "grad_norm": 26.896146774291992, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8664253950119019, + "num_tokens": 366529984.0, + "step": 9606 + }, + { + "epoch": 1.2221091464190306, + "ewc_loss": 0.03889656811952591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9448283637757413e-05, + "grad_norm": 26.825773239135742, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8684436082839966, + "num_tokens": 366564100.0, + "step": 9607 + }, + { + "epoch": 1.222236356697621, + "ewc_loss": 0.03895799443125725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9478997273836285e-05, + "grad_norm": 26.92666244506836, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8713564872741699, + "num_tokens": 366603752.0, + "step": 9608 + }, + { + "epoch": 1.2223635669762116, + "ewc_loss": 0.038943659514188766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9471830455586314e-05, + "grad_norm": 26.96681022644043, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8669329881668091, + "num_tokens": 366642170.0, + "step": 9609 + }, + { + "epoch": 1.2224907772548022, + "ewc_loss": 0.038914572447538376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945728581631556e-05, + "grad_norm": 26.851594924926758, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8509594798088074, + "num_tokens": 366684038.0, + "step": 9610 + }, + { + "epoch": 1.2226179875333927, + "ewc_loss": 0.03887099027633667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9435494323261082e-05, + "grad_norm": 26.92332649230957, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.862384557723999, + "num_tokens": 366717832.0, + "step": 9611 + }, + { + "epoch": 
1.2227451978119832, + "ewc_loss": 0.03897229954600334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9486149540171027e-05, + "grad_norm": 26.98088264465332, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8593975305557251, + "num_tokens": 366757999.0, + "step": 9612 + }, + { + "epoch": 1.2228724080905737, + "ewc_loss": 0.038844842463731766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9422421246417798e-05, + "grad_norm": 26.757787704467773, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8737819194793701, + "num_tokens": 366798533.0, + "step": 9613 + }, + { + "epoch": 1.2229996183691643, + "ewc_loss": 0.038899924606084824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9449962564976886e-05, + "grad_norm": 26.974336624145508, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.860960841178894, + "num_tokens": 366843066.0, + "step": 9614 + }, + { + "epoch": 1.2231268286477548, + "ewc_loss": 0.03898340463638306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9491702914820053e-05, + "grad_norm": 26.80364418029785, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8809289932250977, + "num_tokens": 366883190.0, + "step": 9615 + }, + { + "epoch": 1.2232540389263453, + "ewc_loss": 0.03881548345088959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9407742001931183e-05, + "grad_norm": 26.803483963012695, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8694896101951599, + "num_tokens": 366920642.0, + "step": 9616 + }, + { + "epoch": 1.2233812492049359, + "ewc_loss": 0.03900197148323059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9500985217746347e-05, + "grad_norm": 27.01525115966797, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8650075793266296, + "num_tokens": 366954468.0, + "step": 9617 + }, + { + "epoch": 1.2235084594835262, + "ewc_loss": 0.03894513472914696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.947256714629475e-05, + "grad_norm": 26.855554580688477, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8490830659866333, + "num_tokens": 366991170.0, + "step": 9618 + }, + { + "epoch": 1.2236356697621167, + "ewc_loss": 0.0388798750936985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9439938114373945e-05, + "grad_norm": 26.948593139648438, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8687677979469299, + "num_tokens": 367031833.0, + "step": 9619 + }, + { + "epoch": 1.2237628800407072, + "ewc_loss": 0.03897765651345253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.948882891156245e-05, + "grad_norm": 26.994457244873047, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8604298830032349, + "num_tokens": 367074709.0, + "step": 9620 + }, + { + "epoch": 1.2238900903192977, + "ewc_loss": 0.038908831775188446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9454415451036766e-05, + "grad_norm": 26.979032516479492, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8612461090087891, + "num_tokens": 367106820.0, + "step": 9621 + }, + { + "epoch": 1.2240173005978883, + "ewc_loss": 0.03892415761947632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9462078853393905e-05, + "grad_norm": 26.885936737060547, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8789288997650146, + "num_tokens": 367142298.0, + "step": 9622 + }, + { + "epoch": 1.2241445108764788, + "ewc_loss": 0.038894928991794586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9447465092525817e-05, + "grad_norm": 26.982379913330078, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8545766472816467, + "num_tokens": 367179957.0, + "step": 9623 + }, + { + "epoch": 1.2242717211550693, + "ewc_loss": 0.03895137086510658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9475684894132428e-05, + "grad_norm": 26.85733413696289, + "learning_rate": 1e-06, + "loss": 0.4235, + 
"mean_token_accuracy": 0.8668542504310608, + "num_tokens": 367226800.0, + "step": 9624 + }, + { + "epoch": 1.2243989314336599, + "ewc_loss": 0.03884495794773102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.942247945407871e-05, + "grad_norm": 26.99134063720703, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8692348003387451, + "num_tokens": 367257168.0, + "step": 9625 + }, + { + "epoch": 1.2245261417122504, + "ewc_loss": 0.03901486098766327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.950742989720311e-05, + "grad_norm": 27.0367488861084, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.866070032119751, + "num_tokens": 367295254.0, + "step": 9626 + }, + { + "epoch": 1.224653351990841, + "ewc_loss": 0.03888249769806862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.94412496057339e-05, + "grad_norm": 26.9328556060791, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8526125550270081, + "num_tokens": 367335814.0, + "step": 9627 + }, + { + "epoch": 1.2247805622694314, + "ewc_loss": 0.03886692598462105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943346251209732e-05, + "grad_norm": 26.854188919067383, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.861213207244873, + "num_tokens": 367377592.0, + "step": 9628 + }, + { + "epoch": 1.224907772548022, + "ewc_loss": 0.03885360062122345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9426799553912133e-05, + "grad_norm": 26.90879249572754, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8562282919883728, + "num_tokens": 367408069.0, + "step": 9629 + }, + { + "epoch": 1.2250349828266125, + "ewc_loss": 0.038953714072704315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.947685632330831e-05, + "grad_norm": 26.942398071289062, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8662792444229126, + "num_tokens": 367444787.0, + "step": 9630 + }, + { + "epoch": 1.225162193105203, + 
"ewc_loss": 0.03889578580856323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.944789255503565e-05, + "grad_norm": 26.871871948242188, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8507234454154968, + "num_tokens": 367486848.0, + "step": 9631 + }, + { + "epoch": 1.2252894033837933, + "ewc_loss": 0.0388994924724102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9449746105237864e-05, + "grad_norm": 26.994915008544922, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8750426769256592, + "num_tokens": 367530456.0, + "step": 9632 + }, + { + "epoch": 1.2254166136623839, + "ewc_loss": 0.038859665393829346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9429833628237247e-05, + "grad_norm": 26.912269592285156, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8682751655578613, + "num_tokens": 367566513.0, + "step": 9633 + }, + { + "epoch": 1.2255438239409744, + "ewc_loss": 0.03892598673701286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946299380506389e-05, + "grad_norm": 27.003084182739258, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8438671231269836, + "num_tokens": 367607757.0, + "step": 9634 + }, + { + "epoch": 1.225671034219565, + "ewc_loss": 0.038945842534303665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9472921849228442e-05, + "grad_norm": 26.970657348632812, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8670715689659119, + "num_tokens": 367645525.0, + "step": 9635 + }, + { + "epoch": 1.2257982444981554, + "ewc_loss": 0.038883280009031296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9441640688455664e-05, + "grad_norm": 26.964092254638672, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8737150430679321, + "num_tokens": 367680840.0, + "step": 9636 + }, + { + "epoch": 1.225925454776746, + "ewc_loss": 0.03892555832862854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946277916431427e-05, + "grad_norm": 
26.987489700317383, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.873577892780304, + "num_tokens": 367717444.0, + "step": 9637 + }, + { + "epoch": 1.2260526650553365, + "ewc_loss": 0.0388641282916069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9432063709245995e-05, + "grad_norm": 26.867891311645508, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8443546295166016, + "num_tokens": 367756548.0, + "step": 9638 + }, + { + "epoch": 1.226179875333927, + "ewc_loss": 0.03892058879137039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9460294424789026e-05, + "grad_norm": 27.010360717773438, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8597924113273621, + "num_tokens": 367798341.0, + "step": 9639 + }, + { + "epoch": 1.2263070856125176, + "ewc_loss": 0.03891071677207947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9455357687547803e-05, + "grad_norm": 26.870094299316406, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8713574409484863, + "num_tokens": 367835087.0, + "step": 9640 + }, + { + "epoch": 1.226434295891108, + "ewc_loss": 0.03897097334265709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9485487428028136e-05, + "grad_norm": 27.089746475219727, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8663331270217896, + "num_tokens": 367873816.0, + "step": 9641 + }, + { + "epoch": 1.2265615061696984, + "ewc_loss": 0.03897055983543396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9485280063236132e-05, + "grad_norm": 26.91107749938965, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8601669073104858, + "num_tokens": 367913621.0, + "step": 9642 + }, + { + "epoch": 1.226688716448289, + "ewc_loss": 0.03882281109690666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9411405446589924e-05, + "grad_norm": 27.030105590820312, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8564100861549377, + 
"num_tokens": 367946881.0, + "step": 9643 + }, + { + "epoch": 1.2268159267268794, + "ewc_loss": 0.03893575444817543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946787779161241e-05, + "grad_norm": 26.827533721923828, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8696135878562927, + "num_tokens": 367980806.0, + "step": 9644 + }, + { + "epoch": 1.22694313700547, + "ewc_loss": 0.03888062387704849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9440312826191075e-05, + "grad_norm": 27.058109283447266, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8723340630531311, + "num_tokens": 368016041.0, + "step": 9645 + }, + { + "epoch": 1.2270703472840605, + "ewc_loss": 0.03899521380662918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9497607354423963e-05, + "grad_norm": 27.004413604736328, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8623005151748657, + "num_tokens": 368052331.0, + "step": 9646 + }, + { + "epoch": 1.227197557562651, + "ewc_loss": 0.038869500160217285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9434750356595032e-05, + "grad_norm": 26.881052017211914, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8729488253593445, + "num_tokens": 368091458.0, + "step": 9647 + }, + { + "epoch": 1.2273247678412416, + "ewc_loss": 0.03889213874936104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9446069927653298e-05, + "grad_norm": 27.00382423400879, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8532249331474304, + "num_tokens": 368132433.0, + "step": 9648 + }, + { + "epoch": 1.227451978119832, + "ewc_loss": 0.03898143023252487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9490715203573927e-05, + "grad_norm": 26.98577880859375, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.86509108543396, + "num_tokens": 368171030.0, + "step": 9649 + }, + { + "epoch": 1.2275791883984226, + "ewc_loss": 0.038833826780319214, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.941691334650386e-05, + "grad_norm": 26.979965209960938, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8531143665313721, + "num_tokens": 368211555.0, + "step": 9650 + }, + { + "epoch": 1.2277063986770131, + "ewc_loss": 0.038951415568590164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9475708540994674e-05, + "grad_norm": 27.002613067626953, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8706428408622742, + "num_tokens": 368252076.0, + "step": 9651 + }, + { + "epoch": 1.2278336089556037, + "ewc_loss": 0.03890719264745712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945359690580517e-05, + "grad_norm": 27.018579483032227, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8588104248046875, + "num_tokens": 368291265.0, + "step": 9652 + }, + { + "epoch": 1.2279608192341942, + "ewc_loss": 0.038896869868040085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9448434613877907e-05, + "grad_norm": 26.9062442779541, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8576392531394958, + "num_tokens": 368332178.0, + "step": 9653 + }, + { + "epoch": 1.2280880295127847, + "ewc_loss": 0.03889985755085945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.944992800417822e-05, + "grad_norm": 27.00580596923828, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8632791638374329, + "num_tokens": 368374103.0, + "step": 9654 + }, + { + "epoch": 1.2282152397913753, + "ewc_loss": 0.03887256607413292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943628376466222e-05, + "grad_norm": 26.919620513916016, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8679484128952026, + "num_tokens": 368411928.0, + "step": 9655 + }, + { + "epoch": 1.2283424500699656, + "ewc_loss": 0.038909222930669785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945461190189235e-05, + "grad_norm": 26.998899459838867, + 
"learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.8399014472961426, + "num_tokens": 368449200.0, + "step": 9656 + }, + { + "epoch": 1.228469660348556, + "ewc_loss": 0.038913168013095856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9456583686405793e-05, + "grad_norm": 27.033191680908203, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8466602563858032, + "num_tokens": 368489935.0, + "step": 9657 + }, + { + "epoch": 1.2285968706271466, + "ewc_loss": 0.038932424038648605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946621159731876e-05, + "grad_norm": 26.99268913269043, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8768404722213745, + "num_tokens": 368522227.0, + "step": 9658 + }, + { + "epoch": 1.2287240809057371, + "ewc_loss": 0.038905009627342224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9452505512163043e-05, + "grad_norm": 27.089109420776367, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8639719486236572, + "num_tokens": 368555084.0, + "step": 9659 + }, + { + "epoch": 1.2288512911843277, + "ewc_loss": 0.03892416134476662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946208067238331e-05, + "grad_norm": 26.934093475341797, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8609338998794556, + "num_tokens": 368591845.0, + "step": 9660 + }, + { + "epoch": 1.2289785014629182, + "ewc_loss": 0.0388820543885231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.944102768902667e-05, + "grad_norm": 26.985151290893555, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8528426885604858, + "num_tokens": 368637841.0, + "step": 9661 + }, + { + "epoch": 1.2291057117415087, + "ewc_loss": 0.0389738492667675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9486924429656938e-05, + "grad_norm": 26.91947364807129, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8797347545623779, + "num_tokens": 368677470.0, + 
"step": 9662 + }, + { + "epoch": 1.2292329220200993, + "ewc_loss": 0.03890172019600868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9450859326752834e-05, + "grad_norm": 26.903995513916016, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.878059446811676, + "num_tokens": 368718412.0, + "step": 9663 + }, + { + "epoch": 1.2293601322986898, + "ewc_loss": 0.038955263793468475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9477631212794222e-05, + "grad_norm": 27.01775360107422, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8532576560974121, + "num_tokens": 368759267.0, + "step": 9664 + }, + { + "epoch": 1.2294873425772803, + "ewc_loss": 0.039041146636009216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9520573914633133e-05, + "grad_norm": 27.06014060974121, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8543221950531006, + "num_tokens": 368797586.0, + "step": 9665 + }, + { + "epoch": 1.2296145528558708, + "ewc_loss": 0.03887741640210152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9438708477537148e-05, + "grad_norm": 27.000545501708984, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.856435239315033, + "num_tokens": 368833801.0, + "step": 9666 + }, + { + "epoch": 1.2297417631344612, + "ewc_loss": 0.038898032158613205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9449016690487042e-05, + "grad_norm": 26.894855499267578, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8485673666000366, + "num_tokens": 368878513.0, + "step": 9667 + }, + { + "epoch": 1.2298689734130517, + "ewc_loss": 0.03892745450139046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9463726857793517e-05, + "grad_norm": 26.955162048339844, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8570298552513123, + "num_tokens": 368917815.0, + "step": 9668 + }, + { + "epoch": 1.2299961836916422, + "ewc_loss": 0.038971614092588425, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.948580757016316e-05, + "grad_norm": 26.895559310913086, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.852984607219696, + "num_tokens": 368954971.0, + "step": 9669 + }, + { + "epoch": 1.2301233939702327, + "ewc_loss": 0.03894726559519768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9473633074085228e-05, + "grad_norm": 27.001203536987305, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8687336444854736, + "num_tokens": 368996212.0, + "step": 9670 + }, + { + "epoch": 1.2302506042488233, + "ewc_loss": 0.03897517919540405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9487590179778636e-05, + "grad_norm": 26.935684204101562, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8770996332168579, + "num_tokens": 369032186.0, + "step": 9671 + }, + { + "epoch": 1.2303778145274138, + "ewc_loss": 0.03891800343990326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9459001123323105e-05, + "grad_norm": 26.90927505493164, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8687447309494019, + "num_tokens": 369077565.0, + "step": 9672 + }, + { + "epoch": 1.2305050248060043, + "ewc_loss": 0.03896256908774376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9481283743516542e-05, + "grad_norm": 27.04152488708496, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8719651103019714, + "num_tokens": 369111935.0, + "step": 9673 + }, + { + "epoch": 1.2306322350845948, + "ewc_loss": 0.03896426036953926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.948212957358919e-05, + "grad_norm": 26.93563461303711, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8624299764633179, + "num_tokens": 369154374.0, + "step": 9674 + }, + { + "epoch": 1.2307594453631854, + "ewc_loss": 0.03889428824186325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9447144950390793e-05, + "grad_norm": 26.908100128173828, + "learning_rate": 1e-06, + "loss": 
0.4629, + "mean_token_accuracy": 0.855648398399353, + "num_tokens": 369198557.0, + "step": 9675 + }, + { + "epoch": 1.230886655641776, + "ewc_loss": 0.03902694582939148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.951347258000169e-05, + "grad_norm": 26.991954803466797, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8480501174926758, + "num_tokens": 369237756.0, + "step": 9676 + }, + { + "epoch": 1.2310138659203664, + "ewc_loss": 0.03898368775844574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9491842977004126e-05, + "grad_norm": 26.91676902770996, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8511645197868347, + "num_tokens": 369270155.0, + "step": 9677 + }, + { + "epoch": 1.231141076198957, + "ewc_loss": 0.03896341100335121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9481705749058165e-05, + "grad_norm": 26.998279571533203, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8575769662857056, + "num_tokens": 369310870.0, + "step": 9678 + }, + { + "epoch": 1.2312682864775475, + "ewc_loss": 0.03897795453667641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.948897806869354e-05, + "grad_norm": 26.897005081176758, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.846537709236145, + "num_tokens": 369344679.0, + "step": 9679 + }, + { + "epoch": 1.231395496756138, + "ewc_loss": 0.03895864263176918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9479321053950116e-05, + "grad_norm": 27.022178649902344, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8638413548469543, + "num_tokens": 369380388.0, + "step": 9680 + }, + { + "epoch": 1.2315227070347283, + "ewc_loss": 0.038960590958595276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9480296032270417e-05, + "grad_norm": 26.94944190979004, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8760616183280945, + "num_tokens": 369422327.0, + "step": 9681 + }, + { + "epoch": 
1.2316499173133189, + "ewc_loss": 0.03894989565014839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9474948203423992e-05, + "grad_norm": 27.03110694885254, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8717302680015564, + "num_tokens": 369457160.0, + "step": 9682 + }, + { + "epoch": 1.2317771275919094, + "ewc_loss": 0.038921814411878586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.946090742421802e-05, + "grad_norm": 26.883323669433594, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8640588521957397, + "num_tokens": 369497883.0, + "step": 9683 + }, + { + "epoch": 1.2319043378705, + "ewc_loss": 0.0388636514544487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943182542163413e-05, + "grad_norm": 26.966690063476562, + "learning_rate": 1e-06, + "loss": 0.346, + "mean_token_accuracy": 0.8937960863113403, + "num_tokens": 369540014.0, + "step": 9684 + }, + { + "epoch": 1.2320315481490904, + "ewc_loss": 0.03902232646942139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9511162463459186e-05, + "grad_norm": 27.149856567382812, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8635814785957336, + "num_tokens": 369580424.0, + "step": 9685 + }, + { + "epoch": 1.232158758427681, + "ewc_loss": 0.0389060378074646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9453018467174843e-05, + "grad_norm": 26.966510772705078, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8605067729949951, + "num_tokens": 369623615.0, + "step": 9686 + }, + { + "epoch": 1.2322859687062715, + "ewc_loss": 0.03887847438454628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943923780345358e-05, + "grad_norm": 27.02819061279297, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8521946668624878, + "num_tokens": 369658408.0, + "step": 9687 + }, + { + "epoch": 1.232413178984862, + "ewc_loss": 0.03893129900097847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9465649529593065e-05, + 
"grad_norm": 27.001468658447266, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8606324195861816, + "num_tokens": 369696311.0, + "step": 9688 + }, + { + "epoch": 1.2325403892634526, + "ewc_loss": 0.038929205387830734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9464603610686027e-05, + "grad_norm": 26.956811904907227, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8509896993637085, + "num_tokens": 369737473.0, + "step": 9689 + }, + { + "epoch": 1.232667599542043, + "ewc_loss": 0.03889327496290207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9446637452347204e-05, + "grad_norm": 27.008686065673828, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8617902994155884, + "num_tokens": 369770151.0, + "step": 9690 + }, + { + "epoch": 1.2327948098206334, + "ewc_loss": 0.03890296816825867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945148324011825e-05, + "grad_norm": 26.922130584716797, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8816657066345215, + "num_tokens": 369813503.0, + "step": 9691 + }, + { + "epoch": 1.232922020099224, + "ewc_loss": 0.03884675353765488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.942337621585466e-05, + "grad_norm": 27.02872657775879, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8650650382041931, + "num_tokens": 369848768.0, + "step": 9692 + }, + { + "epoch": 1.2330492303778144, + "ewc_loss": 0.03889698162674904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9448491002549417e-05, + "grad_norm": 26.856689453125, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8674876689910889, + "num_tokens": 369892510.0, + "step": 9693 + }, + { + "epoch": 1.233176440656405, + "ewc_loss": 0.03887363523244858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9436818547546864e-05, + "grad_norm": 27.030038833618164, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8723344206809998, + 
"num_tokens": 369929638.0, + "step": 9694 + }, + { + "epoch": 1.2333036509349955, + "ewc_loss": 0.03899125009775162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9495624655974098e-05, + "grad_norm": 26.86807632446289, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8846018314361572, + "num_tokens": 369971201.0, + "step": 9695 + }, + { + "epoch": 1.233430861213586, + "ewc_loss": 0.038904402405023575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945220174093265e-05, + "grad_norm": 26.946744918823242, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.857833981513977, + "num_tokens": 370011713.0, + "step": 9696 + }, + { + "epoch": 1.2335580714921766, + "ewc_loss": 0.03898048773407936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.949024408531841e-05, + "grad_norm": 26.890321731567383, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8434697389602661, + "num_tokens": 370042647.0, + "step": 9697 + }, + { + "epoch": 1.233685281770767, + "ewc_loss": 0.03894830867648125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9474155124044046e-05, + "grad_norm": 26.92586898803711, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8614102602005005, + "num_tokens": 370083976.0, + "step": 9698 + }, + { + "epoch": 1.2338124920493576, + "ewc_loss": 0.0390063039958477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.950315163412597e-05, + "grad_norm": 26.966943740844727, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8604800701141357, + "num_tokens": 370126435.0, + "step": 9699 + }, + { + "epoch": 1.2339397023279481, + "ewc_loss": 0.0389731302857399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9486564269755036e-05, + "grad_norm": 26.9174747467041, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8720678091049194, + "num_tokens": 370165038.0, + "step": 9700 + }, + { + "epoch": 1.2340669126065387, + "ewc_loss": 0.039021264761686325, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9510633137542754e-05, + "grad_norm": 26.981203079223633, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8557736873626709, + "num_tokens": 370203438.0, + "step": 9701 + }, + { + "epoch": 1.2341941228851292, + "ewc_loss": 0.03895288333296776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9476441593724303e-05, + "grad_norm": 26.92220687866211, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8786308765411377, + "num_tokens": 370235144.0, + "step": 9702 + }, + { + "epoch": 1.2343213331637197, + "ewc_loss": 0.03897063061594963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.94853146240348e-05, + "grad_norm": 26.926280975341797, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8587384223937988, + "num_tokens": 370278936.0, + "step": 9703 + }, + { + "epoch": 1.2344485434423103, + "ewc_loss": 0.03894687443971634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9473436623229645e-05, + "grad_norm": 26.935392379760742, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8665083050727844, + "num_tokens": 370312595.0, + "step": 9704 + }, + { + "epoch": 1.2345757537209006, + "ewc_loss": 0.03901265561580658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.950632758962456e-05, + "grad_norm": 27.00514793395996, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8614728450775146, + "num_tokens": 370351697.0, + "step": 9705 + }, + { + "epoch": 1.234702963999491, + "ewc_loss": 0.038975976407527924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9487988538458012e-05, + "grad_norm": 26.968721389770508, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8767331838607788, + "num_tokens": 370391264.0, + "step": 9706 + }, + { + "epoch": 1.2348301742780816, + "ewc_loss": 0.03890348598361015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9451743355602957e-05, + "grad_norm": 26.987733840942383, + 
"learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8716215491294861, + "num_tokens": 370425248.0, + "step": 9707 + }, + { + "epoch": 1.2349573845566721, + "ewc_loss": 0.03904085233807564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9520426576491445e-05, + "grad_norm": 27.020950317382812, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8423498868942261, + "num_tokens": 370462670.0, + "step": 9708 + }, + { + "epoch": 1.2350845948352627, + "ewc_loss": 0.03889595344662666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9447976228548214e-05, + "grad_norm": 26.918296813964844, + "learning_rate": 1e-06, + "loss": 0.5, + "mean_token_accuracy": 0.8442271947860718, + "num_tokens": 370506126.0, + "step": 9709 + }, + { + "epoch": 1.2352118051138532, + "ewc_loss": 0.038952481001615524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.947623968590051e-05, + "grad_norm": 26.92137336730957, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8736061453819275, + "num_tokens": 370546895.0, + "step": 9710 + }, + { + "epoch": 1.2353390153924437, + "ewc_loss": 0.039003193378448486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.950159639818594e-05, + "grad_norm": 27.08684539794922, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8677424192428589, + "num_tokens": 370590645.0, + "step": 9711 + }, + { + "epoch": 1.2354662256710343, + "ewc_loss": 0.0389384925365448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9469245671643876e-05, + "grad_norm": 26.90889549255371, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8668849468231201, + "num_tokens": 370628614.0, + "step": 9712 + }, + { + "epoch": 1.2355934359496248, + "ewc_loss": 0.03890484943985939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9452425476629287e-05, + "grad_norm": 27.019895553588867, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8659566640853882, + "num_tokens": 370666082.0, + 
"step": 9713 + }, + { + "epoch": 1.2357206462282153, + "ewc_loss": 0.03888990357518196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9444951249170117e-05, + "grad_norm": 26.870342254638672, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8500738739967346, + "num_tokens": 370705780.0, + "step": 9714 + }, + { + "epoch": 1.2358478565068058, + "ewc_loss": 0.038960475474596024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9480237824609503e-05, + "grad_norm": 27.067079544067383, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8695123195648193, + "num_tokens": 370742550.0, + "step": 9715 + }, + { + "epoch": 1.2359750667853961, + "ewc_loss": 0.03905647620558739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.952823731699027e-05, + "grad_norm": 26.988779067993164, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8687812089920044, + "num_tokens": 370784406.0, + "step": 9716 + }, + { + "epoch": 1.2361022770639867, + "ewc_loss": 0.03891422972083092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9457114831311628e-05, + "grad_norm": 27.023927688598633, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8598167896270752, + "num_tokens": 370820911.0, + "step": 9717 + }, + { + "epoch": 1.2362294873425772, + "ewc_loss": 0.03891829401254654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.945914664247539e-05, + "grad_norm": 26.862150192260742, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8750436305999756, + "num_tokens": 370857427.0, + "step": 9718 + }, + { + "epoch": 1.2363566976211677, + "ewc_loss": 0.03893539309501648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9467695892672054e-05, + "grad_norm": 26.979904174804688, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8544739484786987, + "num_tokens": 370897989.0, + "step": 9719 + }, + { + "epoch": 1.2364839078997583, + "ewc_loss": 0.03908177837729454, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.9540888388291933e-05, + "grad_norm": 27.09561538696289, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8524384498596191, + "num_tokens": 370932672.0, + "step": 9720 + }, + { + "epoch": 1.2366111181783488, + "ewc_loss": 0.038967203348875046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.948360113601666e-05, + "grad_norm": 26.846771240234375, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8542822003364563, + "num_tokens": 370971027.0, + "step": 9721 + }, + { + "epoch": 1.2367383284569393, + "ewc_loss": 0.03898609057068825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.949304532899987e-05, + "grad_norm": 27.076608657836914, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8745734691619873, + "num_tokens": 371014742.0, + "step": 9722 + }, + { + "epoch": 1.2368655387355298, + "ewc_loss": 0.03907127305865288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9535636965883896e-05, + "grad_norm": 26.992677688598633, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8487651348114014, + "num_tokens": 371052065.0, + "step": 9723 + }, + { + "epoch": 1.2369927490141204, + "ewc_loss": 0.03894820436835289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9474102373351343e-05, + "grad_norm": 26.849803924560547, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.8426865935325623, + "num_tokens": 371093897.0, + "step": 9724 + }, + { + "epoch": 1.237119959292711, + "ewc_loss": 0.039077360183000565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9538680135156028e-05, + "grad_norm": 27.05959701538086, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8643291592597961, + "num_tokens": 371132390.0, + "step": 9725 + }, + { + "epoch": 1.2372471695713014, + "ewc_loss": 0.039093632251024246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.954681647475809e-05, + "grad_norm": 26.963132858276367, + "learning_rate": 1e-06, + "loss": 
0.3866, + "mean_token_accuracy": 0.8757134079933167, + "num_tokens": 371170468.0, + "step": 9726 + }, + { + "epoch": 1.237374379849892, + "ewc_loss": 0.039006203413009644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9503102521412075e-05, + "grad_norm": 27.038999557495117, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8610800504684448, + "num_tokens": 371209323.0, + "step": 9727 + }, + { + "epoch": 1.2375015901284825, + "ewc_loss": 0.03904739394783974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.952369711943902e-05, + "grad_norm": 26.93304443359375, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8639848232269287, + "num_tokens": 371246526.0, + "step": 9728 + }, + { + "epoch": 1.237628800407073, + "ewc_loss": 0.03905817121267319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9529084966052324e-05, + "grad_norm": 27.153535842895508, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8672394156455994, + "num_tokens": 371282850.0, + "step": 9729 + }, + { + "epoch": 1.2377560106856633, + "ewc_loss": 0.039090920239686966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9545459508663043e-05, + "grad_norm": 26.925922393798828, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8718667030334473, + "num_tokens": 371312614.0, + "step": 9730 + }, + { + "epoch": 1.2378832209642538, + "ewc_loss": 0.03896729275584221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9483646610751748e-05, + "grad_norm": 26.985063552856445, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8668262362480164, + "num_tokens": 371350170.0, + "step": 9731 + }, + { + "epoch": 1.2380104312428444, + "ewc_loss": 0.03910807892680168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.955403968167957e-05, + "grad_norm": 26.97075080871582, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8699922561645508, + "num_tokens": 371389295.0, + "step": 9732 + }, + { + "epoch": 
1.238137641521435, + "ewc_loss": 0.03898210823535919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9491053535602987e-05, + "grad_norm": 26.923463821411133, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8634724617004395, + "num_tokens": 371430579.0, + "step": 9733 + }, + { + "epoch": 1.2382648518000254, + "ewc_loss": 0.03909247741103172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.954623803612776e-05, + "grad_norm": 27.037464141845703, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8551064133644104, + "num_tokens": 371465087.0, + "step": 9734 + }, + { + "epoch": 1.238392062078616, + "ewc_loss": 0.039038535207509995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9519267880241387e-05, + "grad_norm": 26.939743041992188, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8716583251953125, + "num_tokens": 371503549.0, + "step": 9735 + }, + { + "epoch": 1.2385192723572065, + "ewc_loss": 0.03899715095758438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.949857505678665e-05, + "grad_norm": 27.184473037719727, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8836686611175537, + "num_tokens": 371542953.0, + "step": 9736 + }, + { + "epoch": 1.238646482635797, + "ewc_loss": 0.038988564163446426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.949428224179428e-05, + "grad_norm": 26.888916015625, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8676378726959229, + "num_tokens": 371582541.0, + "step": 9737 + }, + { + "epoch": 1.2387736929143875, + "ewc_loss": 0.03896679729223251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9483399228192866e-05, + "grad_norm": 27.124500274658203, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8748618364334106, + "num_tokens": 371617221.0, + "step": 9738 + }, + { + "epoch": 1.238900903192978, + "ewc_loss": 0.039098262786865234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9549132048268802e-05, + "grad_norm": 26.957738876342773, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8673433661460876, + "num_tokens": 371660773.0, + "step": 9739 + }, + { + "epoch": 1.2390281134715684, + "ewc_loss": 0.03896191716194153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9480958144413307e-05, + "grad_norm": 27.21331024169922, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8590562343597412, + "num_tokens": 371694763.0, + "step": 9740 + }, + { + "epoch": 1.239155323750159, + "ewc_loss": 0.03903708979487419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.951854574144818e-05, + "grad_norm": 26.930330276489258, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8641402721405029, + "num_tokens": 371734793.0, + "step": 9741 + }, + { + "epoch": 1.2392825340287494, + "ewc_loss": 0.03887534514069557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.943767347256653e-05, + "grad_norm": 27.00049591064453, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8595722913742065, + "num_tokens": 371775691.0, + "step": 9742 + }, + { + "epoch": 1.23940974430734, + "ewc_loss": 0.039007704704999924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9503851945046335e-05, + "grad_norm": 27.0059814453125, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8562504649162292, + "num_tokens": 371817319.0, + "step": 9743 + }, + { + "epoch": 1.2395369545859305, + "ewc_loss": 0.03889769688248634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9448849343461916e-05, + "grad_norm": 26.87824058532715, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8637082576751709, + "num_tokens": 371847431.0, + "step": 9744 + }, + { + "epoch": 1.239664164864521, + "ewc_loss": 0.039075206965208054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.953760329342913e-05, + "grad_norm": 27.044889450073242, + "learning_rate": 1e-06, + "loss": 0.399, + 
"mean_token_accuracy": 0.877069354057312, + "num_tokens": 371881788.0, + "step": 9745 + }, + { + "epoch": 1.2397913751431116, + "ewc_loss": 0.03901010379195213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9505052478052676e-05, + "grad_norm": 26.823522567749023, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8596912026405334, + "num_tokens": 371917388.0, + "step": 9746 + }, + { + "epoch": 1.239918585421702, + "ewc_loss": 0.03901179879903793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9505900127114728e-05, + "grad_norm": 27.009254455566406, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8735851049423218, + "num_tokens": 371957561.0, + "step": 9747 + }, + { + "epoch": 1.2400457957002926, + "ewc_loss": 0.039123594760894775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9561797671485692e-05, + "grad_norm": 27.0681095123291, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8571161031723022, + "num_tokens": 371987322.0, + "step": 9748 + }, + { + "epoch": 1.2401730059788831, + "ewc_loss": 0.03902880847454071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9514403902576305e-05, + "grad_norm": 26.95757293701172, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8675788640975952, + "num_tokens": 372027263.0, + "step": 9749 + }, + { + "epoch": 1.2403002162574737, + "ewc_loss": 0.039091188460588455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9545594113878906e-05, + "grad_norm": 26.939828872680664, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8691177368164062, + "num_tokens": 372061441.0, + "step": 9750 + }, + { + "epoch": 1.2404274265360642, + "ewc_loss": 0.039072226732969284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9536113541107625e-05, + "grad_norm": 26.91269874572754, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8697059154510498, + "num_tokens": 372099085.0, + "step": 9751 + }, + { + "epoch": 
1.2405546368146547, + "ewc_loss": 0.03909652680158615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9548262571333908e-05, + "grad_norm": 26.973560333251953, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8672534227371216, + "num_tokens": 372134106.0, + "step": 9752 + }, + { + "epoch": 1.2406818470932452, + "ewc_loss": 0.03907279297709465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9536397303454578e-05, + "grad_norm": 26.95136260986328, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8721040487289429, + "num_tokens": 372164635.0, + "step": 9753 + }, + { + "epoch": 1.2408090573718356, + "ewc_loss": 0.039070889353752136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.953544415300712e-05, + "grad_norm": 26.914281845092773, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8696966171264648, + "num_tokens": 372204924.0, + "step": 9754 + }, + { + "epoch": 1.240936267650426, + "ewc_loss": 0.03913269191980362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9566345144994557e-05, + "grad_norm": 26.994422912597656, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8649106025695801, + "num_tokens": 372240939.0, + "step": 9755 + }, + { + "epoch": 1.2410634779290166, + "ewc_loss": 0.03913368284702301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9566841729101725e-05, + "grad_norm": 27.081649780273438, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8606364727020264, + "num_tokens": 372272355.0, + "step": 9756 + }, + { + "epoch": 1.2411906882076071, + "ewc_loss": 0.039180513471364975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.959025757969357e-05, + "grad_norm": 27.012590408325195, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8628901243209839, + "num_tokens": 372309566.0, + "step": 9757 + }, + { + "epoch": 1.2413178984861977, + "ewc_loss": 0.03907717764377594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.953858918568585e-05, + "grad_norm": 26.992530822753906, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8592394590377808, + "num_tokens": 372347768.0, + "step": 9758 + }, + { + "epoch": 1.2414451087647882, + "ewc_loss": 0.039174363017082214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9587181668612175e-05, + "grad_norm": 27.0700626373291, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.87310791015625, + "num_tokens": 372381020.0, + "step": 9759 + }, + { + "epoch": 1.2415723190433787, + "ewc_loss": 0.039071906358003616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9535953470040113e-05, + "grad_norm": 27.065927505493164, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8537018299102783, + "num_tokens": 372422434.0, + "step": 9760 + }, + { + "epoch": 1.2416995293219693, + "ewc_loss": 0.03907769173383713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.953884566319175e-05, + "grad_norm": 26.947988510131836, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8554335832595825, + "num_tokens": 372459342.0, + "step": 9761 + }, + { + "epoch": 1.2418267396005598, + "ewc_loss": 0.039114177227020264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9557088307919912e-05, + "grad_norm": 27.053321838378906, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8682576417922974, + "num_tokens": 372496637.0, + "step": 9762 + }, + { + "epoch": 1.2419539498791503, + "ewc_loss": 0.039129406213760376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9564702597563155e-05, + "grad_norm": 26.945175170898438, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8779507875442505, + "num_tokens": 372532068.0, + "step": 9763 + }, + { + "epoch": 1.2420811601577408, + "ewc_loss": 0.03912898153066635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9564491594792344e-05, + "grad_norm": 27.11855125427246, + "learning_rate": 1e-06, + "loss": 0.4116, + 
"mean_token_accuracy": 0.8714171648025513, + "num_tokens": 372565460.0, + "step": 9764 + }, + { + "epoch": 1.2422083704363311, + "ewc_loss": 0.03912511467933655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9562558009056374e-05, + "grad_norm": 27.008831024169922, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.868030309677124, + "num_tokens": 372605998.0, + "step": 9765 + }, + { + "epoch": 1.2423355807149217, + "ewc_loss": 0.03896847739815712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9484239601297304e-05, + "grad_norm": 26.962915420532227, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8546454906463623, + "num_tokens": 372642038.0, + "step": 9766 + }, + { + "epoch": 1.2424627909935122, + "ewc_loss": 0.03912876173853874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9564380636438727e-05, + "grad_norm": 27.08287239074707, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8607413172721863, + "num_tokens": 372678192.0, + "step": 9767 + }, + { + "epoch": 1.2425900012721027, + "ewc_loss": 0.039086464792490005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9543233065633103e-05, + "grad_norm": 27.042234420776367, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8603682518005371, + "num_tokens": 372712867.0, + "step": 9768 + }, + { + "epoch": 1.2427172115506933, + "ewc_loss": 0.039130762219429016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9565381080610678e-05, + "grad_norm": 27.048885345458984, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8462494611740112, + "num_tokens": 372745790.0, + "step": 9769 + }, + { + "epoch": 1.2428444218292838, + "ewc_loss": 0.039083562791347504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9541781512089074e-05, + "grad_norm": 26.986194610595703, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8749603033065796, + "num_tokens": 372777634.0, + "step": 9770 + }, + { + "epoch": 
1.2429716321078743, + "ewc_loss": 0.03899361193180084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9496805180097e-05, + "grad_norm": 26.925559997558594, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8498296141624451, + "num_tokens": 372820527.0, + "step": 9771 + }, + { + "epoch": 1.2430988423864648, + "ewc_loss": 0.03915637359023094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.957818676601164e-05, + "grad_norm": 27.008352279663086, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8742107152938843, + "num_tokens": 372858479.0, + "step": 9772 + }, + { + "epoch": 1.2432260526650554, + "ewc_loss": 0.03915870562195778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9579352738219313e-05, + "grad_norm": 26.950645446777344, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8713098764419556, + "num_tokens": 372895219.0, + "step": 9773 + }, + { + "epoch": 1.243353262943646, + "ewc_loss": 0.039167776703834534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9583887478802353e-05, + "grad_norm": 26.971485137939453, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8762853145599365, + "num_tokens": 372934491.0, + "step": 9774 + }, + { + "epoch": 1.2434804732222364, + "ewc_loss": 0.03906606137752533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9533030354068615e-05, + "grad_norm": 27.01289176940918, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8572144508361816, + "num_tokens": 372972892.0, + "step": 9775 + }, + { + "epoch": 1.243607683500827, + "ewc_loss": 0.03917070850729942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.958535358426161e-05, + "grad_norm": 27.127145767211914, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8682022094726562, + "num_tokens": 373011564.0, + "step": 9776 + }, + { + "epoch": 1.2437348937794175, + "ewc_loss": 0.039105504751205444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.955275183718186e-05, + "grad_norm": 27.03246307373047, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8660492897033691, + "num_tokens": 373049444.0, + "step": 9777 + }, + { + "epoch": 1.243862104058008, + "ewc_loss": 0.039057377725839615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.952868842636235e-05, + "grad_norm": 26.954545974731445, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8647929430007935, + "num_tokens": 373085466.0, + "step": 9778 + }, + { + "epoch": 1.2439893143365983, + "ewc_loss": 0.03915424272418022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9577120838221163e-05, + "grad_norm": 27.02471923828125, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8426660299301147, + "num_tokens": 373124712.0, + "step": 9779 + }, + { + "epoch": 1.2441165246151888, + "ewc_loss": 0.03911811485886574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9559058273443952e-05, + "grad_norm": 27.155683517456055, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8706997036933899, + "num_tokens": 373160008.0, + "step": 9780 + }, + { + "epoch": 1.2442437348937794, + "ewc_loss": 0.03908641263842583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.954320578079205e-05, + "grad_norm": 26.91159439086914, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8647704720497131, + "num_tokens": 373192110.0, + "step": 9781 + }, + { + "epoch": 1.24437094517237, + "ewc_loss": 0.039083145558834076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9541572328307666e-05, + "grad_norm": 27.007171630859375, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8663268089294434, + "num_tokens": 373234432.0, + "step": 9782 + }, + { + "epoch": 1.2444981554509604, + "ewc_loss": 0.039128538221120834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.956426967808511e-05, + "grad_norm": 27.093170166015625, + "learning_rate": 1e-06, + "loss": 0.4606, + 
"mean_token_accuracy": 0.8550360202789307, + "num_tokens": 373270725.0, + "step": 9783 + }, + { + "epoch": 1.244625365729551, + "ewc_loss": 0.039091408252716064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.954570325324312e-05, + "grad_norm": 27.020715713500977, + "learning_rate": 1e-06, + "loss": 0.519, + "mean_token_accuracy": 0.8429279327392578, + "num_tokens": 373307531.0, + "step": 9784 + }, + { + "epoch": 1.2447525760081415, + "ewc_loss": 0.03908325359225273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9541626897989772e-05, + "grad_norm": 26.939254760742188, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8687359690666199, + "num_tokens": 373351657.0, + "step": 9785 + }, + { + "epoch": 1.244879786286732, + "ewc_loss": 0.039063483476638794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9531742509570904e-05, + "grad_norm": 27.172155380249023, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8697986602783203, + "num_tokens": 373388811.0, + "step": 9786 + }, + { + "epoch": 1.2450069965653225, + "ewc_loss": 0.039094507694244385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.954725303221494e-05, + "grad_norm": 26.96411895751953, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8739156723022461, + "num_tokens": 373423378.0, + "step": 9787 + }, + { + "epoch": 1.245134206843913, + "ewc_loss": 0.039123065769672394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9561532099032775e-05, + "grad_norm": 27.248533248901367, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8665333390235901, + "num_tokens": 373460982.0, + "step": 9788 + }, + { + "epoch": 1.2452614171225034, + "ewc_loss": 0.03911967575550079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9559838619898073e-05, + "grad_norm": 27.058300018310547, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8713686466217041, + "num_tokens": 373501474.0, + "step": 9789 + }, + { + "epoch": 
1.245388627401094, + "ewc_loss": 0.03898995369672775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9494977095746435e-05, + "grad_norm": 27.06183624267578, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8694056272506714, + "num_tokens": 373537830.0, + "step": 9790 + }, + { + "epoch": 1.2455158376796844, + "ewc_loss": 0.03910328820347786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.95516440726351e-05, + "grad_norm": 27.1464900970459, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8583898544311523, + "num_tokens": 373573975.0, + "step": 9791 + }, + { + "epoch": 1.245643047958275, + "ewc_loss": 0.03906591981649399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9532959413481876e-05, + "grad_norm": 27.187545776367188, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8432738184928894, + "num_tokens": 373612157.0, + "step": 9792 + }, + { + "epoch": 1.2457702582368655, + "ewc_loss": 0.038944970816373825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.947248529177159e-05, + "grad_norm": 26.93457794189453, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8844993114471436, + "num_tokens": 373645065.0, + "step": 9793 + }, + { + "epoch": 1.245897468515456, + "ewc_loss": 0.038977742195129395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9488870748318732e-05, + "grad_norm": 27.16012954711914, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8594295382499695, + "num_tokens": 373687658.0, + "step": 9794 + }, + { + "epoch": 1.2460246787940465, + "ewc_loss": 0.039123717695474625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9561859517125413e-05, + "grad_norm": 26.982114791870117, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8660858869552612, + "num_tokens": 373732262.0, + "step": 9795 + }, + { + "epoch": 1.246151889072637, + "ewc_loss": 0.03899935632944107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9499677364365198e-05, 
+ "grad_norm": 27.03545570373535, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8457607626914978, + "num_tokens": 373772410.0, + "step": 9796 + }, + { + "epoch": 1.2462790993512276, + "ewc_loss": 0.03908265382051468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9541326764738187e-05, + "grad_norm": 27.06629180908203, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8650968074798584, + "num_tokens": 373808773.0, + "step": 9797 + }, + { + "epoch": 1.2464063096298181, + "ewc_loss": 0.03904031589627266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.952015736605972e-05, + "grad_norm": 27.054523468017578, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8600131273269653, + "num_tokens": 373848945.0, + "step": 9798 + }, + { + "epoch": 1.2465335199084087, + "ewc_loss": 0.039130546152591705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.956527376023587e-05, + "grad_norm": 27.196508407592773, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8585799932479858, + "num_tokens": 373892716.0, + "step": 9799 + }, + { + "epoch": 1.2466607301869992, + "ewc_loss": 0.03900183364748955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9500916096149012e-05, + "grad_norm": 26.9327392578125, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.879806399345398, + "num_tokens": 373929443.0, + "step": 9800 + }, + { + "epoch": 1.2467879404655897, + "ewc_loss": 0.038949329406023026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.947466444107704e-05, + "grad_norm": 27.11699104309082, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8652785420417786, + "num_tokens": 373968882.0, + "step": 9801 + }, + { + "epoch": 1.2469151507441802, + "ewc_loss": 0.03907451033592224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.953725586645305e-05, + "grad_norm": 27.042903900146484, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8773380517959595, 
+ "num_tokens": 374009265.0, + "step": 9802 + }, + { + "epoch": 1.2470423610227706, + "ewc_loss": 0.03903281316161156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.951640660990961e-05, + "grad_norm": 27.091463088989258, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8494192361831665, + "num_tokens": 374047152.0, + "step": 9803 + }, + { + "epoch": 1.247169571301361, + "ewc_loss": 0.03911223262548447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.955611696757842e-05, + "grad_norm": 26.956981658935547, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8600355386734009, + "num_tokens": 374083127.0, + "step": 9804 + }, + { + "epoch": 1.2472967815799516, + "ewc_loss": 0.03907826170325279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9539131244528107e-05, + "grad_norm": 27.1927433013916, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8732022047042847, + "num_tokens": 374127086.0, + "step": 9805 + }, + { + "epoch": 1.2474239918585421, + "ewc_loss": 0.039047930389642715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9523964510881342e-05, + "grad_norm": 26.977907180786133, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8590302467346191, + "num_tokens": 374163879.0, + "step": 9806 + }, + { + "epoch": 1.2475512021371327, + "ewc_loss": 0.038960348814725876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.948017415998038e-05, + "grad_norm": 26.96378517150879, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.868980884552002, + "num_tokens": 374206639.0, + "step": 9807 + }, + { + "epoch": 1.2476784124157232, + "ewc_loss": 0.03903770446777344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9518851331667975e-05, + "grad_norm": 27.00202178955078, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8620499968528748, + "num_tokens": 374244106.0, + "step": 9808 + }, + { + "epoch": 1.2478056226943137, + "ewc_loss": 0.03905189782381058, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.952594902832061e-05, + "grad_norm": 27.01898193359375, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8606240749359131, + "num_tokens": 374285776.0, + "step": 9809 + }, + { + "epoch": 1.2479328329729042, + "ewc_loss": 0.03914889320731163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.957444692379795e-05, + "grad_norm": 27.085783004760742, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8739396929740906, + "num_tokens": 374323036.0, + "step": 9810 + }, + { + "epoch": 1.2480600432514948, + "ewc_loss": 0.038955774158239365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9477887690300122e-05, + "grad_norm": 26.77906036376953, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8705702424049377, + "num_tokens": 374360204.0, + "step": 9811 + }, + { + "epoch": 1.2481872535300853, + "ewc_loss": 0.039132945239543915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9566472474252805e-05, + "grad_norm": 27.0975284576416, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8746116161346436, + "num_tokens": 374394961.0, + "step": 9812 + }, + { + "epoch": 1.2483144638086758, + "ewc_loss": 0.03917529806494713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9587649148888886e-05, + "grad_norm": 26.933408737182617, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8665704727172852, + "num_tokens": 374436142.0, + "step": 9813 + }, + { + "epoch": 1.2484416740872661, + "ewc_loss": 0.03908294066786766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9541470464901067e-05, + "grad_norm": 27.097200393676758, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8610410094261169, + "num_tokens": 374474263.0, + "step": 9814 + }, + { + "epoch": 1.2485688843658567, + "ewc_loss": 0.03914591670036316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.957295899046585e-05, + "grad_norm": 27.075071334838867, + 
"learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.858440637588501, + "num_tokens": 374510250.0, + "step": 9815 + }, + { + "epoch": 1.2486960946444472, + "ewc_loss": 0.03906065598130226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9530327335814945e-05, + "grad_norm": 26.987462997436523, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8826413154602051, + "num_tokens": 374549579.0, + "step": 9816 + }, + { + "epoch": 1.2488233049230377, + "ewc_loss": 0.03914467990398407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9572340534068644e-05, + "grad_norm": 27.089603424072266, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8634477257728577, + "num_tokens": 374588631.0, + "step": 9817 + }, + { + "epoch": 1.2489505152016283, + "ewc_loss": 0.03912518918514252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9562594388844445e-05, + "grad_norm": 27.026992797851562, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8605650663375854, + "num_tokens": 374627092.0, + "step": 9818 + }, + { + "epoch": 1.2490777254802188, + "ewc_loss": 0.03911414369940758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.955707193701528e-05, + "grad_norm": 27.062625885009766, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8707994222640991, + "num_tokens": 374657797.0, + "step": 9819 + }, + { + "epoch": 1.2492049357588093, + "ewc_loss": 0.03911139816045761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9555698600015603e-05, + "grad_norm": 27.139524459838867, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8696709871292114, + "num_tokens": 374694011.0, + "step": 9820 + }, + { + "epoch": 1.2493321460373998, + "ewc_loss": 0.03913057968020439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.95652901311405e-05, + "grad_norm": 27.04583740234375, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8666896820068359, + "num_tokens": 374735127.0, + 
"step": 9821 + }, + { + "epoch": 1.2494593563159904, + "ewc_loss": 0.03913766145706177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.95688298845198e-05, + "grad_norm": 26.977468490600586, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.862099289894104, + "num_tokens": 374774170.0, + "step": 9822 + }, + { + "epoch": 1.249586566594581, + "ewc_loss": 0.039127200841903687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9563600289984606e-05, + "grad_norm": 27.069528579711914, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8648421764373779, + "num_tokens": 374809162.0, + "step": 9823 + }, + { + "epoch": 1.2497137768731714, + "ewc_loss": 0.039146166294813156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.957308268174529e-05, + "grad_norm": 27.01338768005371, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8579089641571045, + "num_tokens": 374847974.0, + "step": 9824 + }, + { + "epoch": 1.249840987151762, + "ewc_loss": 0.039124879986047745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9562439774745144e-05, + "grad_norm": 27.01075553894043, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8807356357574463, + "num_tokens": 374894911.0, + "step": 9825 + }, + { + "epoch": 1.2499681974303525, + "ewc_loss": 0.03913148120045662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.956574124051258e-05, + "grad_norm": 27.005748748779297, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8587942123413086, + "num_tokens": 374933534.0, + "step": 9826 + }, + { + "epoch": 1.250095407708943, + "ewc_loss": 0.039172835648059845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9586417693062685e-05, + "grad_norm": 26.996932983398438, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8530533909797668, + "num_tokens": 374968183.0, + "step": 9827 + }, + { + "epoch": 1.2502226179875333, + "ewc_loss": 0.039236195385456085, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.961809721251484e-05, + "grad_norm": 27.058815002441406, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8577287197113037, + "num_tokens": 375005973.0, + "step": 9828 + }, + { + "epoch": 1.2503498282661238, + "ewc_loss": 0.03919227793812752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9596138372435234e-05, + "grad_norm": 27.085559844970703, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8589098453521729, + "num_tokens": 375043446.0, + "step": 9829 + }, + { + "epoch": 1.2504770385447144, + "ewc_loss": 0.039179567247629166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9589782823459245e-05, + "grad_norm": 27.0987491607666, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8677796721458435, + "num_tokens": 375081518.0, + "step": 9830 + }, + { + "epoch": 1.250604248823305, + "ewc_loss": 0.03917248919606209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.958624488906935e-05, + "grad_norm": 27.089536666870117, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8579585552215576, + "num_tokens": 375117767.0, + "step": 9831 + }, + { + "epoch": 1.2507314591018954, + "ewc_loss": 0.03909854218363762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9549270291463472e-05, + "grad_norm": 27.055932998657227, + "learning_rate": 1e-06, + "loss": 0.5103, + "mean_token_accuracy": 0.841080904006958, + "num_tokens": 375152061.0, + "step": 9832 + }, + { + "epoch": 1.250858669380486, + "ewc_loss": 0.03913741186261177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.956870619324036e-05, + "grad_norm": 27.19174575805664, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8574701547622681, + "num_tokens": 375195677.0, + "step": 9833 + }, + { + "epoch": 1.2509858796590765, + "ewc_loss": 0.039154909551143646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9577455532271415e-05, + "grad_norm": 27.083616256713867, + "learning_rate": 1e-06, + "loss": 0.4481, + 
"mean_token_accuracy": 0.8568065166473389, + "num_tokens": 375230319.0, + "step": 9834 + }, + { + "epoch": 1.251113089937667, + "ewc_loss": 0.03907331824302673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.953665923792869e-05, + "grad_norm": 27.070180892944336, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8691848516464233, + "num_tokens": 375268970.0, + "step": 9835 + }, + { + "epoch": 1.2512403002162575, + "ewc_loss": 0.039186861366033554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9593429897213355e-05, + "grad_norm": 27.296945571899414, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8709378838539124, + "num_tokens": 375303661.0, + "step": 9836 + }, + { + "epoch": 1.2513675104948478, + "ewc_loss": 0.03916405513882637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9582026652642526e-05, + "grad_norm": 27.112918853759766, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8721573352813721, + "num_tokens": 375343278.0, + "step": 9837 + }, + { + "epoch": 1.2514947207734384, + "ewc_loss": 0.03912649676203728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9563249225029722e-05, + "grad_norm": 27.4440860748291, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8717796206474304, + "num_tokens": 375378574.0, + "step": 9838 + }, + { + "epoch": 1.251621931052029, + "ewc_loss": 0.03916154056787491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9580769730964676e-05, + "grad_norm": 27.23113441467285, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8590595722198486, + "num_tokens": 375412781.0, + "step": 9839 + }, + { + "epoch": 1.2517491413306194, + "ewc_loss": 0.03900273144245148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9501365386531688e-05, + "grad_norm": 27.155536651611328, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8758492469787598, + "num_tokens": 375450431.0, + "step": 9840 + }, + { + "epoch": 
1.25187635160921, + "ewc_loss": 0.03905307129025459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9526536561897956e-05, + "grad_norm": 27.386585235595703, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8587051033973694, + "num_tokens": 375490692.0, + "step": 9841 + }, + { + "epoch": 1.2520035618878005, + "ewc_loss": 0.03903678432106972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.951839294633828e-05, + "grad_norm": 26.9437198638916, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8701821565628052, + "num_tokens": 375535278.0, + "step": 9842 + }, + { + "epoch": 1.252130772166391, + "ewc_loss": 0.03897496685385704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9487482859403826e-05, + "grad_norm": 27.416545867919922, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8585638999938965, + "num_tokens": 375575859.0, + "step": 9843 + }, + { + "epoch": 1.2522579824449815, + "ewc_loss": 0.03913100063800812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.956550113391131e-05, + "grad_norm": 27.03191375732422, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8564476370811462, + "num_tokens": 375611084.0, + "step": 9844 + }, + { + "epoch": 1.252385192723572, + "ewc_loss": 0.03891116753220558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9455583242233843e-05, + "grad_norm": 27.160011291503906, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8741805553436279, + "num_tokens": 375644926.0, + "step": 9845 + }, + { + "epoch": 1.2525124030021626, + "ewc_loss": 0.03910339996218681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.955170046130661e-05, + "grad_norm": 26.978452682495117, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8730877637863159, + "num_tokens": 375686970.0, + "step": 9846 + }, + { + "epoch": 1.2526396132807531, + "ewc_loss": 0.03907083347439766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9535416868166067e-05, 
+ "grad_norm": 27.224639892578125, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8757233619689941, + "num_tokens": 375727372.0, + "step": 9847 + }, + { + "epoch": 1.2527668235593437, + "ewc_loss": 0.03916194662451744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9580973457777873e-05, + "grad_norm": 27.072397232055664, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8530815839767456, + "num_tokens": 375763568.0, + "step": 9848 + }, + { + "epoch": 1.2528940338379342, + "ewc_loss": 0.03908548876643181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.954274375748355e-05, + "grad_norm": 27.050260543823242, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.848513662815094, + "num_tokens": 375797954.0, + "step": 9849 + }, + { + "epoch": 1.2530212441165247, + "ewc_loss": 0.039103128015995026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9551564037101343e-05, + "grad_norm": 26.965730667114258, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8619712591171265, + "num_tokens": 375836362.0, + "step": 9850 + }, + { + "epoch": 1.2531484543951152, + "ewc_loss": 0.039180293679237366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9590146621339954e-05, + "grad_norm": 27.07314682006836, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8596705794334412, + "num_tokens": 375879403.0, + "step": 9851 + }, + { + "epoch": 1.2532756646737058, + "ewc_loss": 0.03920931741595268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9604658518801443e-05, + "grad_norm": 27.128921508789062, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8677435517311096, + "num_tokens": 375916728.0, + "step": 9852 + }, + { + "epoch": 1.253402874952296, + "ewc_loss": 0.03917602077126503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.958801112778019e-05, + "grad_norm": 27.026926040649414, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 
0.8588536977767944, + "num_tokens": 375951743.0, + "step": 9853 + }, + { + "epoch": 1.2535300852308866, + "ewc_loss": 0.039181239902973175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9590619558584876e-05, + "grad_norm": 27.096433639526367, + "learning_rate": 1e-06, + "loss": 0.5154, + "mean_token_accuracy": 0.8390440344810486, + "num_tokens": 375988872.0, + "step": 9854 + }, + { + "epoch": 1.2536572955094771, + "ewc_loss": 0.03919393941760063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9596969650592655e-05, + "grad_norm": 26.998628616333008, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8541103005409241, + "num_tokens": 376030388.0, + "step": 9855 + }, + { + "epoch": 1.2537845057880677, + "ewc_loss": 0.03919156268239021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.959578185051214e-05, + "grad_norm": 27.052709579467773, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8735025525093079, + "num_tokens": 376062975.0, + "step": 9856 + }, + { + "epoch": 1.2539117160666582, + "ewc_loss": 0.03928378224372864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9641891412902623e-05, + "grad_norm": 27.03823471069336, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8573426604270935, + "num_tokens": 376101873.0, + "step": 9857 + }, + { + "epoch": 1.2540389263452487, + "ewc_loss": 0.03926778584718704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9633893316495232e-05, + "grad_norm": 27.076520919799805, + "learning_rate": 1e-06, + "loss": 0.5037, + "mean_token_accuracy": 0.8429415822029114, + "num_tokens": 376138774.0, + "step": 9858 + }, + { + "epoch": 1.2541661366238392, + "ewc_loss": 0.0393226183950901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9661309124785475e-05, + "grad_norm": 26.993305206298828, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.855657160282135, + "num_tokens": 376172357.0, + "step": 9859 + }, + { + "epoch": 1.2542933469024298, + "ewc_loss": 
0.03922343626618385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9611718016676605e-05, + "grad_norm": 27.058530807495117, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8540793657302856, + "num_tokens": 376212977.0, + "step": 9860 + }, + { + "epoch": 1.2544205571810203, + "ewc_loss": 0.039343152195215225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9671575500979088e-05, + "grad_norm": 27.10732650756836, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8539380431175232, + "num_tokens": 376256989.0, + "step": 9861 + }, + { + "epoch": 1.2545477674596106, + "ewc_loss": 0.03925103321671486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9625516870291904e-05, + "grad_norm": 26.9962215423584, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8684378862380981, + "num_tokens": 376300130.0, + "step": 9862 + }, + { + "epoch": 1.2546749777382011, + "ewc_loss": 0.039413630962371826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9706814782693982e-05, + "grad_norm": 27.127357482910156, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8675673007965088, + "num_tokens": 376337475.0, + "step": 9863 + }, + { + "epoch": 1.2548021880167917, + "ewc_loss": 0.039283595979213715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9641798644443043e-05, + "grad_norm": 27.102495193481445, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.854823648929596, + "num_tokens": 376376485.0, + "step": 9864 + }, + { + "epoch": 1.2549293982953822, + "ewc_loss": 0.03927110135555267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9635550415841863e-05, + "grad_norm": 27.033748626708984, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8705534934997559, + "num_tokens": 376413393.0, + "step": 9865 + }, + { + "epoch": 1.2550566085739727, + "ewc_loss": 0.039297912269830704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9648956367745996e-05, + "grad_norm": 
27.162761688232422, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8546655774116516, + "num_tokens": 376448192.0, + "step": 9866 + }, + { + "epoch": 1.2551838188525632, + "ewc_loss": 0.03925709053874016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9628545487648807e-05, + "grad_norm": 26.962167739868164, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8675057888031006, + "num_tokens": 376489382.0, + "step": 9867 + }, + { + "epoch": 1.2553110291311538, + "ewc_loss": 0.03925345838069916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9626728317234665e-05, + "grad_norm": 27.06195831298828, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8684419989585876, + "num_tokens": 376527221.0, + "step": 9868 + }, + { + "epoch": 1.2554382394097443, + "ewc_loss": 0.03929083049297333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9645414795377292e-05, + "grad_norm": 27.08512306213379, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8598819971084595, + "num_tokens": 376567045.0, + "step": 9869 + }, + { + "epoch": 1.2555654496883348, + "ewc_loss": 0.039274703711271286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9637351215351373e-05, + "grad_norm": 27.0844669342041, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8714848756790161, + "num_tokens": 376607039.0, + "step": 9870 + }, + { + "epoch": 1.2556926599669254, + "ewc_loss": 0.039241764694452286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.962088208529167e-05, + "grad_norm": 27.036331176757812, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8661150932312012, + "num_tokens": 376643220.0, + "step": 9871 + }, + { + "epoch": 1.255819870245516, + "ewc_loss": 0.039281368255615234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.964068360393867e-05, + "grad_norm": 27.156152725219727, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8838696479797363, + 
"num_tokens": 376681417.0, + "step": 9872 + }, + { + "epoch": 1.2559470805241064, + "ewc_loss": 0.03927820920944214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.963910472113639e-05, + "grad_norm": 27.05879020690918, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8819645643234253, + "num_tokens": 376722836.0, + "step": 9873 + }, + { + "epoch": 1.256074290802697, + "ewc_loss": 0.03916546329855919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.95827324205311e-05, + "grad_norm": 27.00950050354004, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8576368093490601, + "num_tokens": 376763194.0, + "step": 9874 + }, + { + "epoch": 1.2562015010812875, + "ewc_loss": 0.03929746523499489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.964873263204936e-05, + "grad_norm": 27.099824905395508, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8599590063095093, + "num_tokens": 376803701.0, + "step": 9875 + }, + { + "epoch": 1.256328711359878, + "ewc_loss": 0.03925109654664993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9625547793111764e-05, + "grad_norm": 27.03823471069336, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.872639000415802, + "num_tokens": 376835296.0, + "step": 9876 + }, + { + "epoch": 1.2564559216384683, + "ewc_loss": 0.039218124002218246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9609062292147428e-05, + "grad_norm": 27.043960571289062, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8671919107437134, + "num_tokens": 376876317.0, + "step": 9877 + }, + { + "epoch": 1.2565831319170588, + "ewc_loss": 0.039201054722070694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.960052759386599e-05, + "grad_norm": 27.043048858642578, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.862583339214325, + "num_tokens": 376916403.0, + "step": 9878 + }, + { + "epoch": 1.2567103421956494, + "ewc_loss": 0.03919012099504471, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9595059711718932e-05, + "grad_norm": 27.003562927246094, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8736854195594788, + "num_tokens": 376956079.0, + "step": 9879 + }, + { + "epoch": 1.25683755247424, + "ewc_loss": 0.03918685019016266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9593424440245144e-05, + "grad_norm": 27.03899383544922, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8536605834960938, + "num_tokens": 376991618.0, + "step": 9880 + }, + { + "epoch": 1.2569647627528304, + "ewc_loss": 0.03922728821635246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.961364432645496e-05, + "grad_norm": 26.938772201538086, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8710623979568481, + "num_tokens": 377033964.0, + "step": 9881 + }, + { + "epoch": 1.257091973031421, + "ewc_loss": 0.039146069437265396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.95730353880208e-05, + "grad_norm": 26.997602462768555, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8720977902412415, + "num_tokens": 377070091.0, + "step": 9882 + }, + { + "epoch": 1.2572191833100115, + "ewc_loss": 0.03929698094725609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9648490706458688e-05, + "grad_norm": 26.922666549682617, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8722752332687378, + "num_tokens": 377111185.0, + "step": 9883 + }, + { + "epoch": 1.257346393588602, + "ewc_loss": 0.03920355439186096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9601777239586227e-05, + "grad_norm": 27.058610916137695, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8693299293518066, + "num_tokens": 377147637.0, + "step": 9884 + }, + { + "epoch": 1.2574736038671925, + "ewc_loss": 0.0393235869705677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966179297596682e-05, + "grad_norm": 27.03870964050293, + "learning_rate": 
1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8641178607940674, + "num_tokens": 377181918.0, + "step": 9885 + }, + { + "epoch": 1.2576008141457828, + "ewc_loss": 0.03924372047185898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9621860701590776e-05, + "grad_norm": 27.009313583374023, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8576865792274475, + "num_tokens": 377222900.0, + "step": 9886 + }, + { + "epoch": 1.2577280244243734, + "ewc_loss": 0.03929177671670914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9645887732622214e-05, + "grad_norm": 27.033206939697266, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8785872459411621, + "num_tokens": 377263107.0, + "step": 9887 + }, + { + "epoch": 1.257855234702964, + "ewc_loss": 0.039257172495126724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9628585505415685e-05, + "grad_norm": 27.029739379882812, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8797590732574463, + "num_tokens": 377301384.0, + "step": 9888 + }, + { + "epoch": 1.2579824449815544, + "ewc_loss": 0.0393143892288208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9657194570754655e-05, + "grad_norm": 26.98842430114746, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8583004474639893, + "num_tokens": 377339865.0, + "step": 9889 + }, + { + "epoch": 1.258109655260145, + "ewc_loss": 0.039219215512275696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.960960798896849e-05, + "grad_norm": 27.041034698486328, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.873154878616333, + "num_tokens": 377381582.0, + "step": 9890 + }, + { + "epoch": 1.2582368655387355, + "ewc_loss": 0.039321739226579666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9660868929349817e-05, + "grad_norm": 26.97676658630371, + "learning_rate": 1e-06, + "loss": 0.5107, + "mean_token_accuracy": 0.8413163423538208, + "num_tokens": 377425548.0, + "step": 9891 + }, 
+ { + "epoch": 1.258364075817326, + "ewc_loss": 0.03926192224025726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9630961105576716e-05, + "grad_norm": 27.10379981994629, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8640770316123962, + "num_tokens": 377462298.0, + "step": 9892 + }, + { + "epoch": 1.2584912860959165, + "ewc_loss": 0.03924454748630524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.962227361218538e-05, + "grad_norm": 26.954803466796875, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8568594455718994, + "num_tokens": 377503873.0, + "step": 9893 + }, + { + "epoch": 1.258618496374507, + "ewc_loss": 0.0392620749771595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9631037503131665e-05, + "grad_norm": 27.080053329467773, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8800041675567627, + "num_tokens": 377540497.0, + "step": 9894 + }, + { + "epoch": 1.2587457066530976, + "ewc_loss": 0.03935234248638153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.967617208720185e-05, + "grad_norm": 27.07514190673828, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8644083738327026, + "num_tokens": 377582242.0, + "step": 9895 + }, + { + "epoch": 1.2588729169316881, + "ewc_loss": 0.03920116648077965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.96005839825375e-05, + "grad_norm": 27.031795501708984, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8807420134544373, + "num_tokens": 377614972.0, + "step": 9896 + }, + { + "epoch": 1.2590001272102787, + "ewc_loss": 0.039278171956539154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9639086531242356e-05, + "grad_norm": 27.0770263671875, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8618605136871338, + "num_tokens": 377645392.0, + "step": 9897 + }, + { + "epoch": 1.2591273374888692, + "ewc_loss": 0.039197634905576706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9598817743826658e-05, + "grad_norm": 27.149892807006836, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8725311160087585, + "num_tokens": 377680317.0, + "step": 9898 + }, + { + "epoch": 1.2592545477674597, + "ewc_loss": 0.03931368514895439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9656841686810367e-05, + "grad_norm": 27.173097610473633, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8646796345710754, + "num_tokens": 377714879.0, + "step": 9899 + }, + { + "epoch": 1.2593817580460502, + "ewc_loss": 0.03915708512067795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9578543287934735e-05, + "grad_norm": 26.967662811279297, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8498114347457886, + "num_tokens": 377751404.0, + "step": 9900 + }, + { + "epoch": 1.2595089683246408, + "ewc_loss": 0.03924524039030075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.962262103916146e-05, + "grad_norm": 27.105937957763672, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8682945370674133, + "num_tokens": 377788354.0, + "step": 9901 + }, + { + "epoch": 1.259636178603231, + "ewc_loss": 0.039199840277433395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9599920051405206e-05, + "grad_norm": 27.054622650146484, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.85316002368927, + "num_tokens": 377827259.0, + "step": 9902 + }, + { + "epoch": 1.2597633888818216, + "ewc_loss": 0.03930457681417465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.965228875633329e-05, + "grad_norm": 26.958126068115234, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8557875156402588, + "num_tokens": 377872477.0, + "step": 9903 + }, + { + "epoch": 1.2598905991604121, + "ewc_loss": 0.03922637179493904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9613185941125266e-05, + "grad_norm": 27.043432235717773, + "learning_rate": 1e-06, + "loss": 0.4732, + 
"mean_token_accuracy": 0.8576213121414185, + "num_tokens": 377911159.0, + "step": 9904 + }, + { + "epoch": 1.2600178094390027, + "ewc_loss": 0.039351657032966614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.967582829820458e-05, + "grad_norm": 27.007301330566406, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8681565523147583, + "num_tokens": 377951919.0, + "step": 9905 + }, + { + "epoch": 1.2601450197175932, + "ewc_loss": 0.03930957615375519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9654788047773764e-05, + "grad_norm": 27.08411979675293, + "learning_rate": 1e-06, + "loss": 0.4985, + "mean_token_accuracy": 0.846685528755188, + "num_tokens": 377997567.0, + "step": 9906 + }, + { + "epoch": 1.2602722299961837, + "ewc_loss": 0.039333149790763855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966657509910874e-05, + "grad_norm": 27.07878875732422, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8630087375640869, + "num_tokens": 378033811.0, + "step": 9907 + }, + { + "epoch": 1.2603994402747742, + "ewc_loss": 0.03937634080648422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9688170141307637e-05, + "grad_norm": 27.16758155822754, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8533360362052917, + "num_tokens": 378070136.0, + "step": 9908 + }, + { + "epoch": 1.2605266505533648, + "ewc_loss": 0.0393076129257679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.965380579349585e-05, + "grad_norm": 27.076696395874023, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8621597290039062, + "num_tokens": 378106902.0, + "step": 9909 + }, + { + "epoch": 1.2606538608319553, + "ewc_loss": 0.039358120411634445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.967906064237468e-05, + "grad_norm": 27.03951644897461, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.868253767490387, + "num_tokens": 378144638.0, + "step": 9910 + }, + { + "epoch": 
1.2607810711105456, + "ewc_loss": 0.03934823349118233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9674116629175842e-05, + "grad_norm": 27.15566635131836, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.873054027557373, + "num_tokens": 378179059.0, + "step": 9911 + }, + { + "epoch": 1.2609082813891361, + "ewc_loss": 0.0393524207174778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9676210285979323e-05, + "grad_norm": 27.1099796295166, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8512738347053528, + "num_tokens": 378210346.0, + "step": 9912 + }, + { + "epoch": 1.2610354916677267, + "ewc_loss": 0.03927714750170708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9638573576230556e-05, + "grad_norm": 27.124588012695312, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8699375987052917, + "num_tokens": 378242687.0, + "step": 9913 + }, + { + "epoch": 1.2611627019463172, + "ewc_loss": 0.03934831917285919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9674160284921527e-05, + "grad_norm": 27.01809310913086, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8702014684677124, + "num_tokens": 378279421.0, + "step": 9914 + }, + { + "epoch": 1.2612899122249077, + "ewc_loss": 0.03934015706181526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.967007847269997e-05, + "grad_norm": 27.083391189575195, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.859442949295044, + "num_tokens": 378318133.0, + "step": 9915 + }, + { + "epoch": 1.2614171225034982, + "ewc_loss": 0.039423588663339615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.971179517568089e-05, + "grad_norm": 27.096647262573242, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8460408449172974, + "num_tokens": 378362987.0, + "step": 9916 + }, + { + "epoch": 1.2615443327820888, + "ewc_loss": 0.03930206969380379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.965103547263425e-05, + "grad_norm": 27.11727523803711, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8800188302993774, + "num_tokens": 378396197.0, + "step": 9917 + }, + { + "epoch": 1.2616715430606793, + "ewc_loss": 0.03931545093655586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.965772571566049e-05, + "grad_norm": 27.027997970581055, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8728959560394287, + "num_tokens": 378433691.0, + "step": 9918 + }, + { + "epoch": 1.2617987533392698, + "ewc_loss": 0.0393691211938858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9684561266331002e-05, + "grad_norm": 27.090652465820312, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8679782152175903, + "num_tokens": 378470277.0, + "step": 9919 + }, + { + "epoch": 1.2619259636178604, + "ewc_loss": 0.03939621150493622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9698105461429805e-05, + "grad_norm": 27.053468704223633, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8580433130264282, + "num_tokens": 378510500.0, + "step": 9920 + }, + { + "epoch": 1.2620531738964509, + "ewc_loss": 0.03940446674823761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.970223274838645e-05, + "grad_norm": 27.12616729736328, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8630216121673584, + "num_tokens": 378552893.0, + "step": 9921 + }, + { + "epoch": 1.2621803841750414, + "ewc_loss": 0.03930339962244034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9651699403766543e-05, + "grad_norm": 26.918363571166992, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8706074357032776, + "num_tokens": 378590917.0, + "step": 9922 + }, + { + "epoch": 1.262307594453632, + "ewc_loss": 0.03940902650356293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9704513761098497e-05, + "grad_norm": 27.120935440063477, + "learning_rate": 1e-06, + "loss": 0.408, + 
"mean_token_accuracy": 0.8723475337028503, + "num_tokens": 378637764.0, + "step": 9923 + }, + { + "epoch": 1.2624348047322225, + "ewc_loss": 0.039465732872486115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9732866348931566e-05, + "grad_norm": 27.095407485961914, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8608162999153137, + "num_tokens": 378675497.0, + "step": 9924 + }, + { + "epoch": 1.262562015010813, + "ewc_loss": 0.03934090957045555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9670455003506504e-05, + "grad_norm": 27.080793380737305, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8562476634979248, + "num_tokens": 378710470.0, + "step": 9925 + }, + { + "epoch": 1.2626892252894033, + "ewc_loss": 0.03940572589635849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9702862118720077e-05, + "grad_norm": 27.127294540405273, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8608013987541199, + "num_tokens": 378758116.0, + "step": 9926 + }, + { + "epoch": 1.2628164355679938, + "ewc_loss": 0.03929979354143143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.964989678526763e-05, + "grad_norm": 27.04596710205078, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8613002896308899, + "num_tokens": 378798166.0, + "step": 9927 + }, + { + "epoch": 1.2629436458465844, + "ewc_loss": 0.03930673003196716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.965336559806019e-05, + "grad_norm": 27.046537399291992, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8791972398757935, + "num_tokens": 378839899.0, + "step": 9928 + }, + { + "epoch": 1.263070856125175, + "ewc_loss": 0.03933437913656235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966718991752714e-05, + "grad_norm": 27.036962509155273, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8648406267166138, + "num_tokens": 378886022.0, + "step": 9929 + }, + { + "epoch": 
1.2631980664037654, + "ewc_loss": 0.039356134831905365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9678067474160343e-05, + "grad_norm": 27.10657501220703, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8550230860710144, + "num_tokens": 378925097.0, + "step": 9930 + }, + { + "epoch": 1.263325276682356, + "ewc_loss": 0.039323825389146805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9661913029267453e-05, + "grad_norm": 27.068252563476562, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8763935565948486, + "num_tokens": 378965242.0, + "step": 9931 + }, + { + "epoch": 1.2634524869609465, + "ewc_loss": 0.03932110220193863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9660550606204197e-05, + "grad_norm": 27.13400650024414, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.842520534992218, + "num_tokens": 379005932.0, + "step": 9932 + }, + { + "epoch": 1.263579697239537, + "ewc_loss": 0.03933107852935791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9665540094138123e-05, + "grad_norm": 27.06125259399414, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8617871999740601, + "num_tokens": 379045817.0, + "step": 9933 + }, + { + "epoch": 1.2637069075181275, + "ewc_loss": 0.0392778143286705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9638906451291405e-05, + "grad_norm": 27.051353454589844, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8576025366783142, + "num_tokens": 379086481.0, + "step": 9934 + }, + { + "epoch": 1.2638341177967178, + "ewc_loss": 0.03927353024482727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.963676550076343e-05, + "grad_norm": 27.14042091369629, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8467382788658142, + "num_tokens": 379129782.0, + "step": 9935 + }, + { + "epoch": 1.2639613280753084, + "ewc_loss": 0.03927481919527054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9637409423012286e-05, + "grad_norm": 27.064756393432617, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8622393608093262, + "num_tokens": 379171525.0, + "step": 9936 + }, + { + "epoch": 1.264088538353899, + "ewc_loss": 0.03925998508930206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.962999340321403e-05, + "grad_norm": 27.032634735107422, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8573558330535889, + "num_tokens": 379213887.0, + "step": 9937 + }, + { + "epoch": 1.2642157486324894, + "ewc_loss": 0.039222601801157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9611301468103193e-05, + "grad_norm": 27.115713119506836, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8646417856216431, + "num_tokens": 379252634.0, + "step": 9938 + }, + { + "epoch": 1.26434295891108, + "ewc_loss": 0.03927432745695114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9637163859442808e-05, + "grad_norm": 27.000764846801758, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8631376028060913, + "num_tokens": 379287402.0, + "step": 9939 + }, + { + "epoch": 1.2644701691896705, + "ewc_loss": 0.03924631327390671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.96231558220461e-05, + "grad_norm": 27.10219955444336, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8704507350921631, + "num_tokens": 379325592.0, + "step": 9940 + }, + { + "epoch": 1.264597379468261, + "ewc_loss": 0.03927141800522804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.963570866791997e-05, + "grad_norm": 27.116565704345703, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8520622253417969, + "num_tokens": 379359010.0, + "step": 9941 + }, + { + "epoch": 1.2647245897468515, + "ewc_loss": 0.03926989063620567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9634944692370482e-05, + "grad_norm": 27.10388946533203, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 
0.8563694953918457, + "num_tokens": 379399783.0, + "step": 9942 + }, + { + "epoch": 1.264851800025442, + "ewc_loss": 0.039319004863500595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9659502868307754e-05, + "grad_norm": 27.070085525512695, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8668827414512634, + "num_tokens": 379442207.0, + "step": 9943 + }, + { + "epoch": 1.2649790103040326, + "ewc_loss": 0.03930018097162247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9650091417133808e-05, + "grad_norm": 26.958553314208984, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8636695742607117, + "num_tokens": 379479561.0, + "step": 9944 + }, + { + "epoch": 1.2651062205826231, + "ewc_loss": 0.03934384882450104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.967192474694457e-05, + "grad_norm": 27.18103790283203, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8692939281463623, + "num_tokens": 379516413.0, + "step": 9945 + }, + { + "epoch": 1.2652334308612136, + "ewc_loss": 0.039290059357881546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.964502916962374e-05, + "grad_norm": 26.981412887573242, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8539202213287354, + "num_tokens": 379557993.0, + "step": 9946 + }, + { + "epoch": 1.2653606411398042, + "ewc_loss": 0.03931179642677307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9655897631309927e-05, + "grad_norm": 27.097896575927734, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8461857438087463, + "num_tokens": 379594028.0, + "step": 9947 + }, + { + "epoch": 1.2654878514183947, + "ewc_loss": 0.039438266307115555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9719132978934795e-05, + "grad_norm": 27.110851287841797, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8576955199241638, + "num_tokens": 379631235.0, + "step": 9948 + }, + { + "epoch": 1.2656150616969852, + 
"ewc_loss": 0.03930912911891937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9654564312077127e-05, + "grad_norm": 27.120927810668945, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8521223068237305, + "num_tokens": 379668714.0, + "step": 9949 + }, + { + "epoch": 1.2657422719755758, + "ewc_loss": 0.03941214084625244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9706070816027932e-05, + "grad_norm": 27.07841682434082, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8691067695617676, + "num_tokens": 379710233.0, + "step": 9950 + }, + { + "epoch": 1.265869482254166, + "ewc_loss": 0.03932397440075874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9661987607832998e-05, + "grad_norm": 27.113466262817383, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.85981684923172, + "num_tokens": 379746208.0, + "step": 9951 + }, + { + "epoch": 1.2659966925327566, + "ewc_loss": 0.03938284143805504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9691420675371774e-05, + "grad_norm": 27.057336807250977, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8729826211929321, + "num_tokens": 379796420.0, + "step": 9952 + }, + { + "epoch": 1.2661239028113471, + "ewc_loss": 0.0393342524766922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9667126252898015e-05, + "grad_norm": 27.193330764770508, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8656860589981079, + "num_tokens": 379830027.0, + "step": 9953 + }, + { + "epoch": 1.2662511130899377, + "ewc_loss": 0.039389800280332565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9694900402100757e-05, + "grad_norm": 27.084238052368164, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8642953634262085, + "num_tokens": 379871751.0, + "step": 9954 + }, + { + "epoch": 1.2663783233685282, + "ewc_loss": 0.03935069963335991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9675349903991446e-05, + "grad_norm": 
27.10451316833496, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8572281002998352, + "num_tokens": 379908255.0, + "step": 9955 + }, + { + "epoch": 1.2665055336471187, + "ewc_loss": 0.03942577913403511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9712890207301825e-05, + "grad_norm": 27.110811233520508, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8664012551307678, + "num_tokens": 379944502.0, + "step": 9956 + }, + { + "epoch": 1.2666327439257092, + "ewc_loss": 0.03936311975121498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.968155993381515e-05, + "grad_norm": 27.040544509887695, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8615254163742065, + "num_tokens": 379981929.0, + "step": 9957 + }, + { + "epoch": 1.2667599542042998, + "ewc_loss": 0.039374858140945435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9687429812620394e-05, + "grad_norm": 27.12053680419922, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8586280345916748, + "num_tokens": 380018876.0, + "step": 9958 + }, + { + "epoch": 1.2668871644828903, + "ewc_loss": 0.03942593187093735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9712966604856774e-05, + "grad_norm": 27.04975700378418, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8593354821205139, + "num_tokens": 380054153.0, + "step": 9959 + }, + { + "epoch": 1.2670143747614806, + "ewc_loss": 0.03934280201792717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9671400877996348e-05, + "grad_norm": 27.096132278442383, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8537927865982056, + "num_tokens": 380086550.0, + "step": 9960 + }, + { + "epoch": 1.2671415850400711, + "ewc_loss": 0.03948555886745453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.974277984118089e-05, + "grad_norm": 27.207820892333984, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8584244251251221, + 
"num_tokens": 380120308.0, + "step": 9961 + }, + { + "epoch": 1.2672687953186617, + "ewc_loss": 0.03931359574198723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9656798031064682e-05, + "grad_norm": 26.98682975769043, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8466109037399292, + "num_tokens": 380160204.0, + "step": 9962 + }, + { + "epoch": 1.2673960055972522, + "ewc_loss": 0.03931855037808418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.965927549463231e-05, + "grad_norm": 27.040029525756836, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.866536021232605, + "num_tokens": 380196687.0, + "step": 9963 + }, + { + "epoch": 1.2675232158758427, + "ewc_loss": 0.039451394230127335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.972569771169219e-05, + "grad_norm": 27.13324737548828, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8784000873565674, + "num_tokens": 380237451.0, + "step": 9964 + }, + { + "epoch": 1.2676504261544332, + "ewc_loss": 0.039406850934028625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9703426005435176e-05, + "grad_norm": 27.026226043701172, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8609384298324585, + "num_tokens": 380276631.0, + "step": 9965 + }, + { + "epoch": 1.2677776364330238, + "ewc_loss": 0.03939857706427574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.969928780454211e-05, + "grad_norm": 27.11429214477539, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8677250146865845, + "num_tokens": 380319432.0, + "step": 9966 + }, + { + "epoch": 1.2679048467116143, + "ewc_loss": 0.03945906087756157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9729530322365463e-05, + "grad_norm": 27.079870223999023, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8726733326911926, + "num_tokens": 380356666.0, + "step": 9967 + }, + { + "epoch": 1.2680320569902048, + "ewc_loss": 0.03939457982778549, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9697290554177016e-05, + "grad_norm": 27.065021514892578, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.873037576675415, + "num_tokens": 380390347.0, + "step": 9968 + }, + { + "epoch": 1.2681592672687954, + "ewc_loss": 0.03944665938615799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9723329387488775e-05, + "grad_norm": 27.142229080200195, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8657947778701782, + "num_tokens": 380424083.0, + "step": 9969 + }, + { + "epoch": 1.2682864775473859, + "ewc_loss": 0.03937271982431412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.968636024685111e-05, + "grad_norm": 26.942110061645508, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8587050437927246, + "num_tokens": 380471179.0, + "step": 9970 + }, + { + "epoch": 1.2684136878259764, + "ewc_loss": 0.039519619196653366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9759809219976887e-05, + "grad_norm": 27.27314567565918, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8650471568107605, + "num_tokens": 380510291.0, + "step": 9971 + }, + { + "epoch": 1.268540898104567, + "ewc_loss": 0.03946179524064064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.973089820239693e-05, + "grad_norm": 27.153045654296875, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8678606152534485, + "num_tokens": 380556297.0, + "step": 9972 + }, + { + "epoch": 1.2686681083831575, + "ewc_loss": 0.03937222436070442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9686112864292227e-05, + "grad_norm": 27.220945358276367, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.874397337436676, + "num_tokens": 380592113.0, + "step": 9973 + }, + { + "epoch": 1.268795318661748, + "ewc_loss": 0.03942418843507767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9712093489943072e-05, + "grad_norm": 27.229921340942383, + 
"learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.877484142780304, + "num_tokens": 380623942.0, + "step": 9974 + }, + { + "epoch": 1.2689225289403383, + "ewc_loss": 0.039363794028759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9681896446854807e-05, + "grad_norm": 27.14857292175293, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8572145700454712, + "num_tokens": 380662300.0, + "step": 9975 + }, + { + "epoch": 1.2690497392189288, + "ewc_loss": 0.039370857179164886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9685428924276493e-05, + "grad_norm": 27.28780746459961, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8649505376815796, + "num_tokens": 380701452.0, + "step": 9976 + }, + { + "epoch": 1.2691769494975194, + "ewc_loss": 0.03936920687556267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9684603103087284e-05, + "grad_norm": 27.069740295410156, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8748222589492798, + "num_tokens": 380741248.0, + "step": 9977 + }, + { + "epoch": 1.2693041597761099, + "ewc_loss": 0.0393250435590744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966252239071764e-05, + "grad_norm": 27.19825553894043, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8566969633102417, + "num_tokens": 380781455.0, + "step": 9978 + }, + { + "epoch": 1.2694313700547004, + "ewc_loss": 0.0393700934946537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.968504693650175e-05, + "grad_norm": 27.155654907226562, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8553261160850525, + "num_tokens": 380818139.0, + "step": 9979 + }, + { + "epoch": 1.269558580333291, + "ewc_loss": 0.03937717154622078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.968858668988105e-05, + "grad_norm": 27.36017417907715, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8694912195205688, + "num_tokens": 380860530.0, + "step": 
9980 + }, + { + "epoch": 1.2696857906118815, + "ewc_loss": 0.03936169669032097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9680848708958365e-05, + "grad_norm": 27.16437530517578, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8763920664787292, + "num_tokens": 380900825.0, + "step": 9981 + }, + { + "epoch": 1.269813000890472, + "ewc_loss": 0.039266377687454224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.963318936759606e-05, + "grad_norm": 27.09690284729004, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8613463640213013, + "num_tokens": 380939270.0, + "step": 9982 + }, + { + "epoch": 1.2699402111690625, + "ewc_loss": 0.039323192089796066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9661596525111236e-05, + "grad_norm": 27.14950942993164, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8420167565345764, + "num_tokens": 380981708.0, + "step": 9983 + }, + { + "epoch": 1.2700674214476528, + "ewc_loss": 0.03932010754942894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966005402209703e-05, + "grad_norm": 27.07562255859375, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8694562315940857, + "num_tokens": 381015186.0, + "step": 9984 + }, + { + "epoch": 1.2701946317262434, + "ewc_loss": 0.03937838226556778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9689190594363026e-05, + "grad_norm": 27.283586502075195, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8738196492195129, + "num_tokens": 381055503.0, + "step": 9985 + }, + { + "epoch": 1.270321842004834, + "ewc_loss": 0.03928043320775032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9640216123661958e-05, + "grad_norm": 26.989917755126953, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8764635920524597, + "num_tokens": 381091935.0, + "step": 9986 + }, + { + "epoch": 1.2704490522834244, + "ewc_loss": 0.0393342450261116, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 1.9667122614919208e-05, + "grad_norm": 27.16230010986328, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8689975738525391, + "num_tokens": 381127951.0, + "step": 9987 + }, + { + "epoch": 1.270576262562015, + "ewc_loss": 0.03936144337058067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9680721379700117e-05, + "grad_norm": 27.245485305786133, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8569145202636719, + "num_tokens": 381161918.0, + "step": 9988 + }, + { + "epoch": 1.2707034728406055, + "ewc_loss": 0.03931870311498642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.965935189218726e-05, + "grad_norm": 26.997011184692383, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8520119190216064, + "num_tokens": 381198747.0, + "step": 9989 + }, + { + "epoch": 1.270830683119196, + "ewc_loss": 0.039304569363594055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9652285118354484e-05, + "grad_norm": 27.14978790283203, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8647867441177368, + "num_tokens": 381242714.0, + "step": 9990 + }, + { + "epoch": 1.2709578933977865, + "ewc_loss": 0.0393783338367939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.968916694750078e-05, + "grad_norm": 27.074893951416016, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8748307824134827, + "num_tokens": 381274667.0, + "step": 9991 + }, + { + "epoch": 1.271085103676377, + "ewc_loss": 0.03932909667491913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.966454874491319e-05, + "grad_norm": 27.04376792907715, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8737444877624512, + "num_tokens": 381318110.0, + "step": 9992 + }, + { + "epoch": 1.2712123139549676, + "ewc_loss": 0.03935101255774498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.967550633708015e-05, + "grad_norm": 27.029706954956055, + "learning_rate": 1e-06, + "loss": 0.432, + 
"mean_token_accuracy": 0.8652762174606323, + "num_tokens": 381356694.0, + "step": 9993 + }, + { + "epoch": 1.2713395242335581, + "ewc_loss": 0.03934495523571968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9672477719723247e-05, + "grad_norm": 27.132204055786133, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8536030054092407, + "num_tokens": 381395218.0, + "step": 9994 + }, + { + "epoch": 1.2714667345121486, + "ewc_loss": 0.039387717843055725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9693858121172525e-05, + "grad_norm": 27.113117218017578, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8778588771820068, + "num_tokens": 381429195.0, + "step": 9995 + }, + { + "epoch": 1.2715939447907392, + "ewc_loss": 0.03937966749072075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.968983451661188e-05, + "grad_norm": 27.118070602416992, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8580067157745361, + "num_tokens": 381468276.0, + "step": 9996 + }, + { + "epoch": 1.2717211550693297, + "ewc_loss": 0.03939737379550934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.969868753803894e-05, + "grad_norm": 27.080402374267578, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8722174167633057, + "num_tokens": 381503400.0, + "step": 9997 + }, + { + "epoch": 1.2718483653479202, + "ewc_loss": 0.03940042108297348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9700210032169707e-05, + "grad_norm": 27.151365280151367, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8694422245025635, + "num_tokens": 381542126.0, + "step": 9998 + }, + { + "epoch": 1.2719755756265108, + "ewc_loss": 0.039422277361154556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.971113852050621e-05, + "grad_norm": 27.097389221191406, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8719190359115601, + "num_tokens": 381579169.0, + "step": 9999 + }, + { + "epoch": 
1.272102785905101, + "ewc_loss": 0.039322659373283386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9661329133668914e-05, + "grad_norm": 26.987672805786133, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8490565419197083, + "num_tokens": 381615501.0, + "step": 10000 + }, + { + "epoch": 1.2722299961836916, + "ewc_loss": 0.039427340030670166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9713670553755946e-05, + "grad_norm": 27.1143798828125, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8591310381889343, + "num_tokens": 381649737.0, + "step": 10001 + }, + { + "epoch": 1.2723572064622821, + "ewc_loss": 0.039430540055036545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9715269445441663e-05, + "grad_norm": 27.13099479675293, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8678406476974487, + "num_tokens": 381693776.0, + "step": 10002 + }, + { + "epoch": 1.2724844167408726, + "ewc_loss": 0.039389584213495255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9694791262736544e-05, + "grad_norm": 27.159046173095703, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8747415542602539, + "num_tokens": 381733324.0, + "step": 10003 + }, + { + "epoch": 1.2726116270194632, + "ewc_loss": 0.03940001130104065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.970000630535651e-05, + "grad_norm": 27.06463623046875, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.859250545501709, + "num_tokens": 381765893.0, + "step": 10004 + }, + { + "epoch": 1.2727388372980537, + "ewc_loss": 0.03940543532371521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9702718418557197e-05, + "grad_norm": 27.101896286010742, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8630821704864502, + "num_tokens": 381802782.0, + "step": 10005 + }, + { + "epoch": 1.2728660475766442, + "ewc_loss": 0.039455022662878036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9727511244127527e-05, + "grad_norm": 27.10652732849121, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8676767945289612, + "num_tokens": 381843448.0, + "step": 10006 + }, + { + "epoch": 1.2729932578552348, + "ewc_loss": 0.039446957409381866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9723478544619866e-05, + "grad_norm": 27.04884910583496, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.852368175983429, + "num_tokens": 381878850.0, + "step": 10007 + }, + { + "epoch": 1.2731204681338253, + "ewc_loss": 0.039500586688518524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975029408640694e-05, + "grad_norm": 27.204360961914062, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8425017595291138, + "num_tokens": 381915508.0, + "step": 10008 + }, + { + "epoch": 1.2732476784124156, + "ewc_loss": 0.03949182853102684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.97459139599232e-05, + "grad_norm": 27.10334587097168, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8546130657196045, + "num_tokens": 381952699.0, + "step": 10009 + }, + { + "epoch": 1.2733748886910061, + "ewc_loss": 0.039460439234972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9730219719349407e-05, + "grad_norm": 27.07598876953125, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8577052354812622, + "num_tokens": 381989947.0, + "step": 10010 + }, + { + "epoch": 1.2735020989695967, + "ewc_loss": 0.03953627496957779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.976813655346632e-05, + "grad_norm": 27.182592391967773, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8629400730133057, + "num_tokens": 382027701.0, + "step": 10011 + }, + { + "epoch": 1.2736293092481872, + "ewc_loss": 0.03946824371814728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9734121451620013e-05, + "grad_norm": 27.103025436401367, + "learning_rate": 1e-06, + "loss": 0.4473, + 
"mean_token_accuracy": 0.861819863319397, + "num_tokens": 382066415.0, + "step": 10012 + }, + { + "epoch": 1.2737565195267777, + "ewc_loss": 0.03950384631752968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9751923900912516e-05, + "grad_norm": 27.1364803314209, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8590872883796692, + "num_tokens": 382108402.0, + "step": 10013 + }, + { + "epoch": 1.2738837298053682, + "ewc_loss": 0.039557140320539474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.977857027668506e-05, + "grad_norm": 27.14898109436035, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.865327775478363, + "num_tokens": 382146865.0, + "step": 10014 + }, + { + "epoch": 1.2740109400839588, + "ewc_loss": 0.03951837122440338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975918530661147e-05, + "grad_norm": 27.130373001098633, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.861213207244873, + "num_tokens": 382186746.0, + "step": 10015 + }, + { + "epoch": 1.2741381503625493, + "ewc_loss": 0.03956708312034607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.978354157472495e-05, + "grad_norm": 27.244295120239258, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8769605159759521, + "num_tokens": 382220693.0, + "step": 10016 + }, + { + "epoch": 1.2742653606411398, + "ewc_loss": 0.03947628661990166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9738143237191252e-05, + "grad_norm": 27.02705192565918, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8584179878234863, + "num_tokens": 382259549.0, + "step": 10017 + }, + { + "epoch": 1.2743925709197303, + "ewc_loss": 0.03946835175156593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.973417602130212e-05, + "grad_norm": 27.13533592224121, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8741270899772644, + "num_tokens": 382299164.0, + "step": 10018 + }, + { + "epoch": 
1.2745197811983209, + "ewc_loss": 0.03954814001917839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9774070096900687e-05, + "grad_norm": 27.06424903869629, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.877253532409668, + "num_tokens": 382333596.0, + "step": 10019 + }, + { + "epoch": 1.2746469914769114, + "ewc_loss": 0.03941237926483154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9706189050339162e-05, + "grad_norm": 27.04176139831543, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8664672374725342, + "num_tokens": 382372321.0, + "step": 10020 + }, + { + "epoch": 1.274774201755502, + "ewc_loss": 0.039590343832969666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9795172192971222e-05, + "grad_norm": 27.20072364807129, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8673162460327148, + "num_tokens": 382411125.0, + "step": 10021 + }, + { + "epoch": 1.2749014120340925, + "ewc_loss": 0.03953108564019203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9765542674576864e-05, + "grad_norm": 27.102413177490234, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8541186451911926, + "num_tokens": 382450935.0, + "step": 10022 + }, + { + "epoch": 1.275028622312683, + "ewc_loss": 0.039512138813734055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9756069377763197e-05, + "grad_norm": 27.085102081298828, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8624144196510315, + "num_tokens": 382488232.0, + "step": 10023 + }, + { + "epoch": 1.2751558325912733, + "ewc_loss": 0.03957052528858185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9785262338700704e-05, + "grad_norm": 27.109769821166992, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8584529757499695, + "num_tokens": 382522280.0, + "step": 10024 + }, + { + "epoch": 1.2752830428698638, + "ewc_loss": 0.03949318826198578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9746594261960126e-05, + "grad_norm": 27.226436614990234, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8739830851554871, + "num_tokens": 382562788.0, + "step": 10025 + }, + { + "epoch": 1.2754102531484544, + "ewc_loss": 0.03954513370990753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9772567611653358e-05, + "grad_norm": 27.094743728637695, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8573254346847534, + "num_tokens": 382598534.0, + "step": 10026 + }, + { + "epoch": 1.2755374634270449, + "ewc_loss": 0.03950983285903931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975491613848135e-05, + "grad_norm": 27.300403594970703, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8578351736068726, + "num_tokens": 382641803.0, + "step": 10027 + }, + { + "epoch": 1.2756646737056354, + "ewc_loss": 0.03945077955722809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9725390302482992e-05, + "grad_norm": 27.0582218170166, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8748891353607178, + "num_tokens": 382677871.0, + "step": 10028 + }, + { + "epoch": 1.275791883984226, + "ewc_loss": 0.03944418579339981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9722092474694364e-05, + "grad_norm": 27.222509384155273, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8734678030014038, + "num_tokens": 382714989.0, + "step": 10029 + }, + { + "epoch": 1.2759190942628165, + "ewc_loss": 0.03957968205213547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.978984073502943e-05, + "grad_norm": 27.22113800048828, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8676002621650696, + "num_tokens": 382751919.0, + "step": 10030 + }, + { + "epoch": 1.276046304541407, + "ewc_loss": 0.03942881152033806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.971440542547498e-05, + "grad_norm": 27.12383460998535, + "learning_rate": 1e-06, + "loss": 0.4026, + 
"mean_token_accuracy": 0.875532865524292, + "num_tokens": 382793550.0, + "step": 10031 + }, + { + "epoch": 1.2761735148199975, + "ewc_loss": 0.03947601467370987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9738006812985986e-05, + "grad_norm": 27.159423828125, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8711699843406677, + "num_tokens": 382833142.0, + "step": 10032 + }, + { + "epoch": 1.2763007250985878, + "ewc_loss": 0.039456047117710114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9728024199139327e-05, + "grad_norm": 27.220972061157227, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8720071315765381, + "num_tokens": 382869934.0, + "step": 10033 + }, + { + "epoch": 1.2764279353771784, + "ewc_loss": 0.03946952894330025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9734765373868868e-05, + "grad_norm": 27.140295028686523, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8514402508735657, + "num_tokens": 382909533.0, + "step": 10034 + }, + { + "epoch": 1.2765551456557689, + "ewc_loss": 0.039398349821567535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.969917502719909e-05, + "grad_norm": 27.090600967407227, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8704758882522583, + "num_tokens": 382947462.0, + "step": 10035 + }, + { + "epoch": 1.2766823559343594, + "ewc_loss": 0.03950934857130051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9754674212890677e-05, + "grad_norm": 27.21807098388672, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8651014566421509, + "num_tokens": 382989973.0, + "step": 10036 + }, + { + "epoch": 1.27680956621295, + "ewc_loss": 0.03941556438803673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.970778248505667e-05, + "grad_norm": 27.087705612182617, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.863200306892395, + "num_tokens": 383033052.0, + "step": 10037 + }, + { + "epoch": 
1.2769367764915405, + "ewc_loss": 0.039393242448568344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.969662116607651e-05, + "grad_norm": 27.180299758911133, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8629124760627747, + "num_tokens": 383075981.0, + "step": 10038 + }, + { + "epoch": 1.277063986770131, + "ewc_loss": 0.03946122154593468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.973061080207117e-05, + "grad_norm": 27.108427047729492, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8804817199707031, + "num_tokens": 383111984.0, + "step": 10039 + }, + { + "epoch": 1.2771911970487215, + "ewc_loss": 0.03944720700383186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.972360405488871e-05, + "grad_norm": 27.127538681030273, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8660093545913696, + "num_tokens": 383155323.0, + "step": 10040 + }, + { + "epoch": 1.277318407327312, + "ewc_loss": 0.03945387527346611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.972693826246541e-05, + "grad_norm": 27.192523956298828, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8634098172187805, + "num_tokens": 383190069.0, + "step": 10041 + }, + { + "epoch": 1.2774456176059026, + "ewc_loss": 0.03943794220685959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.971897108887788e-05, + "grad_norm": 27.188167572021484, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.856065571308136, + "num_tokens": 383224999.0, + "step": 10042 + }, + { + "epoch": 1.2775728278844931, + "ewc_loss": 0.03938790038228035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9693950889632106e-05, + "grad_norm": 27.07815933227539, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8572444915771484, + "num_tokens": 383267764.0, + "step": 10043 + }, + { + "epoch": 1.2777000381630836, + "ewc_loss": 0.0393621101975441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9681054254760966e-05, + "grad_norm": 27.144380569458008, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.873916745185852, + "num_tokens": 383306148.0, + "step": 10044 + }, + { + "epoch": 1.2778272484416742, + "ewc_loss": 0.039513327181339264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9756664187298156e-05, + "grad_norm": 27.149322509765625, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8680413961410522, + "num_tokens": 383343408.0, + "step": 10045 + }, + { + "epoch": 1.2779544587202647, + "ewc_loss": 0.0394534096121788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9726705431821756e-05, + "grad_norm": 27.229713439941406, + "learning_rate": 1e-06, + "loss": 0.357, + "mean_token_accuracy": 0.8874301910400391, + "num_tokens": 383383626.0, + "step": 10046 + }, + { + "epoch": 1.2780816689988552, + "ewc_loss": 0.03939598426222801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9697992684086785e-05, + "grad_norm": 27.160655975341797, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8485008478164673, + "num_tokens": 383423218.0, + "step": 10047 + }, + { + "epoch": 1.2782088792774458, + "ewc_loss": 0.03943408280611038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9717041141120717e-05, + "grad_norm": 27.15378761291504, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8722349405288696, + "num_tokens": 383461126.0, + "step": 10048 + }, + { + "epoch": 1.278336089556036, + "ewc_loss": 0.03943103551864624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.971551864698995e-05, + "grad_norm": 27.191843032836914, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8560431599617004, + "num_tokens": 383501808.0, + "step": 10049 + }, + { + "epoch": 1.2784632998346266, + "ewc_loss": 0.03936266154050827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9681330741150305e-05, + "grad_norm": 27.10484504699707, + "learning_rate": 1e-06, + "loss": 0.3754, + 
"mean_token_accuracy": 0.8856675028800964, + "num_tokens": 383542395.0, + "step": 10050 + }, + { + "epoch": 1.2785905101132171, + "ewc_loss": 0.039438676089048386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9719338524737395e-05, + "grad_norm": 27.229148864746094, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8605419993400574, + "num_tokens": 383579968.0, + "step": 10051 + }, + { + "epoch": 1.2787177203918076, + "ewc_loss": 0.0393807515501976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9690376575454138e-05, + "grad_norm": 27.034130096435547, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8646525144577026, + "num_tokens": 383617861.0, + "step": 10052 + }, + { + "epoch": 1.2788449306703982, + "ewc_loss": 0.03941333666443825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9706667444552295e-05, + "grad_norm": 27.27564811706543, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8592613935470581, + "num_tokens": 383652534.0, + "step": 10053 + }, + { + "epoch": 1.2789721409489887, + "ewc_loss": 0.03948689252138138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.974344559130259e-05, + "grad_norm": 27.228225708007812, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8606114983558655, + "num_tokens": 383693120.0, + "step": 10054 + }, + { + "epoch": 1.2790993512275792, + "ewc_loss": 0.039354197680950165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9677099771797657e-05, + "grad_norm": 27.100406646728516, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.854547381401062, + "num_tokens": 383734958.0, + "step": 10055 + }, + { + "epoch": 1.2792265615061698, + "ewc_loss": 0.039415232837200165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9707616957020946e-05, + "grad_norm": 27.127275466918945, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8572332262992859, + "num_tokens": 383768873.0, + "step": 10056 + }, + { + "epoch": 
1.2793537717847603, + "ewc_loss": 0.03941253572702408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9706267266883515e-05, + "grad_norm": 27.137535095214844, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8657709360122681, + "num_tokens": 383804063.0, + "step": 10057 + }, + { + "epoch": 1.2794809820633506, + "ewc_loss": 0.03938794881105423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9693974536494352e-05, + "grad_norm": 27.113712310791016, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8782945275306702, + "num_tokens": 383843098.0, + "step": 10058 + }, + { + "epoch": 1.2796081923419411, + "ewc_loss": 0.039479196071624756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.973959842871409e-05, + "grad_norm": 27.137605667114258, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.872494101524353, + "num_tokens": 383879443.0, + "step": 10059 + }, + { + "epoch": 1.2797354026205316, + "ewc_loss": 0.039415549486875534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9707775209099054e-05, + "grad_norm": 27.081890106201172, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8640260696411133, + "num_tokens": 383918071.0, + "step": 10060 + }, + { + "epoch": 1.2798626128991222, + "ewc_loss": 0.03949796408414841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9748982595046982e-05, + "grad_norm": 27.182636260986328, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8579995632171631, + "num_tokens": 383953052.0, + "step": 10061 + }, + { + "epoch": 1.2799898231777127, + "ewc_loss": 0.039487287402153015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9743643861147575e-05, + "grad_norm": 27.132766723632812, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8622196912765503, + "num_tokens": 383992835.0, + "step": 10062 + }, + { + "epoch": 1.2801170334563032, + "ewc_loss": 0.03945223614573479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.972611789824441e-05, + "grad_norm": 27.088428497314453, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8518299460411072, + "num_tokens": 384031234.0, + "step": 10063 + }, + { + "epoch": 1.2802442437348938, + "ewc_loss": 0.03951792046427727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975895975192543e-05, + "grad_norm": 27.185279846191406, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8556593656539917, + "num_tokens": 384063611.0, + "step": 10064 + }, + { + "epoch": 1.2803714540134843, + "ewc_loss": 0.039564166218042374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9782082745223306e-05, + "grad_norm": 27.175098419189453, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8734644651412964, + "num_tokens": 384106757.0, + "step": 10065 + }, + { + "epoch": 1.2804986642920748, + "ewc_loss": 0.039461906999349594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9730952772079036e-05, + "grad_norm": 27.262393951416016, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8677825927734375, + "num_tokens": 384143094.0, + "step": 10066 + }, + { + "epoch": 1.2806258745706653, + "ewc_loss": 0.03954857587814331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9774288375629112e-05, + "grad_norm": 27.205121994018555, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8637546300888062, + "num_tokens": 384181498.0, + "step": 10067 + }, + { + "epoch": 1.2807530848492559, + "ewc_loss": 0.03944194316864014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.972097197722178e-05, + "grad_norm": 27.191909790039062, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8680931925773621, + "num_tokens": 384209784.0, + "step": 10068 + }, + { + "epoch": 1.2808802951278464, + "ewc_loss": 0.039505962282419205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9752980733755976e-05, + "grad_norm": 27.261037826538086, + "learning_rate": 1e-06, + "loss": 0.4234, + 
"mean_token_accuracy": 0.8700518608093262, + "num_tokens": 384248000.0, + "step": 10069 + }, + { + "epoch": 1.281007505406437, + "ewc_loss": 0.03950049728155136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975024861167185e-05, + "grad_norm": 27.141883850097656, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8552420139312744, + "num_tokens": 384290897.0, + "step": 10070 + }, + { + "epoch": 1.2811347156850275, + "ewc_loss": 0.03942682594060898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9713412257260643e-05, + "grad_norm": 27.037609100341797, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8584168553352356, + "num_tokens": 384329464.0, + "step": 10071 + }, + { + "epoch": 1.281261925963618, + "ewc_loss": 0.03958570957183838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9792854800471105e-05, + "grad_norm": 27.420639038085938, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8673655390739441, + "num_tokens": 384368915.0, + "step": 10072 + }, + { + "epoch": 1.2813891362422083, + "ewc_loss": 0.03952790051698685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.976395105884876e-05, + "grad_norm": 26.96831703186035, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8737785816192627, + "num_tokens": 384407445.0, + "step": 10073 + }, + { + "epoch": 1.2815163465207988, + "ewc_loss": 0.03952058404684067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9760291252168827e-05, + "grad_norm": 27.2869873046875, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8599474430084229, + "num_tokens": 384444384.0, + "step": 10074 + }, + { + "epoch": 1.2816435567993893, + "ewc_loss": 0.03962935134768486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.981467539735604e-05, + "grad_norm": 27.1901912689209, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8626670837402344, + "num_tokens": 384479670.0, + "step": 10075 + }, + { + "epoch": 
1.2817707670779799, + "ewc_loss": 0.03954063728451729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9770319340750575e-05, + "grad_norm": 27.22410011291504, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8530634641647339, + "num_tokens": 384522436.0, + "step": 10076 + }, + { + "epoch": 1.2818979773565704, + "ewc_loss": 0.03958561643958092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9792807506746612e-05, + "grad_norm": 27.186573028564453, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.857316792011261, + "num_tokens": 384559754.0, + "step": 10077 + }, + { + "epoch": 1.282025187635161, + "ewc_loss": 0.03948065638542175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.974032784346491e-05, + "grad_norm": 27.223541259765625, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8614288568496704, + "num_tokens": 384595153.0, + "step": 10078 + }, + { + "epoch": 1.2821523979137515, + "ewc_loss": 0.03954044356942177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9770221115322784e-05, + "grad_norm": 27.01335334777832, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8629021644592285, + "num_tokens": 384628345.0, + "step": 10079 + }, + { + "epoch": 1.282279608192342, + "ewc_loss": 0.03953549265861511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9767745470744558e-05, + "grad_norm": 27.336055755615234, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8596146106719971, + "num_tokens": 384666725.0, + "step": 10080 + }, + { + "epoch": 1.2824068184709325, + "ewc_loss": 0.03958459571003914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.979229818971362e-05, + "grad_norm": 27.137907028198242, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8549053072929382, + "num_tokens": 384706217.0, + "step": 10081 + }, + { + "epoch": 1.2825340287495228, + "ewc_loss": 0.039457403123378754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9728700863197446e-05, + "grad_norm": 27.23329734802246, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8599613904953003, + "num_tokens": 384741835.0, + "step": 10082 + }, + { + "epoch": 1.2826612390281134, + "ewc_loss": 0.03951592370867729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9757961126742885e-05, + "grad_norm": 27.29173469543457, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8654348850250244, + "num_tokens": 384777273.0, + "step": 10083 + }, + { + "epoch": 1.2827884493067039, + "ewc_loss": 0.039508428424596786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975421400857158e-05, + "grad_norm": 27.147533416748047, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.878810703754425, + "num_tokens": 384814950.0, + "step": 10084 + }, + { + "epoch": 1.2829156595852944, + "ewc_loss": 0.03943946212530136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.971973142644856e-05, + "grad_norm": 27.201053619384766, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8555814623832703, + "num_tokens": 384848088.0, + "step": 10085 + }, + { + "epoch": 1.283042869863885, + "ewc_loss": 0.039545755833387375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9772878658841364e-05, + "grad_norm": 27.223772048950195, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8561214208602905, + "num_tokens": 384887361.0, + "step": 10086 + }, + { + "epoch": 1.2831700801424755, + "ewc_loss": 0.03949480876326561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9747403712244704e-05, + "grad_norm": 27.27985191345215, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8703981637954712, + "num_tokens": 384930459.0, + "step": 10087 + }, + { + "epoch": 1.283297290421066, + "ewc_loss": 0.03949586674571037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9747933038161136e-05, + "grad_norm": 27.161346435546875, + "learning_rate": 1e-06, + "loss": 0.4651, + 
"mean_token_accuracy": 0.8583502173423767, + "num_tokens": 384969506.0, + "step": 10088 + }, + { + "epoch": 1.2834245006996565, + "ewc_loss": 0.03947392478585243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.973696271306835e-05, + "grad_norm": 27.174007415771484, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8702971935272217, + "num_tokens": 385004368.0, + "step": 10089 + }, + { + "epoch": 1.283551710978247, + "ewc_loss": 0.03952045366168022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9760227587539703e-05, + "grad_norm": 27.29922103881836, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8658353090286255, + "num_tokens": 385044553.0, + "step": 10090 + }, + { + "epoch": 1.2836789212568376, + "ewc_loss": 0.03955027088522911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9775136024691164e-05, + "grad_norm": 27.254762649536133, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.876397967338562, + "num_tokens": 385076873.0, + "step": 10091 + }, + { + "epoch": 1.283806131535428, + "ewc_loss": 0.03945131227374077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.972565587493591e-05, + "grad_norm": 27.089893341064453, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8662954568862915, + "num_tokens": 385113969.0, + "step": 10092 + }, + { + "epoch": 1.2839333418140186, + "ewc_loss": 0.03951457142829895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975728628167417e-05, + "grad_norm": 27.47304916381836, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8745547533035278, + "num_tokens": 385149460.0, + "step": 10093 + }, + { + "epoch": 1.2840605520926092, + "ewc_loss": 0.03952037915587425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9760189388762228e-05, + "grad_norm": 27.335554122924805, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8466261029243469, + "num_tokens": 385187240.0, + "step": 10094 + }, + { + "epoch": 
1.2841877623711997, + "ewc_loss": 0.039471711963415146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9735856767510995e-05, + "grad_norm": 27.081748962402344, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8571303486824036, + "num_tokens": 385225532.0, + "step": 10095 + }, + { + "epoch": 1.2843149726497902, + "ewc_loss": 0.039499495178461075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9749748389585875e-05, + "grad_norm": 27.479265213012695, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8663727641105652, + "num_tokens": 385267404.0, + "step": 10096 + }, + { + "epoch": 1.2844421829283807, + "ewc_loss": 0.03956367447972298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9781837181653827e-05, + "grad_norm": 27.137414932250977, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8527481555938721, + "num_tokens": 385314008.0, + "step": 10097 + }, + { + "epoch": 1.284569393206971, + "ewc_loss": 0.039348337799310684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9674169379868545e-05, + "grad_norm": 27.151844024658203, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8517975807189941, + "num_tokens": 385352204.0, + "step": 10098 + }, + { + "epoch": 1.2846966034855616, + "ewc_loss": 0.03957490622997284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9787452401942573e-05, + "grad_norm": 27.276580810546875, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8569057583808899, + "num_tokens": 385393307.0, + "step": 10099 + }, + { + "epoch": 1.2848238137641521, + "ewc_loss": 0.03944491595029831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9722458091564476e-05, + "grad_norm": 27.135034561157227, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8649564981460571, + "num_tokens": 385426124.0, + "step": 10100 + }, + { + "epoch": 1.2849510240427426, + "ewc_loss": 0.03946496918797493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.973248436115682e-05, + "grad_norm": 27.108749389648438, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8687853813171387, + "num_tokens": 385467471.0, + "step": 10101 + }, + { + "epoch": 1.2850782343213332, + "ewc_loss": 0.03946980461478233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9734901798074134e-05, + "grad_norm": 27.192371368408203, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.855892539024353, + "num_tokens": 385508780.0, + "step": 10102 + }, + { + "epoch": 1.2852054445999237, + "ewc_loss": 0.03953471779823303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9767358026001602e-05, + "grad_norm": 27.039804458618164, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8697464466094971, + "num_tokens": 385552646.0, + "step": 10103 + }, + { + "epoch": 1.2853326548785142, + "ewc_loss": 0.03947412595152855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9737062757485546e-05, + "grad_norm": 27.18654441833496, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8521008491516113, + "num_tokens": 385586676.0, + "step": 10104 + }, + { + "epoch": 1.2854598651571048, + "ewc_loss": 0.039600323885679245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.980016168090515e-05, + "grad_norm": 27.20237159729004, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8522582054138184, + "num_tokens": 385623201.0, + "step": 10105 + }, + { + "epoch": 1.2855870754356953, + "ewc_loss": 0.03956294059753418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.978146974579431e-05, + "grad_norm": 27.191211700439453, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8743710517883301, + "num_tokens": 385662716.0, + "step": 10106 + }, + { + "epoch": 1.2857142857142856, + "ewc_loss": 0.03955743461847305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9778717614826746e-05, + "grad_norm": 27.070804595947266, + "learning_rate": 1e-06, + "loss": 0.4621, + 
"mean_token_accuracy": 0.853376567363739, + "num_tokens": 385706986.0, + "step": 10107 + }, + { + "epoch": 1.2858414959928761, + "ewc_loss": 0.039522673934698105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9761337171075866e-05, + "grad_norm": 27.171648025512695, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8637591600418091, + "num_tokens": 385744240.0, + "step": 10108 + }, + { + "epoch": 1.2859687062714666, + "ewc_loss": 0.03963729366660118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.981864625122398e-05, + "grad_norm": 27.185718536376953, + "learning_rate": 1e-06, + "loss": 0.497, + "mean_token_accuracy": 0.8483593463897705, + "num_tokens": 385778583.0, + "step": 10109 + }, + { + "epoch": 1.2860959165500572, + "ewc_loss": 0.039603374898433685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9801687813014723e-05, + "grad_norm": 27.20733642578125, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8656702041625977, + "num_tokens": 385819910.0, + "step": 10110 + }, + { + "epoch": 1.2862231268286477, + "ewc_loss": 0.03956688195466995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9783441530307755e-05, + "grad_norm": 27.18221092224121, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8617187142372131, + "num_tokens": 385860068.0, + "step": 10111 + }, + { + "epoch": 1.2863503371072382, + "ewc_loss": 0.039596494287252426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9798246285063215e-05, + "grad_norm": 27.110593795776367, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.855368971824646, + "num_tokens": 385890460.0, + "step": 10112 + }, + { + "epoch": 1.2864775473858288, + "ewc_loss": 0.0395815446972847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9790772057604045e-05, + "grad_norm": 27.187795639038086, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8614624738693237, + "num_tokens": 385926174.0, + "step": 10113 + }, + { + "epoch": 
1.2866047576644193, + "ewc_loss": 0.03961640223860741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.980820161406882e-05, + "grad_norm": 27.069425582885742, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8734708428382874, + "num_tokens": 385962928.0, + "step": 10114 + }, + { + "epoch": 1.2867319679430098, + "ewc_loss": 0.039640430361032486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.982021603907924e-05, + "grad_norm": 27.11525535583496, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8728114366531372, + "num_tokens": 386002509.0, + "step": 10115 + }, + { + "epoch": 1.2868591782216003, + "ewc_loss": 0.039671484380960464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.983574293262791e-05, + "grad_norm": 27.195476531982422, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8637471795082092, + "num_tokens": 386043304.0, + "step": 10116 + }, + { + "epoch": 1.2869863885001909, + "ewc_loss": 0.03967653214931488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9838265870930627e-05, + "grad_norm": 27.197349548339844, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8711948394775391, + "num_tokens": 386081607.0, + "step": 10117 + }, + { + "epoch": 1.2871135987787814, + "ewc_loss": 0.03961580991744995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9807905118796043e-05, + "grad_norm": 27.241548538208008, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8822811841964722, + "num_tokens": 386117762.0, + "step": 10118 + }, + { + "epoch": 1.287240809057372, + "ewc_loss": 0.03963937982916832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9819690351141617e-05, + "grad_norm": 27.084218978881836, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.85495525598526, + "num_tokens": 386155908.0, + "step": 10119 + }, + { + "epoch": 1.2873680193359625, + "ewc_loss": 0.03960046544671059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9800232621491887e-05, + "grad_norm": 27.244714736938477, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8612152338027954, + "num_tokens": 386199385.0, + "step": 10120 + }, + { + "epoch": 1.287495229614553, + "ewc_loss": 0.03967653587460518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.983826768992003e-05, + "grad_norm": 27.205732345581055, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8585327863693237, + "num_tokens": 386231892.0, + "step": 10121 + }, + { + "epoch": 1.2876224398931433, + "ewc_loss": 0.0395418256521225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.977091233129613e-05, + "grad_norm": 27.162473678588867, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8477233648300171, + "num_tokens": 386269657.0, + "step": 10122 + }, + { + "epoch": 1.2877496501717338, + "ewc_loss": 0.039641499519348145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.982074900297448e-05, + "grad_norm": 27.251501083374023, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8685615658760071, + "num_tokens": 386306936.0, + "step": 10123 + }, + { + "epoch": 1.2878768604503243, + "ewc_loss": 0.03962190821766853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9810953745036386e-05, + "grad_norm": 27.178722381591797, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8645299077033997, + "num_tokens": 386342260.0, + "step": 10124 + }, + { + "epoch": 1.2880040707289149, + "ewc_loss": 0.039595551788806915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9797775166807696e-05, + "grad_norm": 27.295705795288086, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8594685196876526, + "num_tokens": 386381784.0, + "step": 10125 + }, + { + "epoch": 1.2881312810075054, + "ewc_loss": 0.03964247927069664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9821240130113438e-05, + "grad_norm": 27.025169372558594, + "learning_rate": 1e-06, + "loss": 0.4351, + 
"mean_token_accuracy": 0.8624980449676514, + "num_tokens": 386422350.0, + "step": 10126 + }, + { + "epoch": 1.288258491286096, + "ewc_loss": 0.03962727263569832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9813636754406616e-05, + "grad_norm": 27.331979751586914, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.859754741191864, + "num_tokens": 386467951.0, + "step": 10127 + }, + { + "epoch": 1.2883857015646865, + "ewc_loss": 0.039636898785829544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.98184498003684e-05, + "grad_norm": 27.05486488342285, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8729041814804077, + "num_tokens": 386511896.0, + "step": 10128 + }, + { + "epoch": 1.288512911843277, + "ewc_loss": 0.03965863212943077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.982931644306518e-05, + "grad_norm": 27.329025268554688, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8508260846138, + "num_tokens": 386550704.0, + "step": 10129 + }, + { + "epoch": 1.2886401221218675, + "ewc_loss": 0.03975105658173561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9875527868862264e-05, + "grad_norm": 27.25404167175293, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8470748662948608, + "num_tokens": 386589295.0, + "step": 10130 + }, + { + "epoch": 1.2887673324004578, + "ewc_loss": 0.03954199701547623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9770997823798098e-05, + "grad_norm": 27.197546005249023, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8636416792869568, + "num_tokens": 386623462.0, + "step": 10131 + }, + { + "epoch": 1.2888945426790483, + "ewc_loss": 0.03964623436331749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9823117327177897e-05, + "grad_norm": 27.291275024414062, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8668879866600037, + "num_tokens": 386663036.0, + "step": 10132 + }, + { + "epoch": 
1.2890217529576389, + "ewc_loss": 0.03962355852127075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9811779566225596e-05, + "grad_norm": 27.18147087097168, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8631770014762878, + "num_tokens": 386702530.0, + "step": 10133 + }, + { + "epoch": 1.2891489632362294, + "ewc_loss": 0.03962785750627518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9813929611700587e-05, + "grad_norm": 27.24932289123535, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.878139078617096, + "num_tokens": 386739758.0, + "step": 10134 + }, + { + "epoch": 1.28927617351482, + "ewc_loss": 0.039578232914209366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9789116777246818e-05, + "grad_norm": 27.14068603515625, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8734391331672668, + "num_tokens": 386778287.0, + "step": 10135 + }, + { + "epoch": 1.2894033837934105, + "ewc_loss": 0.0396200492978096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9810024241451174e-05, + "grad_norm": 27.195953369140625, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8745228052139282, + "num_tokens": 386812348.0, + "step": 10136 + }, + { + "epoch": 1.289530594072001, + "ewc_loss": 0.039588749408721924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.979437547561247e-05, + "grad_norm": 27.13936424255371, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8710458278656006, + "num_tokens": 386855540.0, + "step": 10137 + }, + { + "epoch": 1.2896578043505915, + "ewc_loss": 0.03959939256310463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.979969601961784e-05, + "grad_norm": 27.15076446533203, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8598471879959106, + "num_tokens": 386893775.0, + "step": 10138 + }, + { + "epoch": 1.289785014629182, + "ewc_loss": 0.03959067538380623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9795337721006945e-05, + "grad_norm": 27.339073181152344, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8637797236442566, + "num_tokens": 386929553.0, + "step": 10139 + }, + { + "epoch": 1.2899122249077726, + "ewc_loss": 0.03960394486784935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9801971575361677e-05, + "grad_norm": 27.267202377319336, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8579131960868835, + "num_tokens": 386968033.0, + "step": 10140 + }, + { + "epoch": 1.290039435186363, + "ewc_loss": 0.03953829035162926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9769144273595884e-05, + "grad_norm": 27.078142166137695, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8626447916030884, + "num_tokens": 387004967.0, + "step": 10141 + }, + { + "epoch": 1.2901666454649536, + "ewc_loss": 0.039550021290779114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.977501051442232e-05, + "grad_norm": 27.329530715942383, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8509200811386108, + "num_tokens": 387038086.0, + "step": 10142 + }, + { + "epoch": 1.2902938557435442, + "ewc_loss": 0.039643920958042145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9821960449917242e-05, + "grad_norm": 27.080461502075195, + "learning_rate": 1e-06, + "loss": 0.5229, + "mean_token_accuracy": 0.8375754356384277, + "num_tokens": 387077112.0, + "step": 10143 + }, + { + "epoch": 1.2904210660221347, + "ewc_loss": 0.03953251987695694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.976625935640186e-05, + "grad_norm": 27.176660537719727, + "learning_rate": 1e-06, + "loss": 0.4956, + "mean_token_accuracy": 0.8470625877380371, + "num_tokens": 387118800.0, + "step": 10144 + }, + { + "epoch": 1.2905482763007252, + "ewc_loss": 0.039691392332315445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9845696442644112e-05, + "grad_norm": 27.296878814697266, + "learning_rate": 1e-06, + "loss": 0.4544, + 
"mean_token_accuracy": 0.8585106730461121, + "num_tokens": 387160321.0, + "step": 10145 + }, + { + "epoch": 1.2906754865793157, + "ewc_loss": 0.039537206292152405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9768602214753628e-05, + "grad_norm": 27.1971378326416, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8614354729652405, + "num_tokens": 387199387.0, + "step": 10146 + }, + { + "epoch": 1.290802696857906, + "ewc_loss": 0.039641257375478745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9820628949673846e-05, + "grad_norm": 27.33411407470703, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8692399263381958, + "num_tokens": 387238859.0, + "step": 10147 + }, + { + "epoch": 1.2909299071364966, + "ewc_loss": 0.03958124667406082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9790622900472954e-05, + "grad_norm": 27.147127151489258, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8695630431175232, + "num_tokens": 387272149.0, + "step": 10148 + }, + { + "epoch": 1.291057117415087, + "ewc_loss": 0.03951755538582802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9758777852985077e-05, + "grad_norm": 27.291126251220703, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8619462251663208, + "num_tokens": 387311617.0, + "step": 10149 + }, + { + "epoch": 1.2911843276936776, + "ewc_loss": 0.039648257195949554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.982412868528627e-05, + "grad_norm": 27.313127517700195, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8510164022445679, + "num_tokens": 387344281.0, + "step": 10150 + }, + { + "epoch": 1.2913115379722682, + "ewc_loss": 0.03954556584358215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9772782252402976e-05, + "grad_norm": 27.23770523071289, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8537437915802002, + "num_tokens": 387385959.0, + "step": 10151 + }, + { + "epoch": 
1.2914387482508587, + "ewc_loss": 0.03960447013378143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.980223532882519e-05, + "grad_norm": 27.32582664489746, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8590378761291504, + "num_tokens": 387426197.0, + "step": 10152 + }, + { + "epoch": 1.2915659585294492, + "ewc_loss": 0.03963989391922951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9819946828647517e-05, + "grad_norm": 27.525854110717773, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8688452243804932, + "num_tokens": 387462250.0, + "step": 10153 + }, + { + "epoch": 1.2916931688080397, + "ewc_loss": 0.03954806923866272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9774033717112616e-05, + "grad_norm": 27.356473922729492, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8632062673568726, + "num_tokens": 387499903.0, + "step": 10154 + }, + { + "epoch": 1.2918203790866303, + "ewc_loss": 0.039406899362802505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9703449652297422e-05, + "grad_norm": 27.20215606689453, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8701725006103516, + "num_tokens": 387535361.0, + "step": 10155 + }, + { + "epoch": 1.2919475893652206, + "ewc_loss": 0.039539631456136703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9769815480685793e-05, + "grad_norm": 27.285097122192383, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.859779953956604, + "num_tokens": 387574300.0, + "step": 10156 + }, + { + "epoch": 1.2920747996438111, + "ewc_loss": 0.03954913094639778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.977456486201845e-05, + "grad_norm": 27.407962799072266, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8649847507476807, + "num_tokens": 387606860.0, + "step": 10157 + }, + { + "epoch": 1.2922020099224016, + "ewc_loss": 0.03955908864736557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9779543436015956e-05, + "grad_norm": 27.316057205200195, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8528909683227539, + "num_tokens": 387646519.0, + "step": 10158 + }, + { + "epoch": 1.2923292202009922, + "ewc_loss": 0.03944757580757141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.972378777281847e-05, + "grad_norm": 27.221485137939453, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8749053478240967, + "num_tokens": 387681100.0, + "step": 10159 + }, + { + "epoch": 1.2924564304795827, + "ewc_loss": 0.03955172747373581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9775863620452583e-05, + "grad_norm": 27.371776580810547, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.885354220867157, + "num_tokens": 387715962.0, + "step": 10160 + }, + { + "epoch": 1.2925836407581732, + "ewc_loss": 0.03957105427980423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9785527911153622e-05, + "grad_norm": 27.30072593688965, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8626292943954468, + "num_tokens": 387755856.0, + "step": 10161 + }, + { + "epoch": 1.2927108510367638, + "ewc_loss": 0.03950771689414978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975385930563789e-05, + "grad_norm": 27.2377986907959, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.871170163154602, + "num_tokens": 387793256.0, + "step": 10162 + }, + { + "epoch": 1.2928380613153543, + "ewc_loss": 0.03955821692943573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.977910869754851e-05, + "grad_norm": 27.352190017700195, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8595241904258728, + "num_tokens": 387834696.0, + "step": 10163 + }, + { + "epoch": 1.2929652715939448, + "ewc_loss": 0.03948656842112541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9743283701245673e-05, + "grad_norm": 27.176990509033203, + "learning_rate": 1e-06, + "loss": 0.48, + 
"mean_token_accuracy": 0.8520321249961853, + "num_tokens": 387874973.0, + "step": 10164 + }, + { + "epoch": 1.2930924818725353, + "ewc_loss": 0.03946388140320778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.973194048332516e-05, + "grad_norm": 27.365514755249023, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8528661727905273, + "num_tokens": 387904887.0, + "step": 10165 + }, + { + "epoch": 1.2932196921511259, + "ewc_loss": 0.03957298770546913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9786493794526905e-05, + "grad_norm": 27.172203063964844, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8707284927368164, + "num_tokens": 387944676.0, + "step": 10166 + }, + { + "epoch": 1.2933469024297164, + "ewc_loss": 0.039446137845516205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9723069272004068e-05, + "grad_norm": 27.136856079101562, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8640318512916565, + "num_tokens": 387982234.0, + "step": 10167 + }, + { + "epoch": 1.293474112708307, + "ewc_loss": 0.03959948569536209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9799743313342333e-05, + "grad_norm": 27.221355438232422, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8651520013809204, + "num_tokens": 388018001.0, + "step": 10168 + }, + { + "epoch": 1.2936013229868975, + "ewc_loss": 0.03958602249622345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.979301123355981e-05, + "grad_norm": 27.166412353515625, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8582030534744263, + "num_tokens": 388057476.0, + "step": 10169 + }, + { + "epoch": 1.293728533265488, + "ewc_loss": 0.03963369131088257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.981684545171447e-05, + "grad_norm": 27.261613845825195, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.881485104560852, + "num_tokens": 388095351.0, + "step": 10170 + }, + { + "epoch": 
1.2938557435440783, + "ewc_loss": 0.03969535604119301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9847677322104573e-05, + "grad_norm": 27.21090316772461, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8634909391403198, + "num_tokens": 388135366.0, + "step": 10171 + }, + { + "epoch": 1.2939829538226688, + "ewc_loss": 0.03955940902233124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.977970532607287e-05, + "grad_norm": 27.202194213867188, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8526157140731812, + "num_tokens": 388175003.0, + "step": 10172 + }, + { + "epoch": 1.2941101641012593, + "ewc_loss": 0.03967000171542168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9835000784951262e-05, + "grad_norm": 27.209774017333984, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.8457233905792236, + "num_tokens": 388207083.0, + "step": 10173 + }, + { + "epoch": 1.2942373743798499, + "ewc_loss": 0.0396854504942894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984272603294812e-05, + "grad_norm": 27.22480583190918, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.879419207572937, + "num_tokens": 388245038.0, + "step": 10174 + }, + { + "epoch": 1.2943645846584404, + "ewc_loss": 0.03966169431805611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9830848032142967e-05, + "grad_norm": 27.191709518432617, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8602234125137329, + "num_tokens": 388285035.0, + "step": 10175 + }, + { + "epoch": 1.294491794937031, + "ewc_loss": 0.03966804966330528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9834023987641558e-05, + "grad_norm": 27.167728424072266, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8688284158706665, + "num_tokens": 388323971.0, + "step": 10176 + }, + { + "epoch": 1.2946190052156215, + "ewc_loss": 0.03968914598226547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.984457230719272e-05, + "grad_norm": 27.161394119262695, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8586571216583252, + "num_tokens": 388362256.0, + "step": 10177 + }, + { + "epoch": 1.294746215494212, + "ewc_loss": 0.039744868874549866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9872433767886832e-05, + "grad_norm": 27.243026733398438, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8837199807167053, + "num_tokens": 388401120.0, + "step": 10178 + }, + { + "epoch": 1.2948734257728025, + "ewc_loss": 0.039613183587789536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9806591808446683e-05, + "grad_norm": 27.071918487548828, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8662189245223999, + "num_tokens": 388446833.0, + "step": 10179 + }, + { + "epoch": 1.2950006360513928, + "ewc_loss": 0.039665333926677704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9832667021546513e-05, + "grad_norm": 27.302791595458984, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8756279945373535, + "num_tokens": 388482266.0, + "step": 10180 + }, + { + "epoch": 1.2951278463299833, + "ewc_loss": 0.0397915318608284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9895765944966115e-05, + "grad_norm": 27.26636505126953, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8647897839546204, + "num_tokens": 388518024.0, + "step": 10181 + }, + { + "epoch": 1.2952550566085739, + "ewc_loss": 0.03972579911351204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.986290044442285e-05, + "grad_norm": 27.280508041381836, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8665981292724609, + "num_tokens": 388564246.0, + "step": 10182 + }, + { + "epoch": 1.2953822668871644, + "ewc_loss": 0.0396391823887825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9819592125713825e-05, + "grad_norm": 27.330385208129883, + "learning_rate": 1e-06, + "loss": 0.4213, + 
"mean_token_accuracy": 0.8674153685569763, + "num_tokens": 388603556.0, + "step": 10183 + }, + { + "epoch": 1.295509477165755, + "ewc_loss": 0.03961220756173134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9806104319286533e-05, + "grad_norm": 27.18787384033203, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8729077577590942, + "num_tokens": 388643615.0, + "step": 10184 + }, + { + "epoch": 1.2956366874443455, + "ewc_loss": 0.03960122540593147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9800612790277228e-05, + "grad_norm": 27.367345809936523, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8681105971336365, + "num_tokens": 388682917.0, + "step": 10185 + }, + { + "epoch": 1.295763897722936, + "ewc_loss": 0.039626941084861755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9813471226370893e-05, + "grad_norm": 27.26180648803711, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8663003444671631, + "num_tokens": 388720269.0, + "step": 10186 + }, + { + "epoch": 1.2958911080015265, + "ewc_loss": 0.039578162133693695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9789080397458747e-05, + "grad_norm": 27.351911544799805, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8569275140762329, + "num_tokens": 388759245.0, + "step": 10187 + }, + { + "epoch": 1.296018318280117, + "ewc_loss": 0.039602648466825485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9801324015134014e-05, + "grad_norm": 27.167354583740234, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8604261875152588, + "num_tokens": 388795673.0, + "step": 10188 + }, + { + "epoch": 1.2961455285587076, + "ewc_loss": 0.03953946754336357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9769733626162633e-05, + "grad_norm": 27.27438735961914, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8681883811950684, + "num_tokens": 388832926.0, + "step": 10189 + }, + { + "epoch": 
1.296272738837298, + "ewc_loss": 0.03963372856378555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9816863641608506e-05, + "grad_norm": 27.245559692382812, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8678040504455566, + "num_tokens": 388869767.0, + "step": 10190 + }, + { + "epoch": 1.2963999491158886, + "ewc_loss": 0.03964539244771004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9822697140625678e-05, + "grad_norm": 27.152372360229492, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8666069507598877, + "num_tokens": 388913347.0, + "step": 10191 + }, + { + "epoch": 1.2965271593944792, + "ewc_loss": 0.03960230574011803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.980115303013008e-05, + "grad_norm": 27.182247161865234, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8760418891906738, + "num_tokens": 388948211.0, + "step": 10192 + }, + { + "epoch": 1.2966543696730697, + "ewc_loss": 0.03969847038388252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984923437703401e-05, + "grad_norm": 27.122156143188477, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8695382475852966, + "num_tokens": 388989198.0, + "step": 10193 + }, + { + "epoch": 1.2967815799516602, + "ewc_loss": 0.03967802971601486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9839015294564888e-05, + "grad_norm": 27.27977180480957, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8561244010925293, + "num_tokens": 389029076.0, + "step": 10194 + }, + { + "epoch": 1.2969087902302507, + "ewc_loss": 0.03971049562096596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.985524795600213e-05, + "grad_norm": 27.158369064331055, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8636691570281982, + "num_tokens": 389066751.0, + "step": 10195 + }, + { + "epoch": 1.297036000508841, + "ewc_loss": 0.039609119296073914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9804559997282922e-05, + "grad_norm": 27.347213745117188, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8626208305358887, + "num_tokens": 389104207.0, + "step": 10196 + }, + { + "epoch": 1.2971632107874316, + "ewc_loss": 0.039661578834056854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9830789824482054e-05, + "grad_norm": 27.23012924194336, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8677629232406616, + "num_tokens": 389141941.0, + "step": 10197 + }, + { + "epoch": 1.297290421066022, + "ewc_loss": 0.039618197828531265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.980909837584477e-05, + "grad_norm": 27.250640869140625, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8756071925163269, + "num_tokens": 389185326.0, + "step": 10198 + }, + { + "epoch": 1.2974176313446126, + "ewc_loss": 0.03969227150082588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984613663807977e-05, + "grad_norm": 27.334197998046875, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8736464977264404, + "num_tokens": 389221512.0, + "step": 10199 + }, + { + "epoch": 1.2975448416232032, + "ewc_loss": 0.039628539234399796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.981426976271905e-05, + "grad_norm": 27.395193099975586, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8716135621070862, + "num_tokens": 389259388.0, + "step": 10200 + }, + { + "epoch": 1.2976720519017937, + "ewc_loss": 0.039625488221645355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9812743630609475e-05, + "grad_norm": 27.279081344604492, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8542799949645996, + "num_tokens": 389298773.0, + "step": 10201 + }, + { + "epoch": 1.2977992621803842, + "ewc_loss": 0.03948301076889038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9741504729609005e-05, + "grad_norm": 27.191926956176758, + "learning_rate": 1e-06, + "loss": 0.4295, + 
"mean_token_accuracy": 0.8663198947906494, + "num_tokens": 389339680.0, + "step": 10202 + }, + { + "epoch": 1.2979264724589747, + "ewc_loss": 0.03966256231069565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.983128095162101e-05, + "grad_norm": 27.306747436523438, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8678745031356812, + "num_tokens": 389380755.0, + "step": 10203 + }, + { + "epoch": 1.2980536827375653, + "ewc_loss": 0.039517804980278015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.975890336325392e-05, + "grad_norm": 27.190763473510742, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8686147332191467, + "num_tokens": 389412031.0, + "step": 10204 + }, + { + "epoch": 1.2981808930161556, + "ewc_loss": 0.03956872224807739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.978436193894595e-05, + "grad_norm": 27.333011627197266, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.875795841217041, + "num_tokens": 389448299.0, + "step": 10205 + }, + { + "epoch": 1.298308103294746, + "ewc_loss": 0.03967590630054474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9837953004753217e-05, + "grad_norm": 27.259244918823242, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8591651916503906, + "num_tokens": 389483962.0, + "step": 10206 + }, + { + "epoch": 1.2984353135733366, + "ewc_loss": 0.03964788466691971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9823943148367107e-05, + "grad_norm": 27.3105525970459, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8570573329925537, + "num_tokens": 389521989.0, + "step": 10207 + }, + { + "epoch": 1.2985625238519272, + "ewc_loss": 0.039652131497859955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9826065909001045e-05, + "grad_norm": 27.20762062072754, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8599063158035278, + "num_tokens": 389558228.0, + "step": 10208 + }, + { + "epoch": 
1.2986897341305177, + "ewc_loss": 0.03961384296417236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9806921045528725e-05, + "grad_norm": 27.2887020111084, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.863184928894043, + "num_tokens": 389597164.0, + "step": 10209 + }, + { + "epoch": 1.2988169444091082, + "ewc_loss": 0.03965587913990021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9827939468086697e-05, + "grad_norm": 27.12748908996582, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8651676774024963, + "num_tokens": 389636435.0, + "step": 10210 + }, + { + "epoch": 1.2989441546876987, + "ewc_loss": 0.03971441090106964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9857205188600346e-05, + "grad_norm": 27.36467933654785, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8558580875396729, + "num_tokens": 389670498.0, + "step": 10211 + }, + { + "epoch": 1.2990713649662893, + "ewc_loss": 0.03977426886558533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.988713484024629e-05, + "grad_norm": 27.27292251586914, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8666641116142273, + "num_tokens": 389707433.0, + "step": 10212 + }, + { + "epoch": 1.2991985752448798, + "ewc_loss": 0.03962164744734764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9810824596788734e-05, + "grad_norm": 27.37986183166504, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8661407232284546, + "num_tokens": 389747300.0, + "step": 10213 + }, + { + "epoch": 1.2993257855234703, + "ewc_loss": 0.0397152416408062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9857619918184355e-05, + "grad_norm": 27.178503036499023, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.859002947807312, + "num_tokens": 389788381.0, + "step": 10214 + }, + { + "epoch": 1.2994529958020609, + "ewc_loss": 0.03965682536363602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.982841240533162e-05, + "grad_norm": 27.40812873840332, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.868154764175415, + "num_tokens": 389821792.0, + "step": 10215 + }, + { + "epoch": 1.2995802060806514, + "ewc_loss": 0.03965466469526291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9827331925625913e-05, + "grad_norm": 27.148414611816406, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8762837052345276, + "num_tokens": 389859370.0, + "step": 10216 + }, + { + "epoch": 1.299707416359242, + "ewc_loss": 0.03963519632816315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.981759851332754e-05, + "grad_norm": 27.410812377929688, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8562614917755127, + "num_tokens": 389897020.0, + "step": 10217 + }, + { + "epoch": 1.2998346266378324, + "ewc_loss": 0.039745036512613297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9872517441399395e-05, + "grad_norm": 27.179359436035156, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8390169143676758, + "num_tokens": 389927107.0, + "step": 10218 + }, + { + "epoch": 1.299961836916423, + "ewc_loss": 0.03964395076036453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.982197500183247e-05, + "grad_norm": 27.361879348754883, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8621752262115479, + "num_tokens": 389966323.0, + "step": 10219 + }, + { + "epoch": 1.3000890471950133, + "ewc_loss": 0.03980565443634987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990282726183068e-05, + "grad_norm": 27.20698356628418, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8639240860939026, + "num_tokens": 390001416.0, + "step": 10220 + }, + { + "epoch": 1.3002162574736038, + "ewc_loss": 0.03963460028171539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9817300199065357e-05, + "grad_norm": 27.26078224182129, + "learning_rate": 1e-06, + "loss": 0.435, + 
"mean_token_accuracy": 0.8670597672462463, + "num_tokens": 390042326.0, + "step": 10221 + }, + { + "epoch": 1.3003434677521943, + "ewc_loss": 0.039771389216184616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.988569420063868e-05, + "grad_norm": 27.192686080932617, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8544430732727051, + "num_tokens": 390084443.0, + "step": 10222 + }, + { + "epoch": 1.3004706780307849, + "ewc_loss": 0.039742834866046906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.987141695281025e-05, + "grad_norm": 27.310165405273438, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8579229116439819, + "num_tokens": 390126309.0, + "step": 10223 + }, + { + "epoch": 1.3005978883093754, + "ewc_loss": 0.039789654314517975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9894827346433885e-05, + "grad_norm": 27.223651885986328, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8695549964904785, + "num_tokens": 390161463.0, + "step": 10224 + }, + { + "epoch": 1.300725098587966, + "ewc_loss": 0.03971279412508011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9856397557305172e-05, + "grad_norm": 27.273761749267578, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8745607733726501, + "num_tokens": 390203221.0, + "step": 10225 + }, + { + "epoch": 1.3008523088665565, + "ewc_loss": 0.039739012718200684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9869507013936527e-05, + "grad_norm": 27.074764251708984, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8804190158843994, + "num_tokens": 390243269.0, + "step": 10226 + }, + { + "epoch": 1.300979519145147, + "ewc_loss": 0.039779841899871826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9889921532012522e-05, + "grad_norm": 27.21247100830078, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8655210733413696, + "num_tokens": 390282488.0, + "step": 10227 + }, + { + "epoch": 
1.3011067294237375, + "ewc_loss": 0.03984807804226875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9924038497265428e-05, + "grad_norm": 27.319412231445312, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.860857367515564, + "num_tokens": 390320268.0, + "step": 10228 + }, + { + "epoch": 1.3012339397023278, + "ewc_loss": 0.03974676877260208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9873385099344887e-05, + "grad_norm": 27.16236114501953, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8549924492835999, + "num_tokens": 390354733.0, + "step": 10229 + }, + { + "epoch": 1.3013611499809183, + "ewc_loss": 0.039770595729351044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.988529766094871e-05, + "grad_norm": 27.25745391845703, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8638824224472046, + "num_tokens": 390388161.0, + "step": 10230 + }, + { + "epoch": 1.3014883602595089, + "ewc_loss": 0.03977912664413452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9889563191100024e-05, + "grad_norm": 27.27458953857422, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8747798204421997, + "num_tokens": 390424609.0, + "step": 10231 + }, + { + "epoch": 1.3016155705380994, + "ewc_loss": 0.039806317538022995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9903158317902125e-05, + "grad_norm": 27.307479858398438, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8814214468002319, + "num_tokens": 390460809.0, + "step": 10232 + }, + { + "epoch": 1.30174278081669, + "ewc_loss": 0.03976360708475113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9881803382304497e-05, + "grad_norm": 27.245346069335938, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8546011447906494, + "num_tokens": 390500198.0, + "step": 10233 + }, + { + "epoch": 1.3018699910952805, + "ewc_loss": 0.03965890780091286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.982945468625985e-05, + "grad_norm": 27.20005226135254, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8584607839584351, + "num_tokens": 390545588.0, + "step": 10234 + }, + { + "epoch": 1.301997201373871, + "ewc_loss": 0.039749208837747574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9874603822245263e-05, + "grad_norm": 27.296052932739258, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8548704385757446, + "num_tokens": 390581681.0, + "step": 10235 + }, + { + "epoch": 1.3021244116524615, + "ewc_loss": 0.03974994644522667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9874973077094182e-05, + "grad_norm": 27.288286209106445, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8820853233337402, + "num_tokens": 390619122.0, + "step": 10236 + }, + { + "epoch": 1.302251621931052, + "ewc_loss": 0.03973047062754631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9865236026817e-05, + "grad_norm": 27.183439254760742, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8519042730331421, + "num_tokens": 390667974.0, + "step": 10237 + }, + { + "epoch": 1.3023788322096426, + "ewc_loss": 0.03968632221221924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984316077141557e-05, + "grad_norm": 27.26814079284668, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8669905066490173, + "num_tokens": 390705188.0, + "step": 10238 + }, + { + "epoch": 1.302506042488233, + "ewc_loss": 0.03980300948023796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9901504856534302e-05, + "grad_norm": 27.350435256958008, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8746899962425232, + "num_tokens": 390746892.0, + "step": 10239 + }, + { + "epoch": 1.3026332527668236, + "ewc_loss": 0.03968597576022148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9842987967422232e-05, + "grad_norm": 27.221027374267578, + "learning_rate": 1e-06, + "loss": 0.3878, + 
"mean_token_accuracy": 0.8799120783805847, + "num_tokens": 390787727.0, + "step": 10240 + }, + { + "epoch": 1.3027604630454142, + "ewc_loss": 0.03963334485888481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9816672647721134e-05, + "grad_norm": 27.269027709960938, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.857679009437561, + "num_tokens": 390823702.0, + "step": 10241 + }, + { + "epoch": 1.3028876733240047, + "ewc_loss": 0.039688851684331894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9844424969051033e-05, + "grad_norm": 27.206850051879883, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8629475235939026, + "num_tokens": 390860256.0, + "step": 10242 + }, + { + "epoch": 1.3030148836025952, + "ewc_loss": 0.03967254236340523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9836270439554937e-05, + "grad_norm": 27.220705032348633, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.877683699131012, + "num_tokens": 390901676.0, + "step": 10243 + }, + { + "epoch": 1.3031420938811857, + "ewc_loss": 0.03971083462238312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.985541712201666e-05, + "grad_norm": 27.30980110168457, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8472657203674316, + "num_tokens": 390936911.0, + "step": 10244 + }, + { + "epoch": 1.303269304159776, + "ewc_loss": 0.03969869390130043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984934715437703e-05, + "grad_norm": 27.2652530670166, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8526792526245117, + "num_tokens": 390976185.0, + "step": 10245 + }, + { + "epoch": 1.3033965144383666, + "ewc_loss": 0.0396893210709095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9844661437673494e-05, + "grad_norm": 27.275432586669922, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8651502132415771, + "num_tokens": 391014661.0, + "step": 10246 + }, + { + "epoch": 
1.303523724716957, + "ewc_loss": 0.0397261381149292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9863069610437378e-05, + "grad_norm": 27.319976806640625, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8572319149971008, + "num_tokens": 391056205.0, + "step": 10247 + }, + { + "epoch": 1.3036509349955476, + "ewc_loss": 0.03976266086101532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9881330445059575e-05, + "grad_norm": 27.33363151550293, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8656160831451416, + "num_tokens": 391092187.0, + "step": 10248 + }, + { + "epoch": 1.3037781452741382, + "ewc_loss": 0.03965534642338753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.982767389563378e-05, + "grad_norm": 27.314044952392578, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8789779543876648, + "num_tokens": 391130890.0, + "step": 10249 + }, + { + "epoch": 1.3039053555527287, + "ewc_loss": 0.039722997695207596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9861498003592715e-05, + "grad_norm": 27.23653793334961, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8740972280502319, + "num_tokens": 391172783.0, + "step": 10250 + }, + { + "epoch": 1.3040325658313192, + "ewc_loss": 0.0396118238568306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9805911506409757e-05, + "grad_norm": 27.397472381591797, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8478507995605469, + "num_tokens": 391215031.0, + "step": 10251 + }, + { + "epoch": 1.3041597761099097, + "ewc_loss": 0.03973669931292534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9868350136675872e-05, + "grad_norm": 27.305130004882812, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8679606318473816, + "num_tokens": 391251400.0, + "step": 10252 + }, + { + "epoch": 1.3042869863885003, + "ewc_loss": 0.03965100646018982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.982550384127535e-05, + "grad_norm": 27.335922241210938, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8560773134231567, + "num_tokens": 391285895.0, + "step": 10253 + }, + { + "epoch": 1.3044141966670906, + "ewc_loss": 0.039695583283901215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9847791918436997e-05, + "grad_norm": 27.305294036865234, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.866576075553894, + "num_tokens": 391328361.0, + "step": 10254 + }, + { + "epoch": 1.304541406945681, + "ewc_loss": 0.03965222090482712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.982610956474673e-05, + "grad_norm": 27.30959129333496, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8633956909179688, + "num_tokens": 391365019.0, + "step": 10255 + }, + { + "epoch": 1.3046686172242716, + "ewc_loss": 0.03969496488571167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9847482690238394e-05, + "grad_norm": 27.341936111450195, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8492089509963989, + "num_tokens": 391400407.0, + "step": 10256 + }, + { + "epoch": 1.3047958275028622, + "ewc_loss": 0.03966502100229263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9832510588457808e-05, + "grad_norm": 27.252805709838867, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8465572595596313, + "num_tokens": 391440660.0, + "step": 10257 + }, + { + "epoch": 1.3049230377814527, + "ewc_loss": 0.0396052785217762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9802639144472778e-05, + "grad_norm": 27.269107818603516, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8676918745040894, + "num_tokens": 391481096.0, + "step": 10258 + }, + { + "epoch": 1.3050502480600432, + "ewc_loss": 0.039607807993888855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9803903342108242e-05, + "grad_norm": 27.300312042236328, + "learning_rate": 1e-06, + "loss": 0.4359, + 
"mean_token_accuracy": 0.8652745485305786, + "num_tokens": 391516356.0, + "step": 10259 + }, + { + "epoch": 1.3051774583386337, + "ewc_loss": 0.03975260257720947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.987630093935877e-05, + "grad_norm": 27.34919548034668, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8528720736503601, + "num_tokens": 391554691.0, + "step": 10260 + }, + { + "epoch": 1.3053046686172243, + "ewc_loss": 0.03967767953872681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9838838852592744e-05, + "grad_norm": 27.243160247802734, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8504103422164917, + "num_tokens": 391592473.0, + "step": 10261 + }, + { + "epoch": 1.3054318788958148, + "ewc_loss": 0.039692506194114685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9846253053401597e-05, + "grad_norm": 27.410249710083008, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8702459335327148, + "num_tokens": 391632091.0, + "step": 10262 + }, + { + "epoch": 1.3055590891744053, + "ewc_loss": 0.03975929319858551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.987964606087189e-05, + "grad_norm": 27.22289276123047, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8579137921333313, + "num_tokens": 391676121.0, + "step": 10263 + }, + { + "epoch": 1.3056862994529959, + "ewc_loss": 0.03971192613244057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9855962818837725e-05, + "grad_norm": 27.433855056762695, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8620738983154297, + "num_tokens": 391709664.0, + "step": 10264 + }, + { + "epoch": 1.3058135097315864, + "ewc_loss": 0.039741676300764084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9870838514179923e-05, + "grad_norm": 27.1768856048584, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8534549474716187, + "num_tokens": 391755828.0, + "step": 10265 + }, + { + "epoch": 
1.305940720010177, + "ewc_loss": 0.039650265127420425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9825132767437026e-05, + "grad_norm": 27.414154052734375, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8809830546379089, + "num_tokens": 391800232.0, + "step": 10266 + }, + { + "epoch": 1.3060679302887674, + "ewc_loss": 0.03977920114994049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9889601389877498e-05, + "grad_norm": 27.33513641357422, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8658405542373657, + "num_tokens": 391837612.0, + "step": 10267 + }, + { + "epoch": 1.306195140567358, + "ewc_loss": 0.03958834335207939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9794171748799272e-05, + "grad_norm": 27.244510650634766, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8746659755706787, + "num_tokens": 391871193.0, + "step": 10268 + }, + { + "epoch": 1.3063223508459483, + "ewc_loss": 0.039716485887765884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.985824383154977e-05, + "grad_norm": 27.313753128051758, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8563265800476074, + "num_tokens": 391908203.0, + "step": 10269 + }, + { + "epoch": 1.3064495611245388, + "ewc_loss": 0.03975438326597214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.987719224416651e-05, + "grad_norm": 27.40465545654297, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8697784543037415, + "num_tokens": 391938744.0, + "step": 10270 + }, + { + "epoch": 1.3065767714031293, + "ewc_loss": 0.03967732936143875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9838664229610004e-05, + "grad_norm": 27.37286949157715, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.86748206615448, + "num_tokens": 391976020.0, + "step": 10271 + }, + { + "epoch": 1.3067039816817199, + "ewc_loss": 0.039722152054309845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9861075998051092e-05, + "grad_norm": 27.397489547729492, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8554939031600952, + "num_tokens": 392018006.0, + "step": 10272 + }, + { + "epoch": 1.3068311919603104, + "ewc_loss": 0.0396721176803112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9836059436784126e-05, + "grad_norm": 27.35582733154297, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8629629015922546, + "num_tokens": 392055358.0, + "step": 10273 + }, + { + "epoch": 1.306958402238901, + "ewc_loss": 0.039689596742391586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984479786187876e-05, + "grad_norm": 27.331632614135742, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.862371563911438, + "num_tokens": 392097727.0, + "step": 10274 + }, + { + "epoch": 1.3070856125174914, + "ewc_loss": 0.039639588445425034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.981979403353762e-05, + "grad_norm": 27.35416030883789, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8701540231704712, + "num_tokens": 392144641.0, + "step": 10275 + }, + { + "epoch": 1.307212822796082, + "ewc_loss": 0.03961421176791191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9807106582447886e-05, + "grad_norm": 27.33770179748535, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8726582527160645, + "num_tokens": 392178810.0, + "step": 10276 + }, + { + "epoch": 1.3073400330746725, + "ewc_loss": 0.03964420035481453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9822100512101315e-05, + "grad_norm": 27.387981414794922, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8605344295501709, + "num_tokens": 392213571.0, + "step": 10277 + }, + { + "epoch": 1.3074672433532628, + "ewc_loss": 0.03963874652981758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.98193738469854e-05, + "grad_norm": 27.20551872253418, + "learning_rate": 1e-06, + "loss": 0.4599, + 
"mean_token_accuracy": 0.8600480556488037, + "num_tokens": 392254628.0, + "step": 10278 + }, + { + "epoch": 1.3075944536318533, + "ewc_loss": 0.039681192487478256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984059599635657e-05, + "grad_norm": 27.30031394958496, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8635108470916748, + "num_tokens": 392297243.0, + "step": 10279 + }, + { + "epoch": 1.3077216639104439, + "ewc_loss": 0.03969217836856842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9846089344355278e-05, + "grad_norm": 27.341480255126953, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8680260181427002, + "num_tokens": 392329116.0, + "step": 10280 + }, + { + "epoch": 1.3078488741890344, + "ewc_loss": 0.03969673067331314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9848364900099114e-05, + "grad_norm": 27.367633819580078, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8614937663078308, + "num_tokens": 392370542.0, + "step": 10281 + }, + { + "epoch": 1.307976084467625, + "ewc_loss": 0.039752233773469925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9876117221429013e-05, + "grad_norm": 27.27912712097168, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.869049072265625, + "num_tokens": 392411155.0, + "step": 10282 + }, + { + "epoch": 1.3081032947462155, + "ewc_loss": 0.03957325220108032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9786626580753364e-05, + "grad_norm": 27.323915481567383, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8639755249023438, + "num_tokens": 392450440.0, + "step": 10283 + }, + { + "epoch": 1.308230505024806, + "ewc_loss": 0.039730533957481384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.986526694963686e-05, + "grad_norm": 27.34566879272461, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8745160698890686, + "num_tokens": 392488545.0, + "step": 10284 + }, + { + "epoch": 
1.3083577153033965, + "ewc_loss": 0.039644043892621994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9822022295556962e-05, + "grad_norm": 27.210735321044922, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8640874624252319, + "num_tokens": 392527138.0, + "step": 10285 + }, + { + "epoch": 1.308484925581987, + "ewc_loss": 0.03965196758508682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9825984054477885e-05, + "grad_norm": 27.279422760009766, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8650553226470947, + "num_tokens": 392568181.0, + "step": 10286 + }, + { + "epoch": 1.3086121358605776, + "ewc_loss": 0.039742857217788696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.987142786674667e-05, + "grad_norm": 27.22821617126465, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8789771795272827, + "num_tokens": 392610900.0, + "step": 10287 + }, + { + "epoch": 1.308739346139168, + "ewc_loss": 0.03971933200955391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9859666281263344e-05, + "grad_norm": 27.293102264404297, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8558146357536316, + "num_tokens": 392646624.0, + "step": 10288 + }, + { + "epoch": 1.3088665564177586, + "ewc_loss": 0.039806075394153595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990303826460149e-05, + "grad_norm": 27.3015193939209, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8644307851791382, + "num_tokens": 392683103.0, + "step": 10289 + }, + { + "epoch": 1.3089937666963491, + "ewc_loss": 0.03974345698952675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9871727999998257e-05, + "grad_norm": 27.320499420166016, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8656837940216064, + "num_tokens": 392723169.0, + "step": 10290 + }, + { + "epoch": 1.3091209769749397, + "ewc_loss": 0.03976670280098915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9883351342286915e-05, + "grad_norm": 27.164640426635742, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.879326343536377, + "num_tokens": 392762323.0, + "step": 10291 + }, + { + "epoch": 1.3092481872535302, + "ewc_loss": 0.039736032485961914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.986801544262562e-05, + "grad_norm": 27.308456420898438, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8804486989974976, + "num_tokens": 392797555.0, + "step": 10292 + }, + { + "epoch": 1.3093753975321207, + "ewc_loss": 0.03978569805622101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9892848285962828e-05, + "grad_norm": 27.192148208618164, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8760087490081787, + "num_tokens": 392840001.0, + "step": 10293 + }, + { + "epoch": 1.309502607810711, + "ewc_loss": 0.03976072743535042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9880364561686292e-05, + "grad_norm": 27.28484535217285, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8640957474708557, + "num_tokens": 392880871.0, + "step": 10294 + }, + { + "epoch": 1.3096298180893016, + "ewc_loss": 0.039756279438734055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9878139937645756e-05, + "grad_norm": 27.18575668334961, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8743512630462646, + "num_tokens": 392917554.0, + "step": 10295 + }, + { + "epoch": 1.309757028367892, + "ewc_loss": 0.039726704359054565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9863351553794928e-05, + "grad_norm": 27.305692672729492, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8828755617141724, + "num_tokens": 392958601.0, + "step": 10296 + }, + { + "epoch": 1.3098842386464826, + "ewc_loss": 0.03975018113851547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9875091311405413e-05, + "grad_norm": 27.16741943359375, + "learning_rate": 1e-06, + "loss": 0.4197, + 
"mean_token_accuracy": 0.8663759231567383, + "num_tokens": 393000108.0, + "step": 10297 + }, + { + "epoch": 1.3100114489250732, + "ewc_loss": 0.03976696357131004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.988348230952397e-05, + "grad_norm": 27.287490844726562, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8859846591949463, + "num_tokens": 393032321.0, + "step": 10298 + }, + { + "epoch": 1.3101386592036637, + "ewc_loss": 0.039775997400283813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9887998860212974e-05, + "grad_norm": 27.30388641357422, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8632912635803223, + "num_tokens": 393068347.0, + "step": 10299 + }, + { + "epoch": 1.3102658694822542, + "ewc_loss": 0.03971712291240692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.985856215469539e-05, + "grad_norm": 27.23360252380371, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8820542097091675, + "num_tokens": 393102509.0, + "step": 10300 + }, + { + "epoch": 1.3103930797608447, + "ewc_loss": 0.03975484147667885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9877421436831355e-05, + "grad_norm": 27.27933692932129, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.873385488986969, + "num_tokens": 393140974.0, + "step": 10301 + }, + { + "epoch": 1.3105202900394353, + "ewc_loss": 0.03984257951378822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.992129000427667e-05, + "grad_norm": 27.314664840698242, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8747932314872742, + "num_tokens": 393177214.0, + "step": 10302 + }, + { + "epoch": 1.3106475003180256, + "ewc_loss": 0.0397818386554718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.989092015719507e-05, + "grad_norm": 27.283870697021484, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8675057888031006, + "num_tokens": 393215801.0, + "step": 10303 + }, + { + "epoch": 
1.310774710596616, + "ewc_loss": 0.03983357176184654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991678618651349e-05, + "grad_norm": 27.291364669799805, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8762817978858948, + "num_tokens": 393256444.0, + "step": 10304 + }, + { + "epoch": 1.3109019208752066, + "ewc_loss": 0.039860837161540985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9930419512093067e-05, + "grad_norm": 27.207345962524414, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8611979484558105, + "num_tokens": 393300238.0, + "step": 10305 + }, + { + "epoch": 1.3110291311537972, + "ewc_loss": 0.03989848494529724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.994924241444096e-05, + "grad_norm": 27.296649932861328, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8507194519042969, + "num_tokens": 393344087.0, + "step": 10306 + }, + { + "epoch": 1.3111563414323877, + "ewc_loss": 0.03989484906196594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9947425244026817e-05, + "grad_norm": 27.313026428222656, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8701796531677246, + "num_tokens": 393381678.0, + "step": 10307 + }, + { + "epoch": 1.3112835517109782, + "ewc_loss": 0.03978639468550682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.989319753192831e-05, + "grad_norm": 27.290210723876953, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8639845848083496, + "num_tokens": 393415740.0, + "step": 10308 + }, + { + "epoch": 1.3114107619895687, + "ewc_loss": 0.03986358642578125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9931792849092744e-05, + "grad_norm": 27.326095581054688, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8414192199707031, + "num_tokens": 393457692.0, + "step": 10309 + }, + { + "epoch": 1.3115379722681593, + "ewc_loss": 0.039795391261577606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9897695892723277e-05, + "grad_norm": 27.25298500061035, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8599547147750854, + "num_tokens": 393499003.0, + "step": 10310 + }, + { + "epoch": 1.3116651825467498, + "ewc_loss": 0.03984212502837181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9921062630601227e-05, + "grad_norm": 27.430206298828125, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8538693189620972, + "num_tokens": 393533842.0, + "step": 10311 + }, + { + "epoch": 1.3117923928253403, + "ewc_loss": 0.03982052206993103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9910261471522972e-05, + "grad_norm": 27.33698272705078, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8732243776321411, + "num_tokens": 393573906.0, + "step": 10312 + }, + { + "epoch": 1.3119196031039309, + "ewc_loss": 0.03969110548496246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.984555274248123e-05, + "grad_norm": 27.135467529296875, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8633818030357361, + "num_tokens": 393615298.0, + "step": 10313 + }, + { + "epoch": 1.3120468133825214, + "ewc_loss": 0.039790742099285126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9895371224265546e-05, + "grad_norm": 27.319868087768555, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8538157939910889, + "num_tokens": 393654690.0, + "step": 10314 + }, + { + "epoch": 1.312174023661112, + "ewc_loss": 0.03982238844037056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991119461308699e-05, + "grad_norm": 27.313642501831055, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8668277859687805, + "num_tokens": 393696222.0, + "step": 10315 + }, + { + "epoch": 1.3123012339397024, + "ewc_loss": 0.039768416434526443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9884208086295985e-05, + "grad_norm": 27.299118041992188, + "learning_rate": 1e-06, + "loss": 0.4573, + 
"mean_token_accuracy": 0.8614389300346375, + "num_tokens": 393736748.0, + "step": 10316 + }, + { + "epoch": 1.312428444218293, + "ewc_loss": 0.03981199488043785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990599776036106e-05, + "grad_norm": 27.278356552124023, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8564389944076538, + "num_tokens": 393773077.0, + "step": 10317 + }, + { + "epoch": 1.3125556544968833, + "ewc_loss": 0.03979194536805153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.989597330975812e-05, + "grad_norm": 27.30318832397461, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8757131099700928, + "num_tokens": 393806466.0, + "step": 10318 + }, + { + "epoch": 1.3126828647754738, + "ewc_loss": 0.03986012935638428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9930064809159376e-05, + "grad_norm": 27.265357971191406, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8608748912811279, + "num_tokens": 393840922.0, + "step": 10319 + }, + { + "epoch": 1.3128100750540643, + "ewc_loss": 0.03978259488940239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9891296688001603e-05, + "grad_norm": 27.405548095703125, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8684556484222412, + "num_tokens": 393874544.0, + "step": 10320 + }, + { + "epoch": 1.3129372853326549, + "ewc_loss": 0.039873115718364716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9936558601330034e-05, + "grad_norm": 27.36812400817871, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8612790107727051, + "num_tokens": 393917773.0, + "step": 10321 + }, + { + "epoch": 1.3130644956112454, + "ewc_loss": 0.03975871950387955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9879360479535535e-05, + "grad_norm": 27.35645866394043, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8606547117233276, + "num_tokens": 393951525.0, + "step": 10322 + }, + { + "epoch": 
1.313191705889836, + "ewc_loss": 0.03980636224150658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990318196476437e-05, + "grad_norm": 27.362018585205078, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8698542714118958, + "num_tokens": 393989796.0, + "step": 10323 + }, + { + "epoch": 1.3133189161684264, + "ewc_loss": 0.03981078043580055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9905390217900276e-05, + "grad_norm": 27.317222595214844, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8622693419456482, + "num_tokens": 394028746.0, + "step": 10324 + }, + { + "epoch": 1.313446126447017, + "ewc_loss": 0.03973450884222984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9867255105054937e-05, + "grad_norm": 27.29453468322754, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8738568425178528, + "num_tokens": 394065241.0, + "step": 10325 + }, + { + "epoch": 1.3135733367256075, + "ewc_loss": 0.03986849635839462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.993424848478753e-05, + "grad_norm": 27.32701301574707, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.860137939453125, + "num_tokens": 394110113.0, + "step": 10326 + }, + { + "epoch": 1.3137005470041978, + "ewc_loss": 0.03981519490480423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9907596652046777e-05, + "grad_norm": 27.311691284179688, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8489341735839844, + "num_tokens": 394146577.0, + "step": 10327 + }, + { + "epoch": 1.3138277572827883, + "ewc_loss": 0.03989125415682793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.994562626350671e-05, + "grad_norm": 27.44053840637207, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8646149635314941, + "num_tokens": 394185392.0, + "step": 10328 + }, + { + "epoch": 1.3139549675613789, + "ewc_loss": 0.039853282272815704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9926641471101902e-05, + "grad_norm": 27.27485466003418, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8556243181228638, + "num_tokens": 394226301.0, + "step": 10329 + }, + { + "epoch": 1.3140821778399694, + "ewc_loss": 0.03973659873008728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9868299204972573e-05, + "grad_norm": 27.320112228393555, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8562110662460327, + "num_tokens": 394262654.0, + "step": 10330 + }, + { + "epoch": 1.31420938811856, + "ewc_loss": 0.03983962535858154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991981298488099e-05, + "grad_norm": 27.29705238342285, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8763140439987183, + "num_tokens": 394299185.0, + "step": 10331 + }, + { + "epoch": 1.3143365983971504, + "ewc_loss": 0.03981888294219971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9909441107301973e-05, + "grad_norm": 27.270984649658203, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8666771054267883, + "num_tokens": 394331364.0, + "step": 10332 + }, + { + "epoch": 1.314463808675741, + "ewc_loss": 0.0398692861199379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.99346432054881e-05, + "grad_norm": 27.283411026000977, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8530969023704529, + "num_tokens": 394375479.0, + "step": 10333 + }, + { + "epoch": 1.3145910189543315, + "ewc_loss": 0.0398191437125206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9909572074539028e-05, + "grad_norm": 27.331735610961914, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8707209825515747, + "num_tokens": 394414913.0, + "step": 10334 + }, + { + "epoch": 1.314718229232922, + "ewc_loss": 0.03986329585313797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.993164732994046e-05, + "grad_norm": 27.35770034790039, + "learning_rate": 1e-06, + "loss": 0.4418, + 
"mean_token_accuracy": 0.8654748797416687, + "num_tokens": 394450889.0, + "step": 10335 + }, + { + "epoch": 1.3148454395115126, + "ewc_loss": 0.03984186798334122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9920933482353576e-05, + "grad_norm": 27.338611602783203, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8480970859527588, + "num_tokens": 394495280.0, + "step": 10336 + }, + { + "epoch": 1.314972649790103, + "ewc_loss": 0.03986633941531181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.993316982407123e-05, + "grad_norm": 27.39830207824707, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8734256625175476, + "num_tokens": 394536811.0, + "step": 10337 + }, + { + "epoch": 1.3150998600686936, + "ewc_loss": 0.039855994284152985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9927996618207544e-05, + "grad_norm": 27.300832748413086, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8627619743347168, + "num_tokens": 394573353.0, + "step": 10338 + }, + { + "epoch": 1.3152270703472841, + "ewc_loss": 0.039796940982341766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9898470782209188e-05, + "grad_norm": 27.327011108398438, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8570859432220459, + "num_tokens": 394603517.0, + "step": 10339 + }, + { + "epoch": 1.3153542806258747, + "ewc_loss": 0.03980180248618126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9900900952052325e-05, + "grad_norm": 27.289318084716797, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.871090292930603, + "num_tokens": 394650547.0, + "step": 10340 + }, + { + "epoch": 1.3154814909044652, + "ewc_loss": 0.039823226630687714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9911612980649807e-05, + "grad_norm": 27.293176651000977, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8682974576950073, + "num_tokens": 394684460.0, + "step": 10341 + }, + { + "epoch": 
1.3156087011830557, + "ewc_loss": 0.03985186293721199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.992593206523452e-05, + "grad_norm": 27.2983455657959, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8808541893959045, + "num_tokens": 394733520.0, + "step": 10342 + }, + { + "epoch": 1.315735911461646, + "ewc_loss": 0.039791978895664215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.989598968066275e-05, + "grad_norm": 27.28134536743164, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8588659763336182, + "num_tokens": 394775498.0, + "step": 10343 + }, + { + "epoch": 1.3158631217402366, + "ewc_loss": 0.039889685809612274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9944842279073782e-05, + "grad_norm": 27.32419776916504, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.852802038192749, + "num_tokens": 394815138.0, + "step": 10344 + }, + { + "epoch": 1.315990332018827, + "ewc_loss": 0.03986935317516327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9934675947297364e-05, + "grad_norm": 27.378799438476562, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8566664457321167, + "num_tokens": 394850447.0, + "step": 10345 + }, + { + "epoch": 1.3161175422974176, + "ewc_loss": 0.03981218859553337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990609416679945e-05, + "grad_norm": 27.208415985107422, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8586114645004272, + "num_tokens": 394891800.0, + "step": 10346 + }, + { + "epoch": 1.3162447525760081, + "ewc_loss": 0.03980465605854988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990232885873411e-05, + "grad_norm": 27.34135627746582, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8714654445648193, + "num_tokens": 394930087.0, + "step": 10347 + }, + { + "epoch": 1.3163719628545987, + "ewc_loss": 0.03989977762103081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.994988815567922e-05, + "grad_norm": 27.307226181030273, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8745160102844238, + "num_tokens": 394964497.0, + "step": 10348 + }, + { + "epoch": 1.3164991731331892, + "ewc_loss": 0.039891429245471954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9945715393987484e-05, + "grad_norm": 27.433469772338867, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8468120694160461, + "num_tokens": 395003127.0, + "step": 10349 + }, + { + "epoch": 1.3166263834117797, + "ewc_loss": 0.039807096123695374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9903547581634484e-05, + "grad_norm": 27.213899612426758, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8540388941764832, + "num_tokens": 395040570.0, + "step": 10350 + }, + { + "epoch": 1.3167535936903703, + "ewc_loss": 0.03983968868851662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991984390770085e-05, + "grad_norm": 27.460004806518555, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.865085780620575, + "num_tokens": 395075205.0, + "step": 10351 + }, + { + "epoch": 1.3168808039689606, + "ewc_loss": 0.03990715742111206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9953578885179013e-05, + "grad_norm": 27.25033187866211, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8639532923698425, + "num_tokens": 395117492.0, + "step": 10352 + }, + { + "epoch": 1.317008014247551, + "ewc_loss": 0.039813317358493805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9906658053514548e-05, + "grad_norm": 27.349502563476562, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8619775772094727, + "num_tokens": 395162449.0, + "step": 10353 + }, + { + "epoch": 1.3171352245261416, + "ewc_loss": 0.039899539202451706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9949769921367988e-05, + "grad_norm": 27.350353240966797, + "learning_rate": 1e-06, + "loss": 0.4164, + 
"mean_token_accuracy": 0.8729989528656006, + "num_tokens": 395197310.0, + "step": 10354 + }, + { + "epoch": 1.3172624348047322, + "ewc_loss": 0.039894700050354004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9947350665461272e-05, + "grad_norm": 27.361740112304688, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8753707408905029, + "num_tokens": 395232420.0, + "step": 10355 + }, + { + "epoch": 1.3173896450833227, + "ewc_loss": 0.03989020362496376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.994510239455849e-05, + "grad_norm": 27.290952682495117, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.843590259552002, + "num_tokens": 395269392.0, + "step": 10356 + }, + { + "epoch": 1.3175168553619132, + "ewc_loss": 0.03983616828918457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9918084944947623e-05, + "grad_norm": 27.358495712280273, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8532556295394897, + "num_tokens": 395305496.0, + "step": 10357 + }, + { + "epoch": 1.3176440656405037, + "ewc_loss": 0.0398617647588253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9930881535401568e-05, + "grad_norm": 27.259937286376953, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8735369443893433, + "num_tokens": 395347089.0, + "step": 10358 + }, + { + "epoch": 1.3177712759190943, + "ewc_loss": 0.039916835725307465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9958417396992445e-05, + "grad_norm": 27.26862144470215, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8745700120925903, + "num_tokens": 395382885.0, + "step": 10359 + }, + { + "epoch": 1.3178984861976848, + "ewc_loss": 0.03992582485079765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9962912119808607e-05, + "grad_norm": 27.313520431518555, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.877662181854248, + "num_tokens": 395416260.0, + "step": 10360 + }, + { + "epoch": 
1.3180256964762753, + "ewc_loss": 0.03991446644067764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9957233234890737e-05, + "grad_norm": 27.28032875061035, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8552517890930176, + "num_tokens": 395454179.0, + "step": 10361 + }, + { + "epoch": 1.3181529067548658, + "ewc_loss": 0.039929017424583435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.996450919250492e-05, + "grad_norm": 27.330291748046875, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8553664684295654, + "num_tokens": 395489173.0, + "step": 10362 + }, + { + "epoch": 1.3182801170334564, + "ewc_loss": 0.03997578099370003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.99878904822981e-05, + "grad_norm": 27.39618492126465, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8585269451141357, + "num_tokens": 395532057.0, + "step": 10363 + }, + { + "epoch": 1.318407327312047, + "ewc_loss": 0.03992873802781105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9964369130320847e-05, + "grad_norm": 27.27266502380371, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8710045218467712, + "num_tokens": 395570321.0, + "step": 10364 + }, + { + "epoch": 1.3185345375906374, + "ewc_loss": 0.0398918054997921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.994590274989605e-05, + "grad_norm": 27.290761947631836, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8849951028823853, + "num_tokens": 395609041.0, + "step": 10365 + }, + { + "epoch": 1.318661747869228, + "ewc_loss": 0.03994203358888626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9971017536590807e-05, + "grad_norm": 27.292766571044922, + "learning_rate": 1e-06, + "loss": 0.5044, + "mean_token_accuracy": 0.8402248024940491, + "num_tokens": 395649961.0, + "step": 10366 + }, + { + "epoch": 1.3187889581478183, + "ewc_loss": 0.03997806832194328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.998903462663293e-05, + "grad_norm": 27.4193172454834, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8658838272094727, + "num_tokens": 395691882.0, + "step": 10367 + }, + { + "epoch": 1.3189161684264088, + "ewc_loss": 0.0399271622300148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.996358150790911e-05, + "grad_norm": 27.38089942932129, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8520539999008179, + "num_tokens": 395731496.0, + "step": 10368 + }, + { + "epoch": 1.3190433787049993, + "ewc_loss": 0.03985132277011871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.992566103581339e-05, + "grad_norm": 27.29434585571289, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8691452145576477, + "num_tokens": 395772618.0, + "step": 10369 + }, + { + "epoch": 1.3191705889835899, + "ewc_loss": 0.039941053837537766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.997052640945185e-05, + "grad_norm": 27.362266540527344, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8474195003509521, + "num_tokens": 395811907.0, + "step": 10370 + }, + { + "epoch": 1.3192977992621804, + "ewc_loss": 0.03989269956946373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9946350221289322e-05, + "grad_norm": 27.43381118774414, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8588607311248779, + "num_tokens": 395851186.0, + "step": 10371 + }, + { + "epoch": 1.319425009540771, + "ewc_loss": 0.039935823529958725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9967912521678954e-05, + "grad_norm": 27.329891204833984, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8650382161140442, + "num_tokens": 395893868.0, + "step": 10372 + }, + { + "epoch": 1.3195522198193614, + "ewc_loss": 0.03986163064837456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.993081605178304e-05, + "grad_norm": 27.480419158935547, + "learning_rate": 1e-06, + "loss": 0.4086, + 
"mean_token_accuracy": 0.8740894794464111, + "num_tokens": 395930519.0, + "step": 10373 + }, + { + "epoch": 1.319679430097952, + "ewc_loss": 0.03990171477198601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.995085767703131e-05, + "grad_norm": 27.318180084228516, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.87987220287323, + "num_tokens": 395965692.0, + "step": 10374 + }, + { + "epoch": 1.3198066403765425, + "ewc_loss": 0.03981482610106468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990741293411702e-05, + "grad_norm": 27.447725296020508, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.865780770778656, + "num_tokens": 396002316.0, + "step": 10375 + }, + { + "epoch": 1.3199338506551328, + "ewc_loss": 0.03984397277235985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.992198667721823e-05, + "grad_norm": 27.295963287353516, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8709425926208496, + "num_tokens": 396043392.0, + "step": 10376 + }, + { + "epoch": 1.3200610609337233, + "ewc_loss": 0.039777807891368866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.988890471693594e-05, + "grad_norm": 27.36663818359375, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8623958826065063, + "num_tokens": 396078603.0, + "step": 10377 + }, + { + "epoch": 1.3201882712123139, + "ewc_loss": 0.03990289941430092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.995144884858746e-05, + "grad_norm": 27.35061264038086, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8554214239120483, + "num_tokens": 396121132.0, + "step": 10378 + }, + { + "epoch": 1.3203154814909044, + "ewc_loss": 0.03985575959086418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9927880202885717e-05, + "grad_norm": 27.404109954833984, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8743380308151245, + "num_tokens": 396155730.0, + "step": 10379 + }, + { + "epoch": 
1.320442691769495, + "ewc_loss": 0.03983237221837044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991618592001032e-05, + "grad_norm": 27.432735443115234, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8500217199325562, + "num_tokens": 396190976.0, + "step": 10380 + }, + { + "epoch": 1.3205699020480854, + "ewc_loss": 0.03979817405343056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.989908741961699e-05, + "grad_norm": 27.33893394470215, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8646641969680786, + "num_tokens": 396229571.0, + "step": 10381 + }, + { + "epoch": 1.320697112326676, + "ewc_loss": 0.039846546947956085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9923272702726535e-05, + "grad_norm": 27.434146881103516, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8639716506004333, + "num_tokens": 396265586.0, + "step": 10382 + }, + { + "epoch": 1.3208243226052665, + "ewc_loss": 0.03981327265501022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9906636225641705e-05, + "grad_norm": 27.317243576049805, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8690321445465088, + "num_tokens": 396303810.0, + "step": 10383 + }, + { + "epoch": 1.320951532883857, + "ewc_loss": 0.03989321365952492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9946606698795222e-05, + "grad_norm": 27.35592269897461, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8842718601226807, + "num_tokens": 396345671.0, + "step": 10384 + }, + { + "epoch": 1.3210787431624476, + "ewc_loss": 0.039788730442523956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9894365323125385e-05, + "grad_norm": 27.346904754638672, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8823065757751465, + "num_tokens": 396388062.0, + "step": 10385 + }, + { + "epoch": 1.321205953441038, + "ewc_loss": 0.03985556215047836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9927781977457926e-05, + "grad_norm": 27.388120651245117, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8781115412712097, + "num_tokens": 396425189.0, + "step": 10386 + }, + { + "epoch": 1.3213331637196286, + "ewc_loss": 0.0398220531642437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9911027266061865e-05, + "grad_norm": 27.381689071655273, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8553391098976135, + "num_tokens": 396461861.0, + "step": 10387 + }, + { + "epoch": 1.3214603739982191, + "ewc_loss": 0.03986422345042229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9932111172238365e-05, + "grad_norm": 27.33928108215332, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.876275897026062, + "num_tokens": 396506190.0, + "step": 10388 + }, + { + "epoch": 1.3215875842768097, + "ewc_loss": 0.03989214450120926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.994607191591058e-05, + "grad_norm": 27.410076141357422, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8670135736465454, + "num_tokens": 396538900.0, + "step": 10389 + }, + { + "epoch": 1.3217147945554002, + "ewc_loss": 0.039840735495090485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9920367776649073e-05, + "grad_norm": 27.27912712097168, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.873671293258667, + "num_tokens": 396577029.0, + "step": 10390 + }, + { + "epoch": 1.3218420048339907, + "ewc_loss": 0.03981775417923927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9908877220586874e-05, + "grad_norm": 27.316944122314453, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.849105715751648, + "num_tokens": 396616917.0, + "step": 10391 + }, + { + "epoch": 1.321969215112581, + "ewc_loss": 0.0399368554353714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9968427295680158e-05, + "grad_norm": 27.376890182495117, + "learning_rate": 1e-06, + "loss": 0.3941, + 
"mean_token_accuracy": 0.8731586933135986, + "num_tokens": 396651726.0, + "step": 10392 + }, + { + "epoch": 1.3220964253911716, + "ewc_loss": 0.039816077798604965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990803866647184e-05, + "grad_norm": 27.354244232177734, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8532410860061646, + "num_tokens": 396693796.0, + "step": 10393 + }, + { + "epoch": 1.322223635669762, + "ewc_loss": 0.03983481600880623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9917408280889504e-05, + "grad_norm": 27.40863800048828, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8557416796684265, + "num_tokens": 396731947.0, + "step": 10394 + }, + { + "epoch": 1.3223508459483526, + "ewc_loss": 0.03983272612094879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9916362361982465e-05, + "grad_norm": 27.383602142333984, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8694764375686646, + "num_tokens": 396767062.0, + "step": 10395 + }, + { + "epoch": 1.3224780562269431, + "ewc_loss": 0.03984977677464485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9924887965316884e-05, + "grad_norm": 27.410619735717773, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.867856502532959, + "num_tokens": 396803918.0, + "step": 10396 + }, + { + "epoch": 1.3226052665055337, + "ewc_loss": 0.03976837545633316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9884188077412546e-05, + "grad_norm": 27.195714950561523, + "learning_rate": 1e-06, + "loss": 0.5248, + "mean_token_accuracy": 0.8376131057739258, + "num_tokens": 396850314.0, + "step": 10397 + }, + { + "epoch": 1.3227324767841242, + "ewc_loss": 0.03977406024932861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9887029338860884e-05, + "grad_norm": 27.291128158569336, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8585291504859924, + "num_tokens": 396887920.0, + "step": 10398 + }, + { + "epoch": 
1.3228596870627147, + "ewc_loss": 0.039924897253513336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9962448277510703e-05, + "grad_norm": 27.32891082763672, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8540832996368408, + "num_tokens": 396923369.0, + "step": 10399 + }, + { + "epoch": 1.3229868973413053, + "ewc_loss": 0.03987020254135132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9935101590817794e-05, + "grad_norm": 27.318767547607422, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8630273938179016, + "num_tokens": 396962338.0, + "step": 10400 + }, + { + "epoch": 1.3231141076198956, + "ewc_loss": 0.03990430012345314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.995215097849723e-05, + "grad_norm": 27.26927947998047, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8751630187034607, + "num_tokens": 396998275.0, + "step": 10401 + }, + { + "epoch": 1.323241317898486, + "ewc_loss": 0.039919428527355194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9959714336437173e-05, + "grad_norm": 27.316377639770508, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8622152805328369, + "num_tokens": 397040034.0, + "step": 10402 + }, + { + "epoch": 1.3233685281770766, + "ewc_loss": 0.03993385657668114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9966928448411636e-05, + "grad_norm": 27.284000396728516, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8836580514907837, + "num_tokens": 397080549.0, + "step": 10403 + }, + { + "epoch": 1.3234957384556671, + "ewc_loss": 0.03987862542271614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9939312551287003e-05, + "grad_norm": 27.415454864501953, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8776692152023315, + "num_tokens": 397120803.0, + "step": 10404 + }, + { + "epoch": 1.3236229487342577, + "ewc_loss": 0.03993121162056923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9965606043115258e-05, + "grad_norm": 27.36640167236328, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8698706030845642, + "num_tokens": 397160463.0, + "step": 10405 + }, + { + "epoch": 1.3237501590128482, + "ewc_loss": 0.03983675315976143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991837598325219e-05, + "grad_norm": 27.340585708618164, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8587101697921753, + "num_tokens": 397205425.0, + "step": 10406 + }, + { + "epoch": 1.3238773692914387, + "ewc_loss": 0.039877764880657196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9938883269787766e-05, + "grad_norm": 27.343780517578125, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8598690629005432, + "num_tokens": 397238907.0, + "step": 10407 + }, + { + "epoch": 1.3240045795700293, + "ewc_loss": 0.03997058793902397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.998529478441924e-05, + "grad_norm": 27.47112274169922, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8653398752212524, + "num_tokens": 397281639.0, + "step": 10408 + }, + { + "epoch": 1.3241317898486198, + "ewc_loss": 0.039845891296863556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9922945284633897e-05, + "grad_norm": 27.28489875793457, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8535326719284058, + "num_tokens": 397323350.0, + "step": 10409 + }, + { + "epoch": 1.3242590001272103, + "ewc_loss": 0.0398699976503849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9934999727411196e-05, + "grad_norm": 27.445833206176758, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8580545783042908, + "num_tokens": 397364438.0, + "step": 10410 + }, + { + "epoch": 1.3243862104058008, + "ewc_loss": 0.03985375165939331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.992687612073496e-05, + "grad_norm": 27.32797622680664, + "learning_rate": 1e-06, + "loss": 0.4786, + 
"mean_token_accuracy": 0.8526055216789246, + "num_tokens": 397406881.0, + "step": 10411 + }, + { + "epoch": 1.3245134206843914, + "ewc_loss": 0.03975120931863785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9875604266417213e-05, + "grad_norm": 27.427824020385742, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8657525181770325, + "num_tokens": 397451190.0, + "step": 10412 + }, + { + "epoch": 1.324640630962982, + "ewc_loss": 0.03988085314631462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9940425772801973e-05, + "grad_norm": 27.33038330078125, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8664090633392334, + "num_tokens": 397491063.0, + "step": 10413 + }, + { + "epoch": 1.3247678412415724, + "ewc_loss": 0.039822932332754135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991146564250812e-05, + "grad_norm": 27.262405395507812, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.865554928779602, + "num_tokens": 397529961.0, + "step": 10414 + }, + { + "epoch": 1.324895051520163, + "ewc_loss": 0.03985549882054329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9927749235648662e-05, + "grad_norm": 27.379528045654297, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8757045269012451, + "num_tokens": 397565616.0, + "step": 10415 + }, + { + "epoch": 1.3250222617987533, + "ewc_loss": 0.039865851402282715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9932926079491153e-05, + "grad_norm": 27.306798934936523, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8719966411590576, + "num_tokens": 397602223.0, + "step": 10416 + }, + { + "epoch": 1.3251494720773438, + "ewc_loss": 0.03981814160943031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.990907003346365e-05, + "grad_norm": 27.25961685180664, + "learning_rate": 1e-06, + "loss": 0.5046, + "mean_token_accuracy": 0.8439092636108398, + "num_tokens": 397638910.0, + "step": 10417 + }, + { + "epoch": 
1.3252766823559343, + "ewc_loss": 0.03990650177001953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9953251467086375e-05, + "grad_norm": 27.31221580505371, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8574309945106506, + "num_tokens": 397681200.0, + "step": 10418 + }, + { + "epoch": 1.3254038926345248, + "ewc_loss": 0.03983885422348976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.991942735912744e-05, + "grad_norm": 27.26696014404297, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8663057684898376, + "num_tokens": 397724832.0, + "step": 10419 + }, + { + "epoch": 1.3255311029131154, + "ewc_loss": 0.039930395781993866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9965198589488864e-05, + "grad_norm": 27.36079216003418, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8575973510742188, + "num_tokens": 397765537.0, + "step": 10420 + }, + { + "epoch": 1.325658313191706, + "ewc_loss": 0.03994977846741676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9974888346041553e-05, + "grad_norm": 27.33702850341797, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8561371564865112, + "num_tokens": 397800945.0, + "step": 10421 + }, + { + "epoch": 1.3257855234702964, + "ewc_loss": 0.03989655151963234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9948276531067677e-05, + "grad_norm": 27.25889778137207, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.871091365814209, + "num_tokens": 397840786.0, + "step": 10422 + }, + { + "epoch": 1.325912733748887, + "ewc_loss": 0.0399426594376564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9971330402768217e-05, + "grad_norm": 27.326370239257812, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8687700629234314, + "num_tokens": 397875607.0, + "step": 10423 + }, + { + "epoch": 1.3260399440274775, + "ewc_loss": 0.03994901105761528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9974506358266808e-05, + "grad_norm": 27.205968856811523, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8803631067276001, + "num_tokens": 397909524.0, + "step": 10424 + }, + { + "epoch": 1.3261671543060678, + "ewc_loss": 0.03995713219046593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9978566342615522e-05, + "grad_norm": 27.463645935058594, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8486171960830688, + "num_tokens": 397947862.0, + "step": 10425 + }, + { + "epoch": 1.3262943645846583, + "ewc_loss": 0.03999463468790054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9997316485387273e-05, + "grad_norm": 27.24538803100586, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.879071831703186, + "num_tokens": 397980958.0, + "step": 10426 + }, + { + "epoch": 1.3264215748632489, + "ewc_loss": 0.03993113711476326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9965567844337784e-05, + "grad_norm": 27.357391357421875, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8627235293388367, + "num_tokens": 398021231.0, + "step": 10427 + }, + { + "epoch": 1.3265487851418394, + "ewc_loss": 0.039980411529541016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9990206055808812e-05, + "grad_norm": 27.24669075012207, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8625580668449402, + "num_tokens": 398059202.0, + "step": 10428 + }, + { + "epoch": 1.32667599542043, + "ewc_loss": 0.039954278618097305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.997713843593374e-05, + "grad_norm": 27.27487564086914, + "learning_rate": 1e-06, + "loss": 0.3635, + "mean_token_accuracy": 0.8855463266372681, + "num_tokens": 398097121.0, + "step": 10429 + }, + { + "epoch": 1.3268032056990204, + "ewc_loss": 0.040054649114608765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.002732435357757e-05, + "grad_norm": 27.41652488708496, + "learning_rate": 1e-06, + "loss": 0.4338, + 
"mean_token_accuracy": 0.8649095296859741, + "num_tokens": 398130413.0, + "step": 10430 + }, + { + "epoch": 1.326930415977611, + "ewc_loss": 0.03991086781024933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.995543425437063e-05, + "grad_norm": 27.242788314819336, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8640521764755249, + "num_tokens": 398172535.0, + "step": 10431 + }, + { + "epoch": 1.3270576262562015, + "ewc_loss": 0.03994737192988396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.997368599404581e-05, + "grad_norm": 27.305679321289062, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8663749694824219, + "num_tokens": 398206681.0, + "step": 10432 + }, + { + "epoch": 1.327184836534792, + "ewc_loss": 0.039993856102228165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9996927221654914e-05, + "grad_norm": 27.26692008972168, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8687086701393127, + "num_tokens": 398245206.0, + "step": 10433 + }, + { + "epoch": 1.3273120468133826, + "ewc_loss": 0.03996332734823227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.998166408156976e-05, + "grad_norm": 27.291240692138672, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8618825674057007, + "num_tokens": 398278902.0, + "step": 10434 + }, + { + "epoch": 1.327439257091973, + "ewc_loss": 0.04012570157647133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0062851035618223e-05, + "grad_norm": 27.42041015625, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8661908507347107, + "num_tokens": 398315833.0, + "step": 10435 + }, + { + "epoch": 1.3275664673705636, + "ewc_loss": 0.040018532425165176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0009265426779166e-05, + "grad_norm": 27.242019653320312, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.8470204472541809, + "num_tokens": 398356496.0, + "step": 10436 + }, + { + "epoch": 
1.3276936776491541, + "ewc_loss": 0.039977721869945526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.998886000365019e-05, + "grad_norm": 27.21823501586914, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8717363476753235, + "num_tokens": 398393661.0, + "step": 10437 + }, + { + "epoch": 1.3278208879277447, + "ewc_loss": 0.04006810858845711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003405461437069e-05, + "grad_norm": 27.279014587402344, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8825854063034058, + "num_tokens": 398430463.0, + "step": 10438 + }, + { + "epoch": 1.3279480982063352, + "ewc_loss": 0.040025047957897186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0012523236800916e-05, + "grad_norm": 27.264999389648438, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.863166332244873, + "num_tokens": 398470382.0, + "step": 10439 + }, + { + "epoch": 1.3280753084849257, + "ewc_loss": 0.04011406749486923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0057033907505684e-05, + "grad_norm": 27.333251953125, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8478814363479614, + "num_tokens": 398504793.0, + "step": 10440 + }, + { + "epoch": 1.328202518763516, + "ewc_loss": 0.040021490305662155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.001074426516425e-05, + "grad_norm": 27.31694793701172, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8558351993560791, + "num_tokens": 398541038.0, + "step": 10441 + }, + { + "epoch": 1.3283297290421066, + "ewc_loss": 0.040067967027425766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003398367378395e-05, + "grad_norm": 27.305335998535156, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8491255640983582, + "num_tokens": 398578047.0, + "step": 10442 + }, + { + "epoch": 1.328456939320697, + "ewc_loss": 0.03999006748199463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9995033653685823e-05, + "grad_norm": 27.3516845703125, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8528650403022766, + "num_tokens": 398618946.0, + "step": 10443 + }, + { + "epoch": 1.3285841495992876, + "ewc_loss": 0.0400799922645092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003999543376267e-05, + "grad_norm": 27.266170501708984, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8708319664001465, + "num_tokens": 398656895.0, + "step": 10444 + }, + { + "epoch": 1.3287113598778781, + "ewc_loss": 0.04002481326460838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.001240682147909e-05, + "grad_norm": 27.433231353759766, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8642464280128479, + "num_tokens": 398696521.0, + "step": 10445 + }, + { + "epoch": 1.3288385701564687, + "ewc_loss": 0.04008901119232178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0044506527483463e-05, + "grad_norm": 27.308656692504883, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8664085865020752, + "num_tokens": 398735275.0, + "step": 10446 + }, + { + "epoch": 1.3289657804350592, + "ewc_loss": 0.040024954825639725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0012477762065828e-05, + "grad_norm": 27.39968490600586, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.8500837087631226, + "num_tokens": 398774753.0, + "step": 10447 + }, + { + "epoch": 1.3290929907136497, + "ewc_loss": 0.0399896502494812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9994824469904415e-05, + "grad_norm": 27.233890533447266, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.858834445476532, + "num_tokens": 398810893.0, + "step": 10448 + }, + { + "epoch": 1.3292202009922403, + "ewc_loss": 0.040080584585666656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0040291929035448e-05, + "grad_norm": 27.44449234008789, + "learning_rate": 1e-06, + "loss": 0.4462, + 
"mean_token_accuracy": 0.8594347238540649, + "num_tokens": 398849741.0, + "step": 10449 + }, + { + "epoch": 1.3293474112708306, + "ewc_loss": 0.040023185312747955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.00115919142263e-05, + "grad_norm": 27.277671813964844, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8590301871299744, + "num_tokens": 398881186.0, + "step": 10450 + }, + { + "epoch": 1.329474621549421, + "ewc_loss": 0.04005548357963562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0027742721140385e-05, + "grad_norm": 27.31964874267578, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8829231262207031, + "num_tokens": 398918677.0, + "step": 10451 + }, + { + "epoch": 1.3296018318280116, + "ewc_loss": 0.04007323831319809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003661938942969e-05, + "grad_norm": 27.386648178100586, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8614809513092041, + "num_tokens": 398953435.0, + "step": 10452 + }, + { + "epoch": 1.3297290421066021, + "ewc_loss": 0.04008916765451431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0044582925038412e-05, + "grad_norm": 27.3007869720459, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8716936111450195, + "num_tokens": 398991799.0, + "step": 10453 + }, + { + "epoch": 1.3298562523851927, + "ewc_loss": 0.04005111753940582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0025558114866726e-05, + "grad_norm": 27.423694610595703, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8701235055923462, + "num_tokens": 399023211.0, + "step": 10454 + }, + { + "epoch": 1.3299834626637832, + "ewc_loss": 0.04012496396899223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0062481780769303e-05, + "grad_norm": 27.348020553588867, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8605350852012634, + "num_tokens": 399061431.0, + "step": 10455 + }, + { + "epoch": 
1.3301106729423737, + "ewc_loss": 0.040057260543107986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0028630387969315e-05, + "grad_norm": 27.479673385620117, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8663551211357117, + "num_tokens": 399097384.0, + "step": 10456 + }, + { + "epoch": 1.3302378832209643, + "ewc_loss": 0.04004085063934326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.002042492676992e-05, + "grad_norm": 27.238447189331055, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8671289086341858, + "num_tokens": 399133935.0, + "step": 10457 + }, + { + "epoch": 1.3303650934995548, + "ewc_loss": 0.04003642126917839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0018211216665804e-05, + "grad_norm": 27.522808074951172, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8708237409591675, + "num_tokens": 399165709.0, + "step": 10458 + }, + { + "epoch": 1.3304923037781453, + "ewc_loss": 0.04005276411771774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0026382117066532e-05, + "grad_norm": 27.194780349731445, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8629291653633118, + "num_tokens": 399203548.0, + "step": 10459 + }, + { + "epoch": 1.3306195140567358, + "ewc_loss": 0.03997362032532692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9986810002592392e-05, + "grad_norm": 27.373579025268555, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8630689382553101, + "num_tokens": 399244341.0, + "step": 10460 + }, + { + "epoch": 1.3307467243353264, + "ewc_loss": 0.0401265025138855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0063251213287003e-05, + "grad_norm": 27.401325225830078, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8692348003387451, + "num_tokens": 399282284.0, + "step": 10461 + }, + { + "epoch": 1.330873934613917, + "ewc_loss": 0.040032658725976944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0016328562633134e-05, + "grad_norm": 27.32434844970703, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8784208297729492, + "num_tokens": 399320523.0, + "step": 10462 + }, + { + "epoch": 1.3310011448925074, + "ewc_loss": 0.04003560170531273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0017800125060603e-05, + "grad_norm": 27.380813598632812, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8651355504989624, + "num_tokens": 399357489.0, + "step": 10463 + }, + { + "epoch": 1.331128355171098, + "ewc_loss": 0.04001474007964134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.000737003982067e-05, + "grad_norm": 27.315366744995117, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8522489070892334, + "num_tokens": 399393561.0, + "step": 10464 + }, + { + "epoch": 1.3312555654496883, + "ewc_loss": 0.04008840024471283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0044200937263668e-05, + "grad_norm": 27.533676147460938, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8687825798988342, + "num_tokens": 399424514.0, + "step": 10465 + }, + { + "epoch": 1.3313827757282788, + "ewc_loss": 0.04007670655846596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0038352886331268e-05, + "grad_norm": 27.391023635864258, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8707062005996704, + "num_tokens": 399461372.0, + "step": 10466 + }, + { + "epoch": 1.3315099860068693, + "ewc_loss": 0.039965804666280746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9982902813353576e-05, + "grad_norm": 27.444705963134766, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.879135012626648, + "num_tokens": 399502503.0, + "step": 10467 + }, + { + "epoch": 1.3316371962854598, + "ewc_loss": 0.040060173720121384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0030087398481555e-05, + "grad_norm": 27.44169807434082, + "learning_rate": 1e-06, + "loss": 0.4488, + 
"mean_token_accuracy": 0.8590966463088989, + "num_tokens": 399537836.0, + "step": 10468 + }, + { + "epoch": 1.3317644065640504, + "ewc_loss": 0.039941150695085526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9970575522165745e-05, + "grad_norm": 27.29754066467285, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8598451018333435, + "num_tokens": 399575189.0, + "step": 10469 + }, + { + "epoch": 1.331891616842641, + "ewc_loss": 0.03999832645058632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9999162759631872e-05, + "grad_norm": 27.383750915527344, + "learning_rate": 1e-06, + "loss": 0.5018, + "mean_token_accuracy": 0.8465896844863892, + "num_tokens": 399614406.0, + "step": 10470 + }, + { + "epoch": 1.3320188271212314, + "ewc_loss": 0.04006006941199303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0030034647788852e-05, + "grad_norm": 27.330432891845703, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8705717325210571, + "num_tokens": 399652847.0, + "step": 10471 + }, + { + "epoch": 1.332146037399822, + "ewc_loss": 0.04006526619195938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003263398364652e-05, + "grad_norm": 27.464094161987305, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8544018864631653, + "num_tokens": 399691628.0, + "step": 10472 + }, + { + "epoch": 1.3322732476784125, + "ewc_loss": 0.04005341976881027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.002670953515917e-05, + "grad_norm": 27.456512451171875, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.875694215297699, + "num_tokens": 399732875.0, + "step": 10473 + }, + { + "epoch": 1.3324004579570028, + "ewc_loss": 0.040093034505844116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.004651651077438e-05, + "grad_norm": 27.37975311279297, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8752118945121765, + "num_tokens": 399769537.0, + "step": 10474 + }, + { + "epoch": 
1.3325276682355933, + "ewc_loss": 0.039911411702632904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9955705283791758e-05, + "grad_norm": 27.379255294799805, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.849840521812439, + "num_tokens": 399807757.0, + "step": 10475 + }, + { + "epoch": 1.3326548785141838, + "ewc_loss": 0.039985138922929764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.999256892304402e-05, + "grad_norm": 27.34032440185547, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8705966472625732, + "num_tokens": 399837351.0, + "step": 10476 + }, + { + "epoch": 1.3327820887927744, + "ewc_loss": 0.03992219269275665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9961096768383868e-05, + "grad_norm": 27.363351821899414, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8831589221954346, + "num_tokens": 399873678.0, + "step": 10477 + }, + { + "epoch": 1.332909299071365, + "ewc_loss": 0.03999297320842743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9996487026219256e-05, + "grad_norm": 27.249820709228516, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8673061728477478, + "num_tokens": 399916206.0, + "step": 10478 + }, + { + "epoch": 1.3330365093499554, + "ewc_loss": 0.04004442319273949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0022211174364202e-05, + "grad_norm": 27.42752456665039, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.877043604850769, + "num_tokens": 399954144.0, + "step": 10479 + }, + { + "epoch": 1.333163719628546, + "ewc_loss": 0.04007616266608238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003808185691014e-05, + "grad_norm": 27.315332412719727, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8728898167610168, + "num_tokens": 399995865.0, + "step": 10480 + }, + { + "epoch": 1.3332909299071365, + "ewc_loss": 0.04003496468067169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0017481801914982e-05, + "grad_norm": 27.431978225708008, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8668709993362427, + "num_tokens": 400033234.0, + "step": 10481 + }, + { + "epoch": 1.333418140185727, + "ewc_loss": 0.04010550305247307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0052752006449737e-05, + "grad_norm": 27.425607681274414, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8740332126617432, + "num_tokens": 400066842.0, + "step": 10482 + }, + { + "epoch": 1.3335453504643175, + "ewc_loss": 0.04001772403717041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.000886161113158e-05, + "grad_norm": 27.39280891418457, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8763715028762817, + "num_tokens": 400099380.0, + "step": 10483 + }, + { + "epoch": 1.333672560742908, + "ewc_loss": 0.04002530872821808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0012654204037972e-05, + "grad_norm": 27.235721588134766, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8772249221801758, + "num_tokens": 400137419.0, + "step": 10484 + }, + { + "epoch": 1.3337997710214986, + "ewc_loss": 0.040012557059526443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0006278646178544e-05, + "grad_norm": 27.342117309570312, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8663464784622192, + "num_tokens": 400171086.0, + "step": 10485 + }, + { + "epoch": 1.3339269813000891, + "ewc_loss": 0.040063634514808655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0031817257404327e-05, + "grad_norm": 27.33160972595215, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8630483150482178, + "num_tokens": 400206763.0, + "step": 10486 + }, + { + "epoch": 1.3340541915786797, + "ewc_loss": 0.04006928578019142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0034642147948034e-05, + "grad_norm": 27.36193084716797, + "learning_rate": 1e-06, + "loss": 0.4868, + 
"mean_token_accuracy": 0.8505808115005493, + "num_tokens": 400242708.0, + "step": 10487 + }, + { + "epoch": 1.3341814018572702, + "ewc_loss": 0.04009797424077988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0048986698384397e-05, + "grad_norm": 27.430912017822266, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8456465601921082, + "num_tokens": 400277417.0, + "step": 10488 + }, + { + "epoch": 1.3343086121358605, + "ewc_loss": 0.04003099724650383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0015499103465118e-05, + "grad_norm": 27.316404342651367, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8641695976257324, + "num_tokens": 400315811.0, + "step": 10489 + }, + { + "epoch": 1.334435822414451, + "ewc_loss": 0.040079012513160706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0039506125613116e-05, + "grad_norm": 27.40801239013672, + "learning_rate": 1e-06, + "loss": 0.4877, + "mean_token_accuracy": 0.8460659980773926, + "num_tokens": 400356321.0, + "step": 10490 + }, + { + "epoch": 1.3345630326930416, + "ewc_loss": 0.04007526487112045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0037632566527463e-05, + "grad_norm": 27.341487884521484, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8660234212875366, + "num_tokens": 400392382.0, + "step": 10491 + }, + { + "epoch": 1.334690242971632, + "ewc_loss": 0.04008376970887184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0041885363752954e-05, + "grad_norm": 27.39556312561035, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8722023963928223, + "num_tokens": 400434947.0, + "step": 10492 + }, + { + "epoch": 1.3348174532502226, + "ewc_loss": 0.04009202495217323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.00460126507096e-05, + "grad_norm": 27.361825942993164, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8466454148292542, + "num_tokens": 400472742.0, + "step": 10493 + }, + { + "epoch": 
1.3349446635288131, + "ewc_loss": 0.04014025628566742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.007012881222181e-05, + "grad_norm": 27.447620391845703, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8790890574455261, + "num_tokens": 400510029.0, + "step": 10494 + }, + { + "epoch": 1.3350718738074037, + "ewc_loss": 0.040106818079948425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0053408661624417e-05, + "grad_norm": 27.338665008544922, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8619557023048401, + "num_tokens": 400547383.0, + "step": 10495 + }, + { + "epoch": 1.3351990840859942, + "ewc_loss": 0.040081825107336044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0040912204422057e-05, + "grad_norm": 27.384370803833008, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8769596219062805, + "num_tokens": 400585242.0, + "step": 10496 + }, + { + "epoch": 1.3353262943645847, + "ewc_loss": 0.04018691927194595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.009345917031169e-05, + "grad_norm": 27.507762908935547, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8763443827629089, + "num_tokens": 400623117.0, + "step": 10497 + }, + { + "epoch": 1.3354535046431752, + "ewc_loss": 0.04010399803519249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.005199894483667e-05, + "grad_norm": 27.512819290161133, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8548687696456909, + "num_tokens": 400660170.0, + "step": 10498 + }, + { + "epoch": 1.3355807149217656, + "ewc_loss": 0.04007887467741966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003943700401578e-05, + "grad_norm": 27.440662384033203, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.87108314037323, + "num_tokens": 400700467.0, + "step": 10499 + }, + { + "epoch": 1.335707925200356, + "ewc_loss": 0.039981722831726074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.999086089199409e-05, + "grad_norm": 27.51375389099121, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8597062826156616, + "num_tokens": 400740269.0, + "step": 10500 + }, + { + "epoch": 1.3358351354789466, + "ewc_loss": 0.04001113027334213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0005565602332354e-05, + "grad_norm": 27.46228790283203, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.875515878200531, + "num_tokens": 400776465.0, + "step": 10501 + }, + { + "epoch": 1.3359623457575371, + "ewc_loss": 0.039937347173690796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9968672859249637e-05, + "grad_norm": 27.44622802734375, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8617072105407715, + "num_tokens": 400811210.0, + "step": 10502 + }, + { + "epoch": 1.3360895560361277, + "ewc_loss": 0.04000496491789818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0002482415293343e-05, + "grad_norm": 27.356210708618164, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8576890230178833, + "num_tokens": 400848820.0, + "step": 10503 + }, + { + "epoch": 1.3362167663147182, + "ewc_loss": 0.03993351012468338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.99667556444183e-05, + "grad_norm": 27.44727897644043, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.867173433303833, + "num_tokens": 400888774.0, + "step": 10504 + }, + { + "epoch": 1.3363439765933087, + "ewc_loss": 0.04000505432486534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0002527890028432e-05, + "grad_norm": 27.436725616455078, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8673128485679626, + "num_tokens": 400935067.0, + "step": 10505 + }, + { + "epoch": 1.3364711868718993, + "ewc_loss": 0.03993157297372818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.996578612306621e-05, + "grad_norm": 27.438039779663086, + "learning_rate": 1e-06, + "loss": 0.4016, + 
"mean_token_accuracy": 0.8753225803375244, + "num_tokens": 400974122.0, + "step": 10506 + }, + { + "epoch": 1.3365983971504898, + "ewc_loss": 0.039964593946933746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9982297089882195e-05, + "grad_norm": 27.5150203704834, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8746641874313354, + "num_tokens": 401008911.0, + "step": 10507 + }, + { + "epoch": 1.3367256074290803, + "ewc_loss": 0.039936210960149765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.996810533455573e-05, + "grad_norm": 27.38129234313965, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8715068697929382, + "num_tokens": 401039453.0, + "step": 10508 + }, + { + "epoch": 1.3368528177076708, + "ewc_loss": 0.039966680109500885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9983339370810427e-05, + "grad_norm": 27.473834991455078, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8749145269393921, + "num_tokens": 401075854.0, + "step": 10509 + }, + { + "epoch": 1.3369800279862614, + "ewc_loss": 0.039970122277736664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9985061953775585e-05, + "grad_norm": 27.402315139770508, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8646522760391235, + "num_tokens": 401114922.0, + "step": 10510 + }, + { + "epoch": 1.337107238264852, + "ewc_loss": 0.039961040019989014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.998051993723493e-05, + "grad_norm": 27.410478591918945, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8736603856086731, + "num_tokens": 401155833.0, + "step": 10511 + }, + { + "epoch": 1.3372344485434424, + "ewc_loss": 0.03993062674999237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9965313185821287e-05, + "grad_norm": 27.425498962402344, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8718129396438599, + "num_tokens": 401196971.0, + "step": 10512 + }, + { + "epoch": 
1.337361658822033, + "ewc_loss": 0.03994113579392433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.997056824620813e-05, + "grad_norm": 27.45029067993164, + "learning_rate": 1e-06, + "loss": 0.5342, + "mean_token_accuracy": 0.834446370601654, + "num_tokens": 401237312.0, + "step": 10513 + }, + { + "epoch": 1.3374888691006233, + "ewc_loss": 0.039964836090803146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9982418962172233e-05, + "grad_norm": 27.388336181640625, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8626632690429688, + "num_tokens": 401274630.0, + "step": 10514 + }, + { + "epoch": 1.3376160793792138, + "ewc_loss": 0.03992294520139694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9961473299190402e-05, + "grad_norm": 27.392026901245117, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8706179857254028, + "num_tokens": 401311780.0, + "step": 10515 + }, + { + "epoch": 1.3377432896578043, + "ewc_loss": 0.04006533697247505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0032668544445187e-05, + "grad_norm": 27.42600440979004, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8728760480880737, + "num_tokens": 401352614.0, + "step": 10516 + }, + { + "epoch": 1.3378704999363948, + "ewc_loss": 0.03996290639042854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.998145307879895e-05, + "grad_norm": 27.47783660888672, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8687228560447693, + "num_tokens": 401394495.0, + "step": 10517 + }, + { + "epoch": 1.3379977102149854, + "ewc_loss": 0.039959732443094254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9979866920039058e-05, + "grad_norm": 27.402536392211914, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8525258302688599, + "num_tokens": 401427758.0, + "step": 10518 + }, + { + "epoch": 1.338124920493576, + "ewc_loss": 0.039936840534210205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
1.9968420019722544e-05, + "grad_norm": 27.443208694458008, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8506819009780884, + "num_tokens": 401473868.0, + "step": 10519 + }, + { + "epoch": 1.3382521307721664, + "ewc_loss": 0.03997248038649559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9986240658909082e-05, + "grad_norm": 27.429256439208984, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8478553295135498, + "num_tokens": 401510459.0, + "step": 10520 + }, + { + "epoch": 1.338379341050757, + "ewc_loss": 0.03998083248734474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.9990417058579624e-05, + "grad_norm": 27.38129425048828, + "learning_rate": 1e-06, + "loss": 0.532, + "mean_token_accuracy": 0.8365978002548218, + "num_tokens": 401545283.0, + "step": 10521 + }, + { + "epoch": 1.3385065513293475, + "ewc_loss": 0.03998693451285362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 1.999346750380937e-05, + "grad_norm": 27.441734313964844, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8665324449539185, + "num_tokens": 401577166.0, + "step": 10522 + }, + { + "epoch": 1.3386337616079378, + "ewc_loss": 0.040013477206230164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.000673885049764e-05, + "grad_norm": 27.278528213500977, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8782956600189209, + "num_tokens": 401619075.0, + "step": 10523 + }, + { + "epoch": 1.3387609718865283, + "ewc_loss": 0.04002077504992485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0010387743241154e-05, + "grad_norm": 27.312665939331055, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8594905138015747, + "num_tokens": 401661286.0, + "step": 10524 + }, + { + "epoch": 1.3388881821651188, + "ewc_loss": 0.04009496048092842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.004748057515826e-05, + "grad_norm": 27.33175277709961, + "learning_rate": 1e-06, + "loss": 0.4599, + 
"mean_token_accuracy": 0.8556268811225891, + "num_tokens": 401699672.0, + "step": 10525 + }, + { + "epoch": 1.3390153924437094, + "ewc_loss": 0.04006398841738701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003199369937647e-05, + "grad_norm": 27.38368797302246, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.858310878276825, + "num_tokens": 401732676.0, + "step": 10526 + }, + { + "epoch": 1.3391426027223, + "ewc_loss": 0.04010616987943649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0053084881510586e-05, + "grad_norm": 27.283781051635742, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8639860153198242, + "num_tokens": 401771593.0, + "step": 10527 + }, + { + "epoch": 1.3392698130008904, + "ewc_loss": 0.040101729333400726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0050863895448856e-05, + "grad_norm": 27.450788497924805, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8516308069229126, + "num_tokens": 401809605.0, + "step": 10528 + }, + { + "epoch": 1.339397023279481, + "ewc_loss": 0.04013899341225624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0069495803909376e-05, + "grad_norm": 27.257431030273438, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8558966517448425, + "num_tokens": 401845349.0, + "step": 10529 + }, + { + "epoch": 1.3395242335580715, + "ewc_loss": 0.04008788242936134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.004394082177896e-05, + "grad_norm": 27.333843231201172, + "learning_rate": 1e-06, + "loss": 0.3532, + "mean_token_accuracy": 0.888039231300354, + "num_tokens": 401883186.0, + "step": 10530 + }, + { + "epoch": 1.339651443836662, + "ewc_loss": 0.0402175672352314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0108784156036563e-05, + "grad_norm": 27.375097274780273, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8506656885147095, + "num_tokens": 401921149.0, + "step": 10531 + }, + { + "epoch": 
1.3397786541152525, + "ewc_loss": 0.04015049710869789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0075249267392792e-05, + "grad_norm": 27.31783103942871, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.859378457069397, + "num_tokens": 401961813.0, + "step": 10532 + }, + { + "epoch": 1.339905864393843, + "ewc_loss": 0.0402003638446331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0100182155147195e-05, + "grad_norm": 27.380563735961914, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8483878374099731, + "num_tokens": 402001212.0, + "step": 10533 + }, + { + "epoch": 1.3400330746724336, + "ewc_loss": 0.040246445685625076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.012322329392191e-05, + "grad_norm": 27.357044219970703, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8586753010749817, + "num_tokens": 402044648.0, + "step": 10534 + }, + { + "epoch": 1.3401602849510241, + "ewc_loss": 0.04023918882012367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0119594410061836e-05, + "grad_norm": 27.407411575317383, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8680969476699829, + "num_tokens": 402087391.0, + "step": 10535 + }, + { + "epoch": 1.3402874952296147, + "ewc_loss": 0.040136318653821945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.006815884669777e-05, + "grad_norm": 27.34821891784668, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8847119808197021, + "num_tokens": 402119598.0, + "step": 10536 + }, + { + "epoch": 1.3404147055082052, + "ewc_loss": 0.04020409658551216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0102048438275233e-05, + "grad_norm": 27.397403717041016, + "learning_rate": 1e-06, + "loss": 0.5204, + "mean_token_accuracy": 0.8399174213409424, + "num_tokens": 402153628.0, + "step": 10537 + }, + { + "epoch": 1.3405419157867955, + "ewc_loss": 0.040252916514873505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0126457457081415e-05, + "grad_norm": 27.453388214111328, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8819374442100525, + "num_tokens": 402190876.0, + "step": 10538 + }, + { + "epoch": 1.340669126065386, + "ewc_loss": 0.040128789842128754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0064395357621834e-05, + "grad_norm": 27.270933151245117, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8749698400497437, + "num_tokens": 402227168.0, + "step": 10539 + }, + { + "epoch": 1.3407963363439765, + "ewc_loss": 0.0401880219578743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0094010324100964e-05, + "grad_norm": 27.506397247314453, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8737474679946899, + "num_tokens": 402262731.0, + "step": 10540 + }, + { + "epoch": 1.340923546622567, + "ewc_loss": 0.04020065441727638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.010032767429948e-05, + "grad_norm": 27.269811630249023, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8660871982574463, + "num_tokens": 402299992.0, + "step": 10541 + }, + { + "epoch": 1.3410507569011576, + "ewc_loss": 0.04012247547507286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0061237592017278e-05, + "grad_norm": 27.470457077026367, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8564342260360718, + "num_tokens": 402339129.0, + "step": 10542 + }, + { + "epoch": 1.3411779671797481, + "ewc_loss": 0.040250904858112335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0125451555941254e-05, + "grad_norm": 27.416627883911133, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8483680486679077, + "num_tokens": 402375380.0, + "step": 10543 + }, + { + "epoch": 1.3413051774583387, + "ewc_loss": 0.040180109441280365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.009005402214825e-05, + "grad_norm": 27.44670867919922, + "learning_rate": 1e-06, + "loss": 0.4728, + 
"mean_token_accuracy": 0.8538734316825867, + "num_tokens": 402412159.0, + "step": 10544 + }, + { + "epoch": 1.3414323877369292, + "ewc_loss": 0.04018864408135414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.009432137128897e-05, + "grad_norm": 27.376914978027344, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8616089820861816, + "num_tokens": 402446676.0, + "step": 10545 + }, + { + "epoch": 1.3415595980155197, + "ewc_loss": 0.04014210030436516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0071051039849408e-05, + "grad_norm": 27.313844680786133, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8660935163497925, + "num_tokens": 402490741.0, + "step": 10546 + }, + { + "epoch": 1.3416868082941102, + "ewc_loss": 0.04019036144018173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0095179934287444e-05, + "grad_norm": 27.430267333984375, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8742824792861938, + "num_tokens": 402527398.0, + "step": 10547 + }, + { + "epoch": 1.3418140185727006, + "ewc_loss": 0.040222037583589554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.011101969401352e-05, + "grad_norm": 27.34759521484375, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8636411428451538, + "num_tokens": 402565914.0, + "step": 10548 + }, + { + "epoch": 1.341941228851291, + "ewc_loss": 0.040225885808467865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.011294236581307e-05, + "grad_norm": 27.342205047607422, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8688085079193115, + "num_tokens": 402600671.0, + "step": 10549 + }, + { + "epoch": 1.3420684391298816, + "ewc_loss": 0.04028854891657829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0144274458289146e-05, + "grad_norm": 27.440420150756836, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8771736025810242, + "num_tokens": 402637381.0, + "step": 10550 + }, + { + "epoch": 
1.3421956494084721, + "ewc_loss": 0.04022567719221115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0112838683417067e-05, + "grad_norm": 27.360517501831055, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8693090677261353, + "num_tokens": 402671190.0, + "step": 10551 + }, + { + "epoch": 1.3423228596870627, + "ewc_loss": 0.040275201201438904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0137600586167537e-05, + "grad_norm": 27.43988609313965, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8656775951385498, + "num_tokens": 402721518.0, + "step": 10552 + }, + { + "epoch": 1.3424500699656532, + "ewc_loss": 0.04024634510278702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.012317236221861e-05, + "grad_norm": 27.434022903442383, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8656181693077087, + "num_tokens": 402758949.0, + "step": 10553 + }, + { + "epoch": 1.3425772802442437, + "ewc_loss": 0.04023533686995506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.011766810028348e-05, + "grad_norm": 27.588544845581055, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8595137596130371, + "num_tokens": 402794342.0, + "step": 10554 + }, + { + "epoch": 1.3427044905228342, + "ewc_loss": 0.04020148515701294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0100742403883487e-05, + "grad_norm": 27.26112937927246, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8773241639137268, + "num_tokens": 402831031.0, + "step": 10555 + }, + { + "epoch": 1.3428317008014248, + "ewc_loss": 0.040164295583963394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.008214869420044e-05, + "grad_norm": 27.420608520507812, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8756332397460938, + "num_tokens": 402871486.0, + "step": 10556 + }, + { + "epoch": 1.3429589110800153, + "ewc_loss": 0.04025539010763168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0127694369875826e-05, + "grad_norm": 27.40040397644043, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.878358006477356, + "num_tokens": 402916024.0, + "step": 10557 + }, + { + "epoch": 1.3430861213586058, + "ewc_loss": 0.040145523846149445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0072762708878145e-05, + "grad_norm": 27.260990142822266, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8483409285545349, + "num_tokens": 402952116.0, + "step": 10558 + }, + { + "epoch": 1.3432133316371964, + "ewc_loss": 0.04020223021507263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0101115296711214e-05, + "grad_norm": 27.36318588256836, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8638467788696289, + "num_tokens": 402998064.0, + "step": 10559 + }, + { + "epoch": 1.343340541915787, + "ewc_loss": 0.04020514711737633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0102574126212858e-05, + "grad_norm": 27.412668228149414, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8700743913650513, + "num_tokens": 403041763.0, + "step": 10560 + }, + { + "epoch": 1.3434677521943774, + "ewc_loss": 0.040129564702510834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.006478280236479e-05, + "grad_norm": 27.318540573120117, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8773525953292847, + "num_tokens": 403080956.0, + "step": 10561 + }, + { + "epoch": 1.343594962472968, + "ewc_loss": 0.04020796716213226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0103983843000606e-05, + "grad_norm": 27.43968391418457, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8742203116416931, + "num_tokens": 403117200.0, + "step": 10562 + }, + { + "epoch": 1.3437221727515583, + "ewc_loss": 0.04021841660141945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.010920798056759e-05, + "grad_norm": 27.423254013061523, + "learning_rate": 1e-06, + "loss": 0.4266, + 
"mean_token_accuracy": 0.8650710582733154, + "num_tokens": 403154112.0, + "step": 10563 + }, + { + "epoch": 1.3438493830301488, + "ewc_loss": 0.04009518027305603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0047589714522474e-05, + "grad_norm": 27.321533203125, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8624552488327026, + "num_tokens": 403188265.0, + "step": 10564 + }, + { + "epoch": 1.3439765933087393, + "ewc_loss": 0.040146734565496445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0073366613360122e-05, + "grad_norm": 27.33428382873535, + "learning_rate": 1e-06, + "loss": 0.4979, + "mean_token_accuracy": 0.8435967564582825, + "num_tokens": 403231236.0, + "step": 10565 + }, + { + "epoch": 1.3441038035873298, + "ewc_loss": 0.04023919999599457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0119599867030047e-05, + "grad_norm": 27.33405303955078, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8669444918632507, + "num_tokens": 403268642.0, + "step": 10566 + }, + { + "epoch": 1.3442310138659204, + "ewc_loss": 0.0401635468006134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.008177398238331e-05, + "grad_norm": 27.388580322265625, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8734664916992188, + "num_tokens": 403307679.0, + "step": 10567 + }, + { + "epoch": 1.344358224144511, + "ewc_loss": 0.04019755870103836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.009877971431706e-05, + "grad_norm": 27.313758850097656, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8714345097541809, + "num_tokens": 403346395.0, + "step": 10568 + }, + { + "epoch": 1.3444854344231014, + "ewc_loss": 0.04020032286643982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0100162146263756e-05, + "grad_norm": 27.45083999633789, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8736355304718018, + "num_tokens": 403383001.0, + "step": 10569 + }, + { + "epoch": 
1.344612644701692, + "ewc_loss": 0.04022149369120598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.011074684560299e-05, + "grad_norm": 27.343204498291016, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8566685914993286, + "num_tokens": 403426090.0, + "step": 10570 + }, + { + "epoch": 1.3447398549802825, + "ewc_loss": 0.04017723351716995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.008861702051945e-05, + "grad_norm": 27.4462947845459, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.858954668045044, + "num_tokens": 403470231.0, + "step": 10571 + }, + { + "epoch": 1.3448670652588728, + "ewc_loss": 0.04013555124402046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0067775039933622e-05, + "grad_norm": 27.30716896057129, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8705931901931763, + "num_tokens": 403508677.0, + "step": 10572 + }, + { + "epoch": 1.3449942755374633, + "ewc_loss": 0.04016384482383728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.00819231395144e-05, + "grad_norm": 27.542076110839844, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8463910818099976, + "num_tokens": 403545175.0, + "step": 10573 + }, + { + "epoch": 1.3451214858160538, + "ewc_loss": 0.04017241299152374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0086206859559752e-05, + "grad_norm": 27.409826278686523, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8793283700942993, + "num_tokens": 403583321.0, + "step": 10574 + }, + { + "epoch": 1.3452486960946444, + "ewc_loss": 0.040012799203395844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.000640051846858e-05, + "grad_norm": 27.352230072021484, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8589248657226562, + "num_tokens": 403618547.0, + "step": 10575 + }, + { + "epoch": 1.345375906373235, + "ewc_loss": 0.04013305529952049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.006652721320279e-05, + "grad_norm": 27.274478912353516, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.869523286819458, + "num_tokens": 403657552.0, + "step": 10576 + }, + { + "epoch": 1.3455031166518254, + "ewc_loss": 0.04017643630504608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0088218661840074e-05, + "grad_norm": 27.425045013427734, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.867923378944397, + "num_tokens": 403697021.0, + "step": 10577 + }, + { + "epoch": 1.345630326930416, + "ewc_loss": 0.040170829743146896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.008541559916921e-05, + "grad_norm": 27.46601104736328, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8558762669563293, + "num_tokens": 403736480.0, + "step": 10578 + }, + { + "epoch": 1.3457575372090065, + "ewc_loss": 0.04015255719423294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.00762788153952e-05, + "grad_norm": 27.43886947631836, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8725863695144653, + "num_tokens": 403772056.0, + "step": 10579 + }, + { + "epoch": 1.345884747487597, + "ewc_loss": 0.040141887962818146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.00709437194746e-05, + "grad_norm": 27.314239501953125, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.858572244644165, + "num_tokens": 403813646.0, + "step": 10580 + }, + { + "epoch": 1.3460119577661875, + "ewc_loss": 0.040062449872493744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.003122426685877e-05, + "grad_norm": 27.394058227539062, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8692530989646912, + "num_tokens": 403853904.0, + "step": 10581 + }, + { + "epoch": 1.346139168044778, + "ewc_loss": 0.040164340287446976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0082170522073284e-05, + "grad_norm": 27.326866149902344, + "learning_rate": 1e-06, + "loss": 0.4805, + 
"mean_token_accuracy": 0.8503589630126953, + "num_tokens": 403898528.0, + "step": 10582 + }, + { + "epoch": 1.3462663783233686, + "ewc_loss": 0.04010743647813797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.005371788982302e-05, + "grad_norm": 27.422985076904297, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8504319787025452, + "num_tokens": 403929518.0, + "step": 10583 + }, + { + "epoch": 1.3463935886019591, + "ewc_loss": 0.04017386585474014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0086932636331767e-05, + "grad_norm": 27.428205490112305, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8671640753746033, + "num_tokens": 403970323.0, + "step": 10584 + }, + { + "epoch": 1.3465207988805497, + "ewc_loss": 0.04018280282616615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.009140189329628e-05, + "grad_norm": 27.438358306884766, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8684453368186951, + "num_tokens": 404012499.0, + "step": 10585 + }, + { + "epoch": 1.3466480091591402, + "ewc_loss": 0.040148526430130005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.007426337513607e-05, + "grad_norm": 27.41720199584961, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8604198098182678, + "num_tokens": 404055370.0, + "step": 10586 + }, + { + "epoch": 1.3467752194377305, + "ewc_loss": 0.04010181128978729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0050905732205138e-05, + "grad_norm": 27.410385131835938, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8653538227081299, + "num_tokens": 404092498.0, + "step": 10587 + }, + { + "epoch": 1.346902429716321, + "ewc_loss": 0.0401359461247921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.006797330977861e-05, + "grad_norm": 27.3389892578125, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8724281787872314, + "num_tokens": 404132466.0, + "step": 10588 + }, + { + "epoch": 
1.3470296399949115, + "ewc_loss": 0.04014463350176811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0072317056474276e-05, + "grad_norm": 27.40521240234375, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8649499416351318, + "num_tokens": 404171913.0, + "step": 10589 + }, + { + "epoch": 1.347156850273502, + "ewc_loss": 0.04020232334733009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0101160771446303e-05, + "grad_norm": 27.411935806274414, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8790093064308167, + "num_tokens": 404205817.0, + "step": 10590 + }, + { + "epoch": 1.3472840605520926, + "ewc_loss": 0.04018212854862213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.009106356126722e-05, + "grad_norm": 27.46652603149414, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8689456582069397, + "num_tokens": 404243922.0, + "step": 10591 + }, + { + "epoch": 1.3474112708306831, + "ewc_loss": 0.040233321487903595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0116660380153917e-05, + "grad_norm": 27.4915771484375, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8686647415161133, + "num_tokens": 404283392.0, + "step": 10592 + }, + { + "epoch": 1.3475384811092737, + "ewc_loss": 0.04004239663481712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0021197997266427e-05, + "grad_norm": 27.35244369506836, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.8404866456985474, + "num_tokens": 404323044.0, + "step": 10593 + }, + { + "epoch": 1.3476656913878642, + "ewc_loss": 0.040189728140830994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0094863430131227e-05, + "grad_norm": 27.44579315185547, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8713124990463257, + "num_tokens": 404359761.0, + "step": 10594 + }, + { + "epoch": 1.3477929016664547, + "ewc_loss": 0.04017714038491249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0088569726794958e-05, + "grad_norm": 27.437355041503906, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8529585599899292, + "num_tokens": 404395354.0, + "step": 10595 + }, + { + "epoch": 1.3479201119450452, + "ewc_loss": 0.040181346237659454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0090672478545457e-05, + "grad_norm": 27.516036987304688, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8483393788337708, + "num_tokens": 404432052.0, + "step": 10596 + }, + { + "epoch": 1.3480473222236355, + "ewc_loss": 0.04014283046126366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0071414837730117e-05, + "grad_norm": 27.355443954467773, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.8579205274581909, + "num_tokens": 404472689.0, + "step": 10597 + }, + { + "epoch": 1.348174532502226, + "ewc_loss": 0.04009848088026047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.004923953791149e-05, + "grad_norm": 27.38066291809082, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8509352207183838, + "num_tokens": 404515916.0, + "step": 10598 + }, + { + "epoch": 1.3483017427808166, + "ewc_loss": 0.04017523676156998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0087618395336904e-05, + "grad_norm": 27.40538215637207, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8642245531082153, + "num_tokens": 404557399.0, + "step": 10599 + }, + { + "epoch": 1.3484289530594071, + "ewc_loss": 0.040234554558992386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.011727701756172e-05, + "grad_norm": 27.479236602783203, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8617128729820251, + "num_tokens": 404592573.0, + "step": 10600 + }, + { + "epoch": 1.3485561633379977, + "ewc_loss": 0.04011136293411255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0055680579389445e-05, + "grad_norm": 27.374666213989258, + "learning_rate": 1e-06, + "loss": 0.4421, + 
"mean_token_accuracy": 0.8622448444366455, + "num_tokens": 404635266.0, + "step": 10601 + }, + { + "epoch": 1.3486833736165882, + "ewc_loss": 0.040206074714660645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0103037968510762e-05, + "grad_norm": 27.541200637817383, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8740590810775757, + "num_tokens": 404673004.0, + "step": 10602 + }, + { + "epoch": 1.3488105838951787, + "ewc_loss": 0.04014800116419792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.007400144066196e-05, + "grad_norm": 27.333410263061523, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.849561333656311, + "num_tokens": 404705153.0, + "step": 10603 + }, + { + "epoch": 1.3489377941737692, + "ewc_loss": 0.04019158333539963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0095791114727035e-05, + "grad_norm": 27.454423904418945, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8561791181564331, + "num_tokens": 404737068.0, + "step": 10604 + }, + { + "epoch": 1.3490650044523598, + "ewc_loss": 0.0402660071849823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0133003999944776e-05, + "grad_norm": 27.5760440826416, + "learning_rate": 1e-06, + "loss": 0.5111, + "mean_token_accuracy": 0.843525230884552, + "num_tokens": 404773680.0, + "step": 10605 + }, + { + "epoch": 1.3491922147309503, + "ewc_loss": 0.040199633687734604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0099816538277082e-05, + "grad_norm": 27.38430404663086, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8709017038345337, + "num_tokens": 404809937.0, + "step": 10606 + }, + { + "epoch": 1.3493194250095408, + "ewc_loss": 0.040138181298971176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0069090169272386e-05, + "grad_norm": 27.4849853515625, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.855755090713501, + "num_tokens": 404847146.0, + "step": 10607 + }, + { + "epoch": 
1.3494466352881314, + "ewc_loss": 0.040310680866241455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.015534118982032e-05, + "grad_norm": 27.5325927734375, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8660728931427002, + "num_tokens": 404882952.0, + "step": 10608 + }, + { + "epoch": 1.3495738455667219, + "ewc_loss": 0.040124040096998215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0062019757460803e-05, + "grad_norm": 27.371692657470703, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8733302354812622, + "num_tokens": 404925347.0, + "step": 10609 + }, + { + "epoch": 1.3497010558453124, + "ewc_loss": 0.04026532545685768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.013266202993691e-05, + "grad_norm": 27.52431869506836, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8501938581466675, + "num_tokens": 404963411.0, + "step": 10610 + }, + { + "epoch": 1.349828266123903, + "ewc_loss": 0.04023407772183418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0117038729949854e-05, + "grad_norm": 27.477493286132812, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8627738952636719, + "num_tokens": 404997522.0, + "step": 10611 + }, + { + "epoch": 1.3499554764024932, + "ewc_loss": 0.040203552693128586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.01017755898647e-05, + "grad_norm": 27.540943145751953, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8638253211975098, + "num_tokens": 405034049.0, + "step": 10612 + }, + { + "epoch": 1.3500826866810838, + "ewc_loss": 0.040236279368400574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0118139218539e-05, + "grad_norm": 27.40096664428711, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8621114492416382, + "num_tokens": 405074985.0, + "step": 10613 + }, + { + "epoch": 1.3502098969596743, + "ewc_loss": 0.040209170430898666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.010458592849318e-05, + "grad_norm": 27.525083541870117, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8797844648361206, + "num_tokens": 405116133.0, + "step": 10614 + }, + { + "epoch": 1.3503371072382648, + "ewc_loss": 0.04030238091945648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0151190256001428e-05, + "grad_norm": 27.515399932861328, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8636981248855591, + "num_tokens": 405155572.0, + "step": 10615 + }, + { + "epoch": 1.3504643175168554, + "ewc_loss": 0.040218815207481384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.010940806940198e-05, + "grad_norm": 27.547285079956055, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8670625686645508, + "num_tokens": 405194700.0, + "step": 10616 + }, + { + "epoch": 1.350591527795446, + "ewc_loss": 0.04026629775762558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.013314951909706e-05, + "grad_norm": 27.56199073791504, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8609621524810791, + "num_tokens": 405233582.0, + "step": 10617 + }, + { + "epoch": 1.3507187380740364, + "ewc_loss": 0.04011928290128708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.005964233831037e-05, + "grad_norm": 27.58431625366211, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8767424821853638, + "num_tokens": 405274987.0, + "step": 10618 + }, + { + "epoch": 1.350845948352627, + "ewc_loss": 0.040145982056856155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.007299190154299e-05, + "grad_norm": 27.604223251342773, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8628145456314087, + "num_tokens": 405308151.0, + "step": 10619 + }, + { + "epoch": 1.3509731586312175, + "ewc_loss": 0.04016084223985672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0080420654267073e-05, + "grad_norm": 27.546234130859375, + "learning_rate": 1e-06, + "loss": 0.4361, + 
"mean_token_accuracy": 0.8645161986351013, + "num_tokens": 405343939.0, + "step": 10620 + }, + { + "epoch": 1.3511003689098078, + "ewc_loss": 0.04010889306664467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.005444730457384e-05, + "grad_norm": 27.499834060668945, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8635264039039612, + "num_tokens": 405384211.0, + "step": 10621 + }, + { + "epoch": 1.3512275791883983, + "ewc_loss": 0.04018605127930641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0093026250833645e-05, + "grad_norm": 27.564098358154297, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.87247234582901, + "num_tokens": 405419971.0, + "step": 10622 + }, + { + "epoch": 1.3513547894669888, + "ewc_loss": 0.040125809609889984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.006290560530033e-05, + "grad_norm": 27.38237953186035, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8576821088790894, + "num_tokens": 405451322.0, + "step": 10623 + }, + { + "epoch": 1.3514819997455794, + "ewc_loss": 0.04007333144545555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0036664864164777e-05, + "grad_norm": 27.396068572998047, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8569613695144653, + "num_tokens": 405496101.0, + "step": 10624 + }, + { + "epoch": 1.35160921002417, + "ewc_loss": 0.0401458777487278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.007293915085029e-05, + "grad_norm": 27.530744552612305, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8496880531311035, + "num_tokens": 405530034.0, + "step": 10625 + }, + { + "epoch": 1.3517364203027604, + "ewc_loss": 0.040156688541173935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0078345187357627e-05, + "grad_norm": 27.333972930908203, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8771887421607971, + "num_tokens": 405570893.0, + "step": 10626 + }, + { + "epoch": 
1.351863630581351, + "ewc_loss": 0.040181707590818405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0090854377485812e-05, + "grad_norm": 27.544052124023438, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8516401648521423, + "num_tokens": 405610120.0, + "step": 10627 + }, + { + "epoch": 1.3519908408599415, + "ewc_loss": 0.04028420150279999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.014210076595191e-05, + "grad_norm": 27.480127334594727, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8637121915817261, + "num_tokens": 405651878.0, + "step": 10628 + }, + { + "epoch": 1.352118051138532, + "ewc_loss": 0.04012749344110489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0063745978404768e-05, + "grad_norm": 27.365859985351562, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8701566457748413, + "num_tokens": 405694578.0, + "step": 10629 + }, + { + "epoch": 1.3522452614171225, + "ewc_loss": 0.04021359235048294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0106796000618488e-05, + "grad_norm": 27.358213424682617, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8710481524467468, + "num_tokens": 405734128.0, + "step": 10630 + }, + { + "epoch": 1.352372471695713, + "ewc_loss": 0.04033995047211647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0169974959571846e-05, + "grad_norm": 27.467823028564453, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8707336783409119, + "num_tokens": 405776588.0, + "step": 10631 + }, + { + "epoch": 1.3524996819743036, + "ewc_loss": 0.040302824229002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.015141217270866e-05, + "grad_norm": 27.543699264526367, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8622885942459106, + "num_tokens": 405810867.0, + "step": 10632 + }, + { + "epoch": 1.3526268922528941, + "ewc_loss": 0.04027047008275986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0135235899942927e-05, + "grad_norm": 27.394580841064453, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8824941515922546, + "num_tokens": 405848884.0, + "step": 10633 + }, + { + "epoch": 1.3527541025314846, + "ewc_loss": 0.04025882109999657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.012940967688337e-05, + "grad_norm": 27.50704574584961, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8849130272865295, + "num_tokens": 405888759.0, + "step": 10634 + }, + { + "epoch": 1.3528813128100752, + "ewc_loss": 0.04020611569285393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.01030579773942e-05, + "grad_norm": 27.337223052978516, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.878625750541687, + "num_tokens": 405925686.0, + "step": 10635 + }, + { + "epoch": 1.3530085230886655, + "ewc_loss": 0.04022334888577461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0111674530198798e-05, + "grad_norm": 27.302066802978516, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.877519428730011, + "num_tokens": 405960836.0, + "step": 10636 + }, + { + "epoch": 1.353135733367256, + "ewc_loss": 0.04028824716806412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0144123482168652e-05, + "grad_norm": 27.473846435546875, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8753684759140015, + "num_tokens": 406000314.0, + "step": 10637 + }, + { + "epoch": 1.3532629436458465, + "ewc_loss": 0.040312327444553375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0156163373030722e-05, + "grad_norm": 27.423032760620117, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8751999139785767, + "num_tokens": 406039468.0, + "step": 10638 + }, + { + "epoch": 1.353390153924437, + "ewc_loss": 0.04019821435213089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.00991071324097e-05, + "grad_norm": 27.392074584960938, + "learning_rate": 1e-06, + "loss": 0.3634, + 
"mean_token_accuracy": 0.8887760639190674, + "num_tokens": 406076828.0, + "step": 10639 + }, + { + "epoch": 1.3535173642030276, + "ewc_loss": 0.04022901877760887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0114508515689522e-05, + "grad_norm": 27.417362213134766, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.864006757736206, + "num_tokens": 406116504.0, + "step": 10640 + }, + { + "epoch": 1.3536445744816181, + "ewc_loss": 0.04027828946709633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0139144908171147e-05, + "grad_norm": 27.46047019958496, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8653154969215393, + "num_tokens": 406155184.0, + "step": 10641 + }, + { + "epoch": 1.3537717847602087, + "ewc_loss": 0.04017779603600502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0088897144887596e-05, + "grad_norm": 27.363056182861328, + "learning_rate": 1e-06, + "loss": 0.5159, + "mean_token_accuracy": 0.8463811874389648, + "num_tokens": 406195411.0, + "step": 10642 + }, + { + "epoch": 1.3538989950387992, + "ewc_loss": 0.0402778722345829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.013893572438974e-05, + "grad_norm": 27.488906860351562, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8559414148330688, + "num_tokens": 406230159.0, + "step": 10643 + }, + { + "epoch": 1.3540262053173897, + "ewc_loss": 0.040252935141325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0126468371017836e-05, + "grad_norm": 27.405473709106445, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8539760112762451, + "num_tokens": 406264541.0, + "step": 10644 + }, + { + "epoch": 1.3541534155959802, + "ewc_loss": 0.04030478745698929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0152394426986575e-05, + "grad_norm": 27.498605728149414, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8712807297706604, + "num_tokens": 406301869.0, + "step": 10645 + }, + { + "epoch": 
1.3542806258745705, + "ewc_loss": 0.04024507477879524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0122537534916773e-05, + "grad_norm": 27.3223819732666, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8673199415206909, + "num_tokens": 406338819.0, + "step": 10646 + }, + { + "epoch": 1.354407836153161, + "ewc_loss": 0.04022982716560364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0114914150326513e-05, + "grad_norm": 27.445262908935547, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8666529655456543, + "num_tokens": 406375914.0, + "step": 10647 + }, + { + "epoch": 1.3545350464317516, + "ewc_loss": 0.04033568874001503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0167844922980294e-05, + "grad_norm": 27.453609466552734, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8711181282997131, + "num_tokens": 406418543.0, + "step": 10648 + }, + { + "epoch": 1.3546622567103421, + "ewc_loss": 0.04029987007379532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.014993515331298e-05, + "grad_norm": 27.51825714111328, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.862647533416748, + "num_tokens": 406455731.0, + "step": 10649 + }, + { + "epoch": 1.3547894669889327, + "ewc_loss": 0.04026082903146744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0130413759034127e-05, + "grad_norm": 27.44866180419922, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8646688461303711, + "num_tokens": 406496266.0, + "step": 10650 + }, + { + "epoch": 1.3549166772675232, + "ewc_loss": 0.04028370603919029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0141853383393027e-05, + "grad_norm": 27.43805694580078, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8606313467025757, + "num_tokens": 406541676.0, + "step": 10651 + }, + { + "epoch": 1.3550438875461137, + "ewc_loss": 0.04027394950389862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0136974853812717e-05, + "grad_norm": 27.338924407958984, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8609666228294373, + "num_tokens": 406578997.0, + "step": 10652 + }, + { + "epoch": 1.3551710978247042, + "ewc_loss": 0.04027341306209564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0136707462370396e-05, + "grad_norm": 27.459278106689453, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8702142238616943, + "num_tokens": 406619404.0, + "step": 10653 + }, + { + "epoch": 1.3552983081032948, + "ewc_loss": 0.040465403348207474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023270099016372e-05, + "grad_norm": 27.4581298828125, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8825287222862244, + "num_tokens": 406656092.0, + "step": 10654 + }, + { + "epoch": 1.3554255183818853, + "ewc_loss": 0.040259163826704025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0129582480876707e-05, + "grad_norm": 27.452293395996094, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8490656614303589, + "num_tokens": 406701600.0, + "step": 10655 + }, + { + "epoch": 1.3555527286604758, + "ewc_loss": 0.04033268243074417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.016634061874356e-05, + "grad_norm": 27.524982452392578, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8644641637802124, + "num_tokens": 406739649.0, + "step": 10656 + }, + { + "epoch": 1.3556799389390664, + "ewc_loss": 0.04027997702360153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0139988919254392e-05, + "grad_norm": 27.452482223510742, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8839614987373352, + "num_tokens": 406774962.0, + "step": 10657 + }, + { + "epoch": 1.3558071492176569, + "ewc_loss": 0.04030710458755493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0153553123236634e-05, + "grad_norm": 27.516618728637695, + "learning_rate": 1e-06, + "loss": 0.4444, + 
"mean_token_accuracy": 0.8608181476593018, + "num_tokens": 406818047.0, + "step": 10658 + }, + { + "epoch": 1.3559343594962474, + "ewc_loss": 0.0402139276266098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0106963347643614e-05, + "grad_norm": 27.4802303314209, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8581404685974121, + "num_tokens": 406859464.0, + "step": 10659 + }, + { + "epoch": 1.356061569774838, + "ewc_loss": 0.04025156423449516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.01257826120127e-05, + "grad_norm": 27.58995819091797, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8637287616729736, + "num_tokens": 406896444.0, + "step": 10660 + }, + { + "epoch": 1.3561887800534282, + "ewc_loss": 0.04020106419920921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0100531401112676e-05, + "grad_norm": 27.589139938354492, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8593998551368713, + "num_tokens": 406934623.0, + "step": 10661 + }, + { + "epoch": 1.3563159903320188, + "ewc_loss": 0.04018152132630348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0090759790036827e-05, + "grad_norm": 27.38908576965332, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8576102256774902, + "num_tokens": 406974290.0, + "step": 10662 + }, + { + "epoch": 1.3564432006106093, + "ewc_loss": 0.040225476026535034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0112738638999872e-05, + "grad_norm": 27.612627029418945, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8842310905456543, + "num_tokens": 407011279.0, + "step": 10663 + }, + { + "epoch": 1.3565704108891998, + "ewc_loss": 0.04019314423203468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0096571461181156e-05, + "grad_norm": 27.423118591308594, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8552315831184387, + "num_tokens": 407054733.0, + "step": 10664 + }, + { + "epoch": 
1.3566976211677904, + "ewc_loss": 0.04011549428105354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0057746951351874e-05, + "grad_norm": 27.443796157836914, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.866369903087616, + "num_tokens": 407098410.0, + "step": 10665 + }, + { + "epoch": 1.3568248314463809, + "ewc_loss": 0.04018503054976463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.009251511481125e-05, + "grad_norm": 27.446508407592773, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8599305152893066, + "num_tokens": 407134231.0, + "step": 10666 + }, + { + "epoch": 1.3569520417249714, + "ewc_loss": 0.04016263410449028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.008131741604302e-05, + "grad_norm": 27.4232120513916, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8512717485427856, + "num_tokens": 407167231.0, + "step": 10667 + }, + { + "epoch": 1.357079252003562, + "ewc_loss": 0.04025145247578621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.012572622334119e-05, + "grad_norm": 27.51150131225586, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.866742730140686, + "num_tokens": 407206604.0, + "step": 10668 + }, + { + "epoch": 1.3572064622821525, + "ewc_loss": 0.04021367058157921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0106836018385366e-05, + "grad_norm": 27.447940826416016, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8672093152999878, + "num_tokens": 407248247.0, + "step": 10669 + }, + { + "epoch": 1.3573336725607428, + "ewc_loss": 0.04014555737376213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0072779079782777e-05, + "grad_norm": 27.466896057128906, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8751204013824463, + "num_tokens": 407282870.0, + "step": 10670 + }, + { + "epoch": 1.3574608828393333, + "ewc_loss": 0.04027244821190834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0136223611189052e-05, + "grad_norm": 27.64335823059082, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8716572523117065, + "num_tokens": 407318704.0, + "step": 10671 + }, + { + "epoch": 1.3575880931179238, + "ewc_loss": 0.040143147110939026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0071573089808226e-05, + "grad_norm": 27.337621688842773, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.867772102355957, + "num_tokens": 407360003.0, + "step": 10672 + }, + { + "epoch": 1.3577153033965144, + "ewc_loss": 0.04020671918988228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.010335992963519e-05, + "grad_norm": 27.645023345947266, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8592917323112488, + "num_tokens": 407399473.0, + "step": 10673 + }, + { + "epoch": 1.357842513675105, + "ewc_loss": 0.04024567827582359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0122839487157762e-05, + "grad_norm": 27.43943977355957, + "learning_rate": 1e-06, + "loss": 0.5295, + "mean_token_accuracy": 0.8354084491729736, + "num_tokens": 407440576.0, + "step": 10674 + }, + { + "epoch": 1.3579697239536954, + "ewc_loss": 0.04024314880371094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0121575289522298e-05, + "grad_norm": 27.5784854888916, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8613314032554626, + "num_tokens": 407481057.0, + "step": 10675 + }, + { + "epoch": 1.358096934232286, + "ewc_loss": 0.040326476097106934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0163237422821112e-05, + "grad_norm": 27.54584503173828, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8763377666473389, + "num_tokens": 407517104.0, + "step": 10676 + }, + { + "epoch": 1.3582241445108765, + "ewc_loss": 0.04020122438669205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.010061143664643e-05, + "grad_norm": 27.476062774658203, + "learning_rate": 1e-06, + "loss": 0.4075, + 
"mean_token_accuracy": 0.8727112412452698, + "num_tokens": 407552604.0, + "step": 10677 + }, + { + "epoch": 1.358351354789467, + "ewc_loss": 0.04031706973910332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0158535335212946e-05, + "grad_norm": 27.48604965209961, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8786869645118713, + "num_tokens": 407588282.0, + "step": 10678 + }, + { + "epoch": 1.3584785650680575, + "ewc_loss": 0.040251683443784714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0125840819673613e-05, + "grad_norm": 27.30689811706543, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8597719669342041, + "num_tokens": 407619837.0, + "step": 10679 + }, + { + "epoch": 1.358605775346648, + "ewc_loss": 0.04027751088142395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0138755644438788e-05, + "grad_norm": 27.46933937072754, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.862310528755188, + "num_tokens": 407656939.0, + "step": 10680 + }, + { + "epoch": 1.3587329856252386, + "ewc_loss": 0.04035681113600731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.017840597545728e-05, + "grad_norm": 27.493642807006836, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8549995422363281, + "num_tokens": 407702756.0, + "step": 10681 + }, + { + "epoch": 1.3588601959038291, + "ewc_loss": 0.04029115289449692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0145576854702085e-05, + "grad_norm": 27.41758918762207, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8732168674468994, + "num_tokens": 407736142.0, + "step": 10682 + }, + { + "epoch": 1.3589874061824196, + "ewc_loss": 0.04031088203191757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0155441234237514e-05, + "grad_norm": 27.49070167541504, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8613324165344238, + "num_tokens": 407773184.0, + "step": 10683 + }, + { + "epoch": 
1.3591146164610102, + "ewc_loss": 0.040290843695402145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0145422240602784e-05, + "grad_norm": 27.34733772277832, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8499406576156616, + "num_tokens": 407806543.0, + "step": 10684 + }, + { + "epoch": 1.3592418267396005, + "ewc_loss": 0.04038306325674057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0191531802993268e-05, + "grad_norm": 27.749229431152344, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8639432787895203, + "num_tokens": 407841371.0, + "step": 10685 + }, + { + "epoch": 1.359369037018191, + "ewc_loss": 0.04032197594642639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0160987332928926e-05, + "grad_norm": 27.461870193481445, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8688153028488159, + "num_tokens": 407879862.0, + "step": 10686 + }, + { + "epoch": 1.3594962472967815, + "ewc_loss": 0.04028073698282242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0140369088039733e-05, + "grad_norm": 27.608463287353516, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.859420895576477, + "num_tokens": 407917947.0, + "step": 10687 + }, + { + "epoch": 1.359623457575372, + "ewc_loss": 0.040329426527023315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0164712623227388e-05, + "grad_norm": 27.49427032470703, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8719936013221741, + "num_tokens": 407953987.0, + "step": 10688 + }, + { + "epoch": 1.3597506678539626, + "ewc_loss": 0.04032060503959656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0160303392913193e-05, + "grad_norm": 27.524105072021484, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.8495835661888123, + "num_tokens": 407993166.0, + "step": 10689 + }, + { + "epoch": 1.3598778781325531, + "ewc_loss": 0.04029403626918793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0147017494309694e-05, + "grad_norm": 27.316200256347656, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8735241889953613, + "num_tokens": 408037063.0, + "step": 10690 + }, + { + "epoch": 1.3600050884111436, + "ewc_loss": 0.040327850729227066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0163925000815652e-05, + "grad_norm": 27.690628051757812, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8678003549575806, + "num_tokens": 408078526.0, + "step": 10691 + }, + { + "epoch": 1.3601322986897342, + "ewc_loss": 0.04032956436276436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0164781744824722e-05, + "grad_norm": 27.464876174926758, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8726036548614502, + "num_tokens": 408116002.0, + "step": 10692 + }, + { + "epoch": 1.3602595089683247, + "ewc_loss": 0.040226202458143234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0113100617891178e-05, + "grad_norm": 27.594575881958008, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8618436455726624, + "num_tokens": 408155885.0, + "step": 10693 + }, + { + "epoch": 1.3603867192469152, + "ewc_loss": 0.04030601680278778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0153009245404974e-05, + "grad_norm": 27.537965774536133, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8706678748130798, + "num_tokens": 408190085.0, + "step": 10694 + }, + { + "epoch": 1.3605139295255055, + "ewc_loss": 0.040237825363874435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0118912289035507e-05, + "grad_norm": 27.53615951538086, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8665393590927124, + "num_tokens": 408227289.0, + "step": 10695 + }, + { + "epoch": 1.360641139804096, + "ewc_loss": 0.04025804623961449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0129022232140414e-05, + "grad_norm": 27.509998321533203, + "learning_rate": 1e-06, + "loss": 0.4112, + 
"mean_token_accuracy": 0.8732835054397583, + "num_tokens": 408271853.0, + "step": 10696 + }, + { + "epoch": 1.3607683500826866, + "ewc_loss": 0.04021017253398895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0105086150579154e-05, + "grad_norm": 27.592269897460938, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8780803680419922, + "num_tokens": 408311133.0, + "step": 10697 + }, + { + "epoch": 1.3608955603612771, + "ewc_loss": 0.04023345932364464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.011672950175125e-05, + "grad_norm": 27.434629440307617, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8662904500961304, + "num_tokens": 408342775.0, + "step": 10698 + }, + { + "epoch": 1.3610227706398677, + "ewc_loss": 0.040155328810214996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.00776648853207e-05, + "grad_norm": 27.505945205688477, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8779790997505188, + "num_tokens": 408379063.0, + "step": 10699 + }, + { + "epoch": 1.3611499809184582, + "ewc_loss": 0.040239933878183365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0119967302889563e-05, + "grad_norm": 27.46627426147461, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8530710935592651, + "num_tokens": 408415856.0, + "step": 10700 + }, + { + "epoch": 1.3612771911970487, + "ewc_loss": 0.040163956582546234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0081977709196508e-05, + "grad_norm": 27.43305015563965, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8702268004417419, + "num_tokens": 408451522.0, + "step": 10701 + }, + { + "epoch": 1.3614044014756392, + "ewc_loss": 0.04022396355867386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0111981939407997e-05, + "grad_norm": 27.4229736328125, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8715336918830872, + "num_tokens": 408494605.0, + "step": 10702 + }, + { + "epoch": 
1.3615316117542298, + "ewc_loss": 0.04024484381079674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0122421119594947e-05, + "grad_norm": 27.523651123046875, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8650611639022827, + "num_tokens": 408536373.0, + "step": 10703 + }, + { + "epoch": 1.3616588220328203, + "ewc_loss": 0.04025905206799507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0129526092205197e-05, + "grad_norm": 27.454286575317383, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8527517318725586, + "num_tokens": 408574724.0, + "step": 10704 + }, + { + "epoch": 1.3617860323114108, + "ewc_loss": 0.04017401859164238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0087009033886716e-05, + "grad_norm": 27.425710678100586, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8584592342376709, + "num_tokens": 408615057.0, + "step": 10705 + }, + { + "epoch": 1.3619132425900013, + "ewc_loss": 0.04018694907426834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0093473722226918e-05, + "grad_norm": 27.347444534301758, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8536267280578613, + "num_tokens": 408651324.0, + "step": 10706 + }, + { + "epoch": 1.3620404528685919, + "ewc_loss": 0.04027590528130531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0137953470111825e-05, + "grad_norm": 27.432750701904297, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8701071739196777, + "num_tokens": 408690533.0, + "step": 10707 + }, + { + "epoch": 1.3621676631471824, + "ewc_loss": 0.04031745716929436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0158728148089722e-05, + "grad_norm": 27.384265899658203, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8582074046134949, + "num_tokens": 408725970.0, + "step": 10708 + }, + { + "epoch": 1.362294873425773, + "ewc_loss": 0.040306456387043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.01532275241334e-05, + "grad_norm": 27.433042526245117, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8727781772613525, + "num_tokens": 408766550.0, + "step": 10709 + }, + { + "epoch": 1.3624220837043632, + "ewc_loss": 0.040312569588422775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.015628524532076e-05, + "grad_norm": 27.35905647277832, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8692229986190796, + "num_tokens": 408804320.0, + "step": 10710 + }, + { + "epoch": 1.3625492939829538, + "ewc_loss": 0.040283411741256714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.014170604525134e-05, + "grad_norm": 27.38665008544922, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8674415946006775, + "num_tokens": 408840902.0, + "step": 10711 + }, + { + "epoch": 1.3626765042615443, + "ewc_loss": 0.04036590829491615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0182953448966146e-05, + "grad_norm": 27.487274169921875, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8645491600036621, + "num_tokens": 408879346.0, + "step": 10712 + }, + { + "epoch": 1.3628037145401348, + "ewc_loss": 0.04029257968068123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0146289898548275e-05, + "grad_norm": 27.45547866821289, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.877581775188446, + "num_tokens": 408919336.0, + "step": 10713 + }, + { + "epoch": 1.3629309248187254, + "ewc_loss": 0.04032979905605316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0164899979135953e-05, + "grad_norm": 27.50613021850586, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8611547946929932, + "num_tokens": 408961547.0, + "step": 10714 + }, + { + "epoch": 1.3630581350973159, + "ewc_loss": 0.04027700796723366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.01385046239011e-05, + "grad_norm": 27.43100929260254, + "learning_rate": 1e-06, + "loss": 0.3832, + 
"mean_token_accuracy": 0.8803367614746094, + "num_tokens": 409003573.0, + "step": 10715 + }, + { + "epoch": 1.3631853453759064, + "ewc_loss": 0.04027993604540825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0139968910370953e-05, + "grad_norm": 27.323671340942383, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8582181334495544, + "num_tokens": 409045560.0, + "step": 10716 + }, + { + "epoch": 1.363312555654497, + "ewc_loss": 0.040329769253730774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0164885427220725e-05, + "grad_norm": 27.55182647705078, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8689866065979004, + "num_tokens": 409087245.0, + "step": 10717 + }, + { + "epoch": 1.3634397659330875, + "ewc_loss": 0.04034280404448509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.017140286625363e-05, + "grad_norm": 27.556133270263672, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8748471736907959, + "num_tokens": 409134177.0, + "step": 10718 + }, + { + "epoch": 1.3635669762116778, + "ewc_loss": 0.04030578210949898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0152891011093743e-05, + "grad_norm": 27.463359832763672, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.851852297782898, + "num_tokens": 409169495.0, + "step": 10719 + }, + { + "epoch": 1.3636941864902683, + "ewc_loss": 0.04037633165717125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0188166672596708e-05, + "grad_norm": 27.65025520324707, + "learning_rate": 1e-06, + "loss": 0.5045, + "mean_token_accuracy": 0.8414332866668701, + "num_tokens": 409207781.0, + "step": 10720 + }, + { + "epoch": 1.3638213967688588, + "ewc_loss": 0.04030106961727142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.015053541981615e-05, + "grad_norm": 27.46375846862793, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8656781911849976, + "num_tokens": 409245411.0, + "step": 10721 + }, + { + "epoch": 
1.3639486070474494, + "ewc_loss": 0.04025152325630188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0125760784139857e-05, + "grad_norm": 27.493183135986328, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8733917474746704, + "num_tokens": 409281757.0, + "step": 10722 + }, + { + "epoch": 1.3640758173260399, + "ewc_loss": 0.04034271836280823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0171359210507944e-05, + "grad_norm": 27.530471801757812, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8543885946273804, + "num_tokens": 409321317.0, + "step": 10723 + }, + { + "epoch": 1.3642030276046304, + "ewc_loss": 0.04027852788567543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0139263142482378e-05, + "grad_norm": 27.569385528564453, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8725864887237549, + "num_tokens": 409358919.0, + "step": 10724 + }, + { + "epoch": 1.364330237883221, + "ewc_loss": 0.040298108011484146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0149054762441665e-05, + "grad_norm": 27.44661521911621, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8572803735733032, + "num_tokens": 409394355.0, + "step": 10725 + }, + { + "epoch": 1.3644574481618115, + "ewc_loss": 0.04024451598525047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0122257410548627e-05, + "grad_norm": 27.654537200927734, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.875350832939148, + "num_tokens": 409433781.0, + "step": 10726 + }, + { + "epoch": 1.364584658440402, + "ewc_loss": 0.040314093232154846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0157047401880845e-05, + "grad_norm": 27.532939910888672, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8562946915626526, + "num_tokens": 409471728.0, + "step": 10727 + }, + { + "epoch": 1.3647118687189925, + "ewc_loss": 0.040247224271297455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.012361255765427e-05, + "grad_norm": 27.508378982543945, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8830296397209167, + "num_tokens": 409502294.0, + "step": 10728 + }, + { + "epoch": 1.364839078997583, + "ewc_loss": 0.04027861729264259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0139308617217466e-05, + "grad_norm": 27.579120635986328, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8664986491203308, + "num_tokens": 409545878.0, + "step": 10729 + }, + { + "epoch": 1.3649662892761736, + "ewc_loss": 0.04031042754650116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.015521386056207e-05, + "grad_norm": 27.49066734313965, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8681878447532654, + "num_tokens": 409584753.0, + "step": 10730 + }, + { + "epoch": 1.3650934995547641, + "ewc_loss": 0.04032843932509422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0164219677099027e-05, + "grad_norm": 27.606748580932617, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8695939183235168, + "num_tokens": 409613725.0, + "step": 10731 + }, + { + "epoch": 1.3652207098333546, + "ewc_loss": 0.04029614105820656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0148070689174347e-05, + "grad_norm": 27.467588424682617, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8588840961456299, + "num_tokens": 409652443.0, + "step": 10732 + }, + { + "epoch": 1.3653479201119452, + "ewc_loss": 0.04027791693806648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0138957552262582e-05, + "grad_norm": 27.549625396728516, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8719927668571472, + "num_tokens": 409690683.0, + "step": 10733 + }, + { + "epoch": 1.3654751303905355, + "ewc_loss": 0.04040519893169403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0202600353513844e-05, + "grad_norm": 27.609121322631836, + "learning_rate": 1e-06, + "loss": 0.4233, + 
"mean_token_accuracy": 0.8661478757858276, + "num_tokens": 409727502.0, + "step": 10734 + }, + { + "epoch": 1.365602340669126, + "ewc_loss": 0.04030494764447212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.015247446252033e-05, + "grad_norm": 27.663166046142578, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8617825508117676, + "num_tokens": 409764592.0, + "step": 10735 + }, + { + "epoch": 1.3657295509477165, + "ewc_loss": 0.04034696891903877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0173483790131286e-05, + "grad_norm": 27.512521743774414, + "learning_rate": 1e-06, + "loss": 0.5072, + "mean_token_accuracy": 0.8444673418998718, + "num_tokens": 409809645.0, + "step": 10736 + }, + { + "epoch": 1.365856761226307, + "ewc_loss": 0.04024450480937958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0122251953580417e-05, + "grad_norm": 27.53785514831543, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8650481700897217, + "num_tokens": 409848048.0, + "step": 10737 + }, + { + "epoch": 1.3659839715048976, + "ewc_loss": 0.040364544838666916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.018227314692922e-05, + "grad_norm": 27.579723358154297, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8602148294448853, + "num_tokens": 409884176.0, + "step": 10738 + }, + { + "epoch": 1.3661111817834881, + "ewc_loss": 0.040212392807006836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0106195734115317e-05, + "grad_norm": 27.464130401611328, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8684396743774414, + "num_tokens": 409922630.0, + "step": 10739 + }, + { + "epoch": 1.3662383920620786, + "ewc_loss": 0.040330223739147186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0165112800896168e-05, + "grad_norm": 27.52513885498047, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8723217248916626, + "num_tokens": 409961876.0, + "step": 10740 + }, + { + "epoch": 
1.3663656023406692, + "ewc_loss": 0.040338121354579926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0169060007901862e-05, + "grad_norm": 27.54975700378418, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8755178451538086, + "num_tokens": 409995432.0, + "step": 10741 + }, + { + "epoch": 1.3664928126192597, + "ewc_loss": 0.040303342044353485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0151670469203964e-05, + "grad_norm": 27.678136825561523, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8642849326133728, + "num_tokens": 410034876.0, + "step": 10742 + }, + { + "epoch": 1.3666200228978502, + "ewc_loss": 0.040305741131305695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0152871002210304e-05, + "grad_norm": 27.4514217376709, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8685991764068604, + "num_tokens": 410079190.0, + "step": 10743 + }, + { + "epoch": 1.3667472331764405, + "ewc_loss": 0.040231890976428986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0115945517318323e-05, + "grad_norm": 27.400461196899414, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8690078854560852, + "num_tokens": 410112475.0, + "step": 10744 + }, + { + "epoch": 1.366874443455031, + "ewc_loss": 0.040357962250709534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.01789807761088e-05, + "grad_norm": 27.557668685913086, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8811179399490356, + "num_tokens": 410154583.0, + "step": 10745 + }, + { + "epoch": 1.3670016537336216, + "ewc_loss": 0.04029664024710655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0148319890722632e-05, + "grad_norm": 27.43221092224121, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8650615215301514, + "num_tokens": 410191631.0, + "step": 10746 + }, + { + "epoch": 1.3671288640122121, + "ewc_loss": 0.04029928520321846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.014964229601901e-05, + "grad_norm": 27.43520164489746, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.8420677185058594, + "num_tokens": 410232435.0, + "step": 10747 + }, + { + "epoch": 1.3672560742908026, + "ewc_loss": 0.040394242852926254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0197121557430364e-05, + "grad_norm": 27.480459213256836, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8694714307785034, + "num_tokens": 410272940.0, + "step": 10748 + }, + { + "epoch": 1.3673832845693932, + "ewc_loss": 0.04032416269183159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0162080545560457e-05, + "grad_norm": 27.528350830078125, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8609874248504639, + "num_tokens": 410312807.0, + "step": 10749 + }, + { + "epoch": 1.3675104948479837, + "ewc_loss": 0.04043028876185417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0215144104440697e-05, + "grad_norm": 27.49135971069336, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8720078468322754, + "num_tokens": 410352433.0, + "step": 10750 + }, + { + "epoch": 1.3676377051265742, + "ewc_loss": 0.04034750163555145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0173751181573607e-05, + "grad_norm": 27.41379737854004, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8653584122657776, + "num_tokens": 410394843.0, + "step": 10751 + }, + { + "epoch": 1.3677649154051648, + "ewc_loss": 0.04035436734557152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0177183614578098e-05, + "grad_norm": 27.464635848999023, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8673902153968811, + "num_tokens": 410439353.0, + "step": 10752 + }, + { + "epoch": 1.3678921256837553, + "ewc_loss": 0.04041628912091255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0208144633215852e-05, + "grad_norm": 27.476253509521484, + "learning_rate": 1e-06, + "loss": 0.5072, + 
"mean_token_accuracy": 0.8440483808517456, + "num_tokens": 410485789.0, + "step": 10753 + }, + { + "epoch": 1.3680193359623458, + "ewc_loss": 0.04040742665529251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0203713575028814e-05, + "grad_norm": 27.535505294799805, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8507493138313293, + "num_tokens": 410523385.0, + "step": 10754 + }, + { + "epoch": 1.3681465462409363, + "ewc_loss": 0.040396519005298615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0198260244796984e-05, + "grad_norm": 27.452442169189453, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8588491678237915, + "num_tokens": 410553340.0, + "step": 10755 + }, + { + "epoch": 1.3682737565195269, + "ewc_loss": 0.040363382548093796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0181691070320085e-05, + "grad_norm": 27.516836166381836, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8732378482818604, + "num_tokens": 410590041.0, + "step": 10756 + }, + { + "epoch": 1.3684009667981174, + "ewc_loss": 0.04039762541651726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0198813217575662e-05, + "grad_norm": 27.42914581298828, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8664737939834595, + "num_tokens": 410627703.0, + "step": 10757 + }, + { + "epoch": 1.368528177076708, + "ewc_loss": 0.040337517857551575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0168758055660874e-05, + "grad_norm": 27.530176162719727, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.857597827911377, + "num_tokens": 410662555.0, + "step": 10758 + }, + { + "epoch": 1.3686553873552982, + "ewc_loss": 0.04040386155247688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.020193096541334e-05, + "grad_norm": 27.454172134399414, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8478502035140991, + "num_tokens": 410706026.0, + "step": 10759 + }, + { + "epoch": 
1.3687825976338888, + "ewc_loss": 0.040403347462415695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.020167448790744e-05, + "grad_norm": 27.52281951904297, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.863402247428894, + "num_tokens": 410743275.0, + "step": 10760 + }, + { + "epoch": 1.3689098079124793, + "ewc_loss": 0.040431566536426544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.021578256972134e-05, + "grad_norm": 27.57373809814453, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8748412132263184, + "num_tokens": 410781220.0, + "step": 10761 + }, + { + "epoch": 1.3690370181910698, + "ewc_loss": 0.04033595696091652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0167977709206752e-05, + "grad_norm": 27.45313835144043, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8735635876655579, + "num_tokens": 410820014.0, + "step": 10762 + }, + { + "epoch": 1.3691642284696603, + "ewc_loss": 0.0402887687087059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.014438359765336e-05, + "grad_norm": 27.43392562866211, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8602721095085144, + "num_tokens": 410858721.0, + "step": 10763 + }, + { + "epoch": 1.3692914387482509, + "ewc_loss": 0.04043540731072426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0217703422531486e-05, + "grad_norm": 27.653789520263672, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.870701789855957, + "num_tokens": 410898527.0, + "step": 10764 + }, + { + "epoch": 1.3694186490268414, + "ewc_loss": 0.04042428731918335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.021214459091425e-05, + "grad_norm": 27.53908348083496, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8662634491920471, + "num_tokens": 410938426.0, + "step": 10765 + }, + { + "epoch": 1.369545859305432, + "ewc_loss": 0.040230754762887955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0115377992624417e-05, + "grad_norm": 27.429738998413086, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8710446357727051, + "num_tokens": 410969940.0, + "step": 10766 + }, + { + "epoch": 1.3696730695840225, + "ewc_loss": 0.040361467748880386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.018073428189382e-05, + "grad_norm": 27.562013626098633, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8510086536407471, + "num_tokens": 411008859.0, + "step": 10767 + }, + { + "epoch": 1.3698002798626128, + "ewc_loss": 0.04037090390920639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0185452740406618e-05, + "grad_norm": 27.48137092590332, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8793831467628479, + "num_tokens": 411043516.0, + "step": 10768 + }, + { + "epoch": 1.3699274901412033, + "ewc_loss": 0.04032975062727928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0164874513284303e-05, + "grad_norm": 27.485279083251953, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8587939739227295, + "num_tokens": 411079092.0, + "step": 10769 + }, + { + "epoch": 1.3700547004197938, + "ewc_loss": 0.04036476090550423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.018238046730403e-05, + "grad_norm": 27.426393508911133, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8693480491638184, + "num_tokens": 411117360.0, + "step": 10770 + }, + { + "epoch": 1.3701819106983844, + "ewc_loss": 0.040418077260255814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0209037757012993e-05, + "grad_norm": 27.518295288085938, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8559516668319702, + "num_tokens": 411153615.0, + "step": 10771 + }, + { + "epoch": 1.3703091209769749, + "ewc_loss": 0.040524259209632874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.026213041972369e-05, + "grad_norm": 27.50874900817871, + "learning_rate": 1e-06, + "loss": 0.4292, + 
"mean_token_accuracy": 0.8673039674758911, + "num_tokens": 411193571.0, + "step": 10772 + }, + { + "epoch": 1.3704363312555654, + "ewc_loss": 0.04041629657149315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.020814827119466e-05, + "grad_norm": 27.416250228881836, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8455709218978882, + "num_tokens": 411230427.0, + "step": 10773 + }, + { + "epoch": 1.370563541534156, + "ewc_loss": 0.04045628756284714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.022814442170784e-05, + "grad_norm": 27.689176559448242, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8634612560272217, + "num_tokens": 411271251.0, + "step": 10774 + }, + { + "epoch": 1.3706907518127465, + "ewc_loss": 0.04051658883690834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.025829417107161e-05, + "grad_norm": 27.436552047729492, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8683068156242371, + "num_tokens": 411304906.0, + "step": 10775 + }, + { + "epoch": 1.370817962091337, + "ewc_loss": 0.04042653739452362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.021326872636564e-05, + "grad_norm": 27.57025718688965, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.882439911365509, + "num_tokens": 411341417.0, + "step": 10776 + }, + { + "epoch": 1.3709451723699275, + "ewc_loss": 0.04048388823866844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0241943275323138e-05, + "grad_norm": 27.36225700378418, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8668898940086365, + "num_tokens": 411382118.0, + "step": 10777 + }, + { + "epoch": 1.371072382648518, + "ewc_loss": 0.040444131940603256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.022206535912119e-05, + "grad_norm": 27.626934051513672, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8544849157333374, + "num_tokens": 411426702.0, + "step": 10778 + }, + { + "epoch": 
1.3711995929271086, + "ewc_loss": 0.040513623505830765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.025681169470772e-05, + "grad_norm": 27.619121551513672, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8643003702163696, + "num_tokens": 411461689.0, + "step": 10779 + }, + { + "epoch": 1.371326803205699, + "ewc_loss": 0.040394965559244156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.019748353632167e-05, + "grad_norm": 27.491077423095703, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8670789003372192, + "num_tokens": 411499613.0, + "step": 10780 + }, + { + "epoch": 1.3714540134842896, + "ewc_loss": 0.04044779762625694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0223898900439963e-05, + "grad_norm": 27.508647918701172, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8640525341033936, + "num_tokens": 411538422.0, + "step": 10781 + }, + { + "epoch": 1.3715812237628802, + "ewc_loss": 0.04046536237001419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0232680981280282e-05, + "grad_norm": 27.62068748474121, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8770761489868164, + "num_tokens": 411572401.0, + "step": 10782 + }, + { + "epoch": 1.3717084340414705, + "ewc_loss": 0.04047997295856476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0239986042724922e-05, + "grad_norm": 27.633895874023438, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8729866743087769, + "num_tokens": 411611268.0, + "step": 10783 + }, + { + "epoch": 1.371835644320061, + "ewc_loss": 0.04041444510221481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0207222405588254e-05, + "grad_norm": 27.44945526123047, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8642706871032715, + "num_tokens": 411648837.0, + "step": 10784 + }, + { + "epoch": 1.3719628545986515, + "ewc_loss": 0.040379542857408524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0189771021250635e-05, + "grad_norm": 27.513208389282227, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.868462324142456, + "num_tokens": 411690327.0, + "step": 10785 + }, + { + "epoch": 1.372090064877242, + "ewc_loss": 0.04048129916191101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0240649973857217e-05, + "grad_norm": 27.653898239135742, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8644484877586365, + "num_tokens": 411729665.0, + "step": 10786 + }, + { + "epoch": 1.3722172751558326, + "ewc_loss": 0.04042663052678108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0213316020090133e-05, + "grad_norm": 27.404878616333008, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8792777061462402, + "num_tokens": 411764716.0, + "step": 10787 + }, + { + "epoch": 1.3723444854344231, + "ewc_loss": 0.04041985049843788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0209925423841923e-05, + "grad_norm": 27.720136642456055, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8671608567237854, + "num_tokens": 411805560.0, + "step": 10788 + }, + { + "epoch": 1.3724716957130136, + "ewc_loss": 0.0404774472117424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023872366407886e-05, + "grad_norm": 27.558635711669922, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8699614405632019, + "num_tokens": 411841151.0, + "step": 10789 + }, + { + "epoch": 1.3725989059916042, + "ewc_loss": 0.040303539484739304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.015177051362116e-05, + "grad_norm": 27.565099716186523, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8720756769180298, + "num_tokens": 411875088.0, + "step": 10790 + }, + { + "epoch": 1.3727261162701947, + "ewc_loss": 0.04035082086920738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.017541009990964e-05, + "grad_norm": 27.433029174804688, + "learning_rate": 1e-06, + "loss": 0.4292, + 
"mean_token_accuracy": 0.8661676049232483, + "num_tokens": 411914554.0, + "step": 10791 + }, + { + "epoch": 1.3728533265487852, + "ewc_loss": 0.040326058864593506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0163030058029108e-05, + "grad_norm": 27.524259567260742, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8659168481826782, + "num_tokens": 411952237.0, + "step": 10792 + }, + { + "epoch": 1.3729805368273755, + "ewc_loss": 0.04043726623058319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0218632926116697e-05, + "grad_norm": 27.474002838134766, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8605538606643677, + "num_tokens": 411989676.0, + "step": 10793 + }, + { + "epoch": 1.373107747105966, + "ewc_loss": 0.04042012616991997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0210063667036593e-05, + "grad_norm": 27.531587600708008, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8776154518127441, + "num_tokens": 412025804.0, + "step": 10794 + }, + { + "epoch": 1.3732349573845566, + "ewc_loss": 0.04046430066227913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0232149836374447e-05, + "grad_norm": 27.511316299438477, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8821276426315308, + "num_tokens": 412060085.0, + "step": 10795 + }, + { + "epoch": 1.3733621676631471, + "ewc_loss": 0.040388643741607666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0194322132738307e-05, + "grad_norm": 27.460430145263672, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8725427985191345, + "num_tokens": 412100750.0, + "step": 10796 + }, + { + "epoch": 1.3734893779417376, + "ewc_loss": 0.040374498814344406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.018724990193732e-05, + "grad_norm": 27.45404815673828, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8685818314552307, + "num_tokens": 412145829.0, + "step": 10797 + }, + { + "epoch": 
1.3736165882203282, + "ewc_loss": 0.040372297167778015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0186149413348176e-05, + "grad_norm": 27.369327545166016, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8717377185821533, + "num_tokens": 412187607.0, + "step": 10798 + }, + { + "epoch": 1.3737437984989187, + "ewc_loss": 0.0404852032661438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.024260174948722e-05, + "grad_norm": 27.574064254760742, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8534955978393555, + "num_tokens": 412227214.0, + "step": 10799 + }, + { + "epoch": 1.3738710087775092, + "ewc_loss": 0.040489643812179565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.024482273554895e-05, + "grad_norm": 27.37808609008789, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8691980838775635, + "num_tokens": 412261929.0, + "step": 10800 + }, + { + "epoch": 1.3739982190560998, + "ewc_loss": 0.040397558361291885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0198778656776994e-05, + "grad_norm": 27.58621597290039, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8614879846572876, + "num_tokens": 412299441.0, + "step": 10801 + }, + { + "epoch": 1.3741254293346903, + "ewc_loss": 0.04049655795097351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0248278815415688e-05, + "grad_norm": 27.491111755371094, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8646252751350403, + "num_tokens": 412339248.0, + "step": 10802 + }, + { + "epoch": 1.3742526396132808, + "ewc_loss": 0.04041566699743271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0207833586027846e-05, + "grad_norm": 27.647796630859375, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8647626638412476, + "num_tokens": 412377224.0, + "step": 10803 + }, + { + "epoch": 1.3743798498918713, + "ewc_loss": 0.0405130609869957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0256529751350172e-05, + "grad_norm": 27.552043914794922, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8438608050346375, + "num_tokens": 412415331.0, + "step": 10804 + }, + { + "epoch": 1.3745070601704619, + "ewc_loss": 0.040349092334508896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0174546079942957e-05, + "grad_norm": 27.51050567626953, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8657568693161011, + "num_tokens": 412455604.0, + "step": 10805 + }, + { + "epoch": 1.3746342704490524, + "ewc_loss": 0.04042179509997368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0210896764183417e-05, + "grad_norm": 27.59665870666504, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8728516101837158, + "num_tokens": 412489295.0, + "step": 10806 + }, + { + "epoch": 1.374761480727643, + "ewc_loss": 0.04047815501689911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0239078367012553e-05, + "grad_norm": 27.624794006347656, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.862273633480072, + "num_tokens": 412525750.0, + "step": 10807 + }, + { + "epoch": 1.3748886910062332, + "ewc_loss": 0.04035860672593117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.017930273723323e-05, + "grad_norm": 27.420690536499023, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.874809205532074, + "num_tokens": 412567163.0, + "step": 10808 + }, + { + "epoch": 1.3750159012848238, + "ewc_loss": 0.04046427458524704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0232137103448622e-05, + "grad_norm": 27.629241943359375, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8727229833602905, + "num_tokens": 412598789.0, + "step": 10809 + }, + { + "epoch": 1.3751431115634143, + "ewc_loss": 0.04049066826701164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0245333871571347e-05, + "grad_norm": 27.55809211730957, + "learning_rate": 1e-06, + "loss": 0.4164, + 
"mean_token_accuracy": 0.8696463108062744, + "num_tokens": 412632588.0, + "step": 10810 + }, + { + "epoch": 1.3752703218420048, + "ewc_loss": 0.04039079695940018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0195398974465206e-05, + "grad_norm": 27.525508880615234, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8645497560501099, + "num_tokens": 412676369.0, + "step": 10811 + }, + { + "epoch": 1.3753975321205953, + "ewc_loss": 0.04041854664683342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.020927240664605e-05, + "grad_norm": 27.608327865600586, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8803552389144897, + "num_tokens": 412712674.0, + "step": 10812 + }, + { + "epoch": 1.3755247423991859, + "ewc_loss": 0.040411077439785004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.020553802140057e-05, + "grad_norm": 27.474740982055664, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8602827787399292, + "num_tokens": 412753531.0, + "step": 10813 + }, + { + "epoch": 1.3756519526777764, + "ewc_loss": 0.040456146001815796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.02280734811211e-05, + "grad_norm": 27.567649841308594, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8643980026245117, + "num_tokens": 412799450.0, + "step": 10814 + }, + { + "epoch": 1.375779162956367, + "ewc_loss": 0.04045289382338524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.022644730459433e-05, + "grad_norm": 27.518917083740234, + "learning_rate": 1e-06, + "loss": 0.3627, + "mean_token_accuracy": 0.8883466720581055, + "num_tokens": 412836858.0, + "step": 10815 + }, + { + "epoch": 1.3759063732349575, + "ewc_loss": 0.040491897612810135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0245948689989746e-05, + "grad_norm": 27.620010375976562, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8690816760063171, + "num_tokens": 412871169.0, + "step": 10816 + }, + { + "epoch": 
1.3760335835135478, + "ewc_loss": 0.0404132679104805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0206633053021505e-05, + "grad_norm": 27.51186180114746, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8703649640083313, + "num_tokens": 412909281.0, + "step": 10817 + }, + { + "epoch": 1.3761607937921383, + "ewc_loss": 0.04039613530039787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0198067431920208e-05, + "grad_norm": 27.586328506469727, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8665758371353149, + "num_tokens": 412950126.0, + "step": 10818 + }, + { + "epoch": 1.3762880040707288, + "ewc_loss": 0.04039226099848747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0196130208205432e-05, + "grad_norm": 27.500160217285156, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8533406257629395, + "num_tokens": 412988644.0, + "step": 10819 + }, + { + "epoch": 1.3764152143493193, + "ewc_loss": 0.04042457044124603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0212284653098322e-05, + "grad_norm": 27.608491897583008, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8522617816925049, + "num_tokens": 413028403.0, + "step": 10820 + }, + { + "epoch": 1.3765424246279099, + "ewc_loss": 0.040433917194604874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0216959455865435e-05, + "grad_norm": 27.501314163208008, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8569929599761963, + "num_tokens": 413067429.0, + "step": 10821 + }, + { + "epoch": 1.3766696349065004, + "ewc_loss": 0.04038747772574425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.019373823713977e-05, + "grad_norm": 27.47017478942871, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8628036975860596, + "num_tokens": 413101630.0, + "step": 10822 + }, + { + "epoch": 1.376796845185091, + "ewc_loss": 0.04047597944736481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0237990611349232e-05, + "grad_norm": 27.57572364807129, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.859674334526062, + "num_tokens": 413147757.0, + "step": 10823 + }, + { + "epoch": 1.3769240554636815, + "ewc_loss": 0.04040389880537987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0201949155307375e-05, + "grad_norm": 27.584569931030273, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8693719506263733, + "num_tokens": 413186803.0, + "step": 10824 + }, + { + "epoch": 1.377051265742272, + "ewc_loss": 0.04045804217457771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0229021174600348e-05, + "grad_norm": 27.607830047607422, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.859312117099762, + "num_tokens": 413227201.0, + "step": 10825 + }, + { + "epoch": 1.3771784760208625, + "ewc_loss": 0.04032013192772865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0160065105301328e-05, + "grad_norm": 27.5083065032959, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8645243644714355, + "num_tokens": 413270594.0, + "step": 10826 + }, + { + "epoch": 1.377305686299453, + "ewc_loss": 0.04036910831928253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0184554159641266e-05, + "grad_norm": 27.496540069580078, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8485708236694336, + "num_tokens": 413307345.0, + "step": 10827 + }, + { + "epoch": 1.3774328965780436, + "ewc_loss": 0.04042299836874008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.021149884967599e-05, + "grad_norm": 27.602670669555664, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8701520562171936, + "num_tokens": 413344310.0, + "step": 10828 + }, + { + "epoch": 1.377560106856634, + "ewc_loss": 0.04039990156888962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0199950085952878e-05, + "grad_norm": 27.439178466796875, + "learning_rate": 1e-06, + "loss": 0.3779, + 
"mean_token_accuracy": 0.8840478658676147, + "num_tokens": 413382047.0, + "step": 10829 + }, + { + "epoch": 1.3776873171352246, + "ewc_loss": 0.040335241705179214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0167621187283657e-05, + "grad_norm": 27.52796173095703, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8631871938705444, + "num_tokens": 413424658.0, + "step": 10830 + }, + { + "epoch": 1.3778145274138152, + "ewc_loss": 0.04047054424881935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023527122219093e-05, + "grad_norm": 27.618135452270508, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8736416101455688, + "num_tokens": 413458386.0, + "step": 10831 + }, + { + "epoch": 1.3779417376924055, + "ewc_loss": 0.040410544723272324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0205272448947653e-05, + "grad_norm": 27.588857650756836, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8526867032051086, + "num_tokens": 413492530.0, + "step": 10832 + }, + { + "epoch": 1.378068947970996, + "ewc_loss": 0.04039870947599411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.019935527641792e-05, + "grad_norm": 27.5095272064209, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8766946792602539, + "num_tokens": 413528894.0, + "step": 10833 + }, + { + "epoch": 1.3781961582495865, + "ewc_loss": 0.040411271154880524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0205636246828362e-05, + "grad_norm": 27.559864044189453, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.862095832824707, + "num_tokens": 413564765.0, + "step": 10834 + }, + { + "epoch": 1.378323368528177, + "ewc_loss": 0.040457047522068024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.022852459049318e-05, + "grad_norm": 27.597169876098633, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.8389649987220764, + "num_tokens": 413598122.0, + "step": 10835 + }, + { + "epoch": 
1.3784505788067676, + "ewc_loss": 0.04045839607715607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0229197616572492e-05, + "grad_norm": 27.579389572143555, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8495492935180664, + "num_tokens": 413634519.0, + "step": 10836 + }, + { + "epoch": 1.378577789085358, + "ewc_loss": 0.040371887385845184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0185943867545575e-05, + "grad_norm": 27.51268196105957, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8574038147926331, + "num_tokens": 413674021.0, + "step": 10837 + }, + { + "epoch": 1.3787049993639486, + "ewc_loss": 0.040398579090833664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.019928979279939e-05, + "grad_norm": 27.563661575317383, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.871551513671875, + "num_tokens": 413713804.0, + "step": 10838 + }, + { + "epoch": 1.3788322096425392, + "ewc_loss": 0.04044055938720703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0220279111526906e-05, + "grad_norm": 27.553552627563477, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8727982640266418, + "num_tokens": 413755665.0, + "step": 10839 + }, + { + "epoch": 1.3789594199211297, + "ewc_loss": 0.04045581817626953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.022790977207478e-05, + "grad_norm": 27.513830184936523, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8651072978973389, + "num_tokens": 413790872.0, + "step": 10840 + }, + { + "epoch": 1.3790866301997202, + "ewc_loss": 0.04047269746661186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023634806391783e-05, + "grad_norm": 27.593029022216797, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8853762149810791, + "num_tokens": 413830798.0, + "step": 10841 + }, + { + "epoch": 1.3792138404783105, + "ewc_loss": 0.04046698659658432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0233494069543667e-05, + "grad_norm": 27.36691665649414, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8700430393218994, + "num_tokens": 413865960.0, + "step": 10842 + }, + { + "epoch": 1.379341050756901, + "ewc_loss": 0.040459953248500824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.022997614403721e-05, + "grad_norm": 27.594566345214844, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.87621009349823, + "num_tokens": 413898222.0, + "step": 10843 + }, + { + "epoch": 1.3794682610354916, + "ewc_loss": 0.04065273329615593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0326366211520508e-05, + "grad_norm": 27.602571487426758, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8600497245788574, + "num_tokens": 413931839.0, + "step": 10844 + }, + { + "epoch": 1.3795954713140821, + "ewc_loss": 0.04047539457678795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023769775405526e-05, + "grad_norm": 27.530986785888672, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8568314909934998, + "num_tokens": 413963815.0, + "step": 10845 + }, + { + "epoch": 1.3797226815926726, + "ewc_loss": 0.040586426854133606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.029321331065148e-05, + "grad_norm": 27.634662628173828, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8734333515167236, + "num_tokens": 414002476.0, + "step": 10846 + }, + { + "epoch": 1.3798498918712632, + "ewc_loss": 0.040526144206523895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0263072656234726e-05, + "grad_norm": 27.474706649780273, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8611024618148804, + "num_tokens": 414041121.0, + "step": 10847 + }, + { + "epoch": 1.3799771021498537, + "ewc_loss": 0.04052434116601944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0262170437490568e-05, + "grad_norm": 27.63022232055664, + "learning_rate": 1e-06, + "loss": 0.4876, + 
"mean_token_accuracy": 0.8531006574630737, + "num_tokens": 414080447.0, + "step": 10848 + }, + { + "epoch": 1.3801043124284442, + "ewc_loss": 0.04059358686208725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.029679308179766e-05, + "grad_norm": 27.592668533325195, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8554768562316895, + "num_tokens": 414108925.0, + "step": 10849 + }, + { + "epoch": 1.3802315227070348, + "ewc_loss": 0.040547847747802734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.027392474701628e-05, + "grad_norm": 27.694042205810547, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8709782361984253, + "num_tokens": 414154296.0, + "step": 10850 + }, + { + "epoch": 1.3803587329856253, + "ewc_loss": 0.04048868641257286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0244342522346415e-05, + "grad_norm": 27.57012367248535, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8550581932067871, + "num_tokens": 414194612.0, + "step": 10851 + }, + { + "epoch": 1.3804859432642158, + "ewc_loss": 0.040483780205249786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0241890524630435e-05, + "grad_norm": 27.49227523803711, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8688768148422241, + "num_tokens": 414229939.0, + "step": 10852 + }, + { + "epoch": 1.3806131535428063, + "ewc_loss": 0.0405353382229805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0267669242457487e-05, + "grad_norm": 27.6461124420166, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8622441291809082, + "num_tokens": 414265193.0, + "step": 10853 + }, + { + "epoch": 1.3807403638213969, + "ewc_loss": 0.0405791699886322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0289584426791407e-05, + "grad_norm": 27.605459213256836, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8621571063995361, + "num_tokens": 414301471.0, + "step": 10854 + }, + { + "epoch": 
1.3808675740999874, + "ewc_loss": 0.040547166019678116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0273582777008414e-05, + "grad_norm": 27.609272003173828, + "learning_rate": 1e-06, + "loss": 0.5177, + "mean_token_accuracy": 0.837698221206665, + "num_tokens": 414338755.0, + "step": 10855 + }, + { + "epoch": 1.380994784378578, + "ewc_loss": 0.04048537090420723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0242685422999784e-05, + "grad_norm": 27.389427185058594, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8670141696929932, + "num_tokens": 414379996.0, + "step": 10856 + }, + { + "epoch": 1.3811219946571682, + "ewc_loss": 0.04046429693698883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0232148017385043e-05, + "grad_norm": 27.377086639404297, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8681014776229858, + "num_tokens": 414421905.0, + "step": 10857 + }, + { + "epoch": 1.3812492049357588, + "ewc_loss": 0.040610384196043015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.030519135587383e-05, + "grad_norm": 27.603517532348633, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8716737031936646, + "num_tokens": 414465149.0, + "step": 10858 + }, + { + "epoch": 1.3813764152143493, + "ewc_loss": 0.04061395674943924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0306977603468113e-05, + "grad_norm": 27.47483253479004, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8755682110786438, + "num_tokens": 414504086.0, + "step": 10859 + }, + { + "epoch": 1.3815036254929398, + "ewc_loss": 0.04051092639565468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0255463823559694e-05, + "grad_norm": 27.556194305419922, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8491393327713013, + "num_tokens": 414545392.0, + "step": 10860 + }, + { + "epoch": 1.3816308357715303, + "ewc_loss": 0.04056420177221298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.028210110438522e-05, + "grad_norm": 27.545513153076172, + "learning_rate": 1e-06, + "loss": 0.5396, + "mean_token_accuracy": 0.829993486404419, + "num_tokens": 414586445.0, + "step": 10861 + }, + { + "epoch": 1.3817580460501209, + "ewc_loss": 0.04046235606074333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0231178496032953e-05, + "grad_norm": 27.55738067626953, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8748389482498169, + "num_tokens": 414618467.0, + "step": 10862 + }, + { + "epoch": 1.3818852563287114, + "ewc_loss": 0.04056300222873688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.028150083788205e-05, + "grad_norm": 27.552595138549805, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8758444786071777, + "num_tokens": 414651253.0, + "step": 10863 + }, + { + "epoch": 1.382012466607302, + "ewc_loss": 0.040545348078012466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.027267328230664e-05, + "grad_norm": 27.604007720947266, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8649260997772217, + "num_tokens": 414692016.0, + "step": 10864 + }, + { + "epoch": 1.3821396768858925, + "ewc_loss": 0.04053177684545517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0265888451831415e-05, + "grad_norm": 27.559280395507812, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8707496523857117, + "num_tokens": 414731840.0, + "step": 10865 + }, + { + "epoch": 1.3822668871644828, + "ewc_loss": 0.040559448301792145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0279723685234785e-05, + "grad_norm": 27.560300827026367, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8669254779815674, + "num_tokens": 414767110.0, + "step": 10866 + }, + { + "epoch": 1.3823940974430733, + "ewc_loss": 0.040478963404893875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023948218266014e-05, + "grad_norm": 27.543973922729492, + "learning_rate": 1e-06, + "loss": 0.4153, + 
"mean_token_accuracy": 0.8709320425987244, + "num_tokens": 414799340.0, + "step": 10867 + }, + { + "epoch": 1.3825213077216638, + "ewc_loss": 0.0405266210436821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.026331094384659e-05, + "grad_norm": 27.54006004333496, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8674491047859192, + "num_tokens": 414840915.0, + "step": 10868 + }, + { + "epoch": 1.3826485180002543, + "ewc_loss": 0.04047887399792671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023943670792505e-05, + "grad_norm": 27.612728118896484, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.853783369064331, + "num_tokens": 414877298.0, + "step": 10869 + }, + { + "epoch": 1.3827757282788449, + "ewc_loss": 0.04055379703640938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.027689879469108e-05, + "grad_norm": 27.535341262817383, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8725473880767822, + "num_tokens": 414916431.0, + "step": 10870 + }, + { + "epoch": 1.3829029385574354, + "ewc_loss": 0.040439050644636154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.021952604991384e-05, + "grad_norm": 27.554073333740234, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8638773560523987, + "num_tokens": 414958148.0, + "step": 10871 + }, + { + "epoch": 1.383030148836026, + "ewc_loss": 0.04053566977381706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.026783477049321e-05, + "grad_norm": 27.644237518310547, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.890761137008667, + "num_tokens": 414997008.0, + "step": 10872 + }, + { + "epoch": 1.3831573591146165, + "ewc_loss": 0.04052194580435753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0260973542463034e-05, + "grad_norm": 27.626148223876953, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8524855971336365, + "num_tokens": 415042893.0, + "step": 10873 + }, + { + "epoch": 
1.383284569393207, + "ewc_loss": 0.04045764356851578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0228821085765958e-05, + "grad_norm": 27.504873275756836, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8638596534729004, + "num_tokens": 415077476.0, + "step": 10874 + }, + { + "epoch": 1.3834117796717975, + "ewc_loss": 0.04042994603514671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0214973119436763e-05, + "grad_norm": 27.60608673095703, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8611293435096741, + "num_tokens": 415124329.0, + "step": 10875 + }, + { + "epoch": 1.383538989950388, + "ewc_loss": 0.04048287495970726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0241437596268952e-05, + "grad_norm": 27.527006149291992, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8778814077377319, + "num_tokens": 415164101.0, + "step": 10876 + }, + { + "epoch": 1.3836662002289786, + "ewc_loss": 0.04041367024183273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.02068349608453e-05, + "grad_norm": 27.493667602539062, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8666861057281494, + "num_tokens": 415203792.0, + "step": 10877 + }, + { + "epoch": 1.383793410507569, + "ewc_loss": 0.04049992561340332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.024996319960337e-05, + "grad_norm": 27.539901733398438, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8627520799636841, + "num_tokens": 415233754.0, + "step": 10878 + }, + { + "epoch": 1.3839206207861596, + "ewc_loss": 0.0404566265642643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.022831358772237e-05, + "grad_norm": 27.455490112304688, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8726658821105957, + "num_tokens": 415269567.0, + "step": 10879 + }, + { + "epoch": 1.3840478310647502, + "ewc_loss": 0.040596380829811096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0298190065659583e-05, + "grad_norm": 27.584707260131836, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.868817150592804, + "num_tokens": 415312660.0, + "step": 10880 + }, + { + "epoch": 1.3841750413433405, + "ewc_loss": 0.040484730154275894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.024236528086476e-05, + "grad_norm": 27.367877960205078, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8768101334571838, + "num_tokens": 415355377.0, + "step": 10881 + }, + { + "epoch": 1.384302251621931, + "ewc_loss": 0.04040394350886345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0201970983180217e-05, + "grad_norm": 27.401233673095703, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8781933188438416, + "num_tokens": 415393109.0, + "step": 10882 + }, + { + "epoch": 1.3844294619005215, + "ewc_loss": 0.04062877595424652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.031438816629816e-05, + "grad_norm": 27.68515396118164, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.877943754196167, + "num_tokens": 415428650.0, + "step": 10883 + }, + { + "epoch": 1.384556672179112, + "ewc_loss": 0.04056105762720108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0280529497540556e-05, + "grad_norm": 27.411054611206055, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8636196851730347, + "num_tokens": 415467166.0, + "step": 10884 + }, + { + "epoch": 1.3846838824577026, + "ewc_loss": 0.040526729077100754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0263363694539294e-05, + "grad_norm": 27.58267593383789, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8786464929580688, + "num_tokens": 415505633.0, + "step": 10885 + }, + { + "epoch": 1.384811092736293, + "ewc_loss": 0.04059829190373421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0299146854085848e-05, + "grad_norm": 27.491134643554688, + "learning_rate": 1e-06, + "loss": 0.4288, + 
"mean_token_accuracy": 0.866479218006134, + "num_tokens": 415546254.0, + "step": 10886 + }, + { + "epoch": 1.3849383030148836, + "ewc_loss": 0.04053402692079544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0267012587282807e-05, + "grad_norm": 27.566797256469727, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8645160794258118, + "num_tokens": 415584999.0, + "step": 10887 + }, + { + "epoch": 1.3850655132934742, + "ewc_loss": 0.04059789702296257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.029894858424086e-05, + "grad_norm": 27.42882537841797, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8686188459396362, + "num_tokens": 415624608.0, + "step": 10888 + }, + { + "epoch": 1.3851927235720647, + "ewc_loss": 0.0404808446764946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0240422600181773e-05, + "grad_norm": 27.4907283782959, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8531123995780945, + "num_tokens": 415663405.0, + "step": 10889 + }, + { + "epoch": 1.385319933850655, + "ewc_loss": 0.04060475900769234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.030237919825595e-05, + "grad_norm": 27.60209083557129, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8804236054420471, + "num_tokens": 415698934.0, + "step": 10890 + }, + { + "epoch": 1.3854471441292455, + "ewc_loss": 0.04053836315870285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0269180822651833e-05, + "grad_norm": 27.501501083374023, + "learning_rate": 1e-06, + "loss": 0.5211, + "mean_token_accuracy": 0.8371831774711609, + "num_tokens": 415736973.0, + "step": 10891 + }, + { + "epoch": 1.385574354407836, + "ewc_loss": 0.04055625945329666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.027813025051728e-05, + "grad_norm": 27.615562438964844, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8688790202140808, + "num_tokens": 415778364.0, + "step": 10892 + }, + { + "epoch": 
1.3857015646864266, + "ewc_loss": 0.04055705666542053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0278528609196655e-05, + "grad_norm": 27.58870506286621, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8825418949127197, + "num_tokens": 415817502.0, + "step": 10893 + }, + { + "epoch": 1.385828774965017, + "ewc_loss": 0.040506500750780106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.025325011345558e-05, + "grad_norm": 27.52949333190918, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8615413904190063, + "num_tokens": 415860364.0, + "step": 10894 + }, + { + "epoch": 1.3859559852436076, + "ewc_loss": 0.04051102325320244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0255511117284186e-05, + "grad_norm": 27.553417205810547, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.869475245475769, + "num_tokens": 415896816.0, + "step": 10895 + }, + { + "epoch": 1.3860831955221982, + "ewc_loss": 0.04058206453919411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.029103234235663e-05, + "grad_norm": 27.57537841796875, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8481886982917786, + "num_tokens": 415935173.0, + "step": 10896 + }, + { + "epoch": 1.3862104058007887, + "ewc_loss": 0.040509581565856934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0254790797480382e-05, + "grad_norm": 27.5621337890625, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8757779002189636, + "num_tokens": 415977043.0, + "step": 10897 + }, + { + "epoch": 1.3863376160793792, + "ewc_loss": 0.040494270622730255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0247134671080858e-05, + "grad_norm": 27.49081039428711, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8759970664978027, + "num_tokens": 416014376.0, + "step": 10898 + }, + { + "epoch": 1.3864648263579697, + "ewc_loss": 0.04050284996628761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0251425667083822e-05, + "grad_norm": 27.611024856567383, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8529950976371765, + "num_tokens": 416055648.0, + "step": 10899 + }, + { + "epoch": 1.3865920366365603, + "ewc_loss": 0.04051120579242706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0255602066754363e-05, + "grad_norm": 27.551578521728516, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8575330972671509, + "num_tokens": 416087062.0, + "step": 10900 + }, + { + "epoch": 1.3867192469151508, + "ewc_loss": 0.04051408916711807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0257044525351375e-05, + "grad_norm": 27.54667854309082, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8731995820999146, + "num_tokens": 416122559.0, + "step": 10901 + }, + { + "epoch": 1.3868464571937413, + "ewc_loss": 0.04057852178812027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0289260646677576e-05, + "grad_norm": 27.717330932617188, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8588885068893433, + "num_tokens": 416155228.0, + "step": 10902 + }, + { + "epoch": 1.3869736674723319, + "ewc_loss": 0.04053043946623802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.026521906373091e-05, + "grad_norm": 27.614892959594727, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8529260158538818, + "num_tokens": 416191190.0, + "step": 10903 + }, + { + "epoch": 1.3871008777509224, + "ewc_loss": 0.04047027602791786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0235138435964473e-05, + "grad_norm": 27.525592803955078, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8690462708473206, + "num_tokens": 416228009.0, + "step": 10904 + }, + { + "epoch": 1.387228088029513, + "ewc_loss": 0.040514592081308365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0257295545889065e-05, + "grad_norm": 27.634607315063477, + "learning_rate": 1e-06, + "loss": 0.4383, + 
"mean_token_accuracy": 0.8618040084838867, + "num_tokens": 416263657.0, + "step": 10905 + }, + { + "epoch": 1.3873552983081032, + "ewc_loss": 0.0405704528093338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.028522612818051e-05, + "grad_norm": 27.58989143371582, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8591915369033813, + "num_tokens": 416305187.0, + "step": 10906 + }, + { + "epoch": 1.3874825085866938, + "ewc_loss": 0.04056372493505478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0281862816773355e-05, + "grad_norm": 27.529083251953125, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8630025386810303, + "num_tokens": 416342578.0, + "step": 10907 + }, + { + "epoch": 1.3876097188652843, + "ewc_loss": 0.04058954864740372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0294774003559723e-05, + "grad_norm": 27.78103256225586, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.864277720451355, + "num_tokens": 416378969.0, + "step": 10908 + }, + { + "epoch": 1.3877369291438748, + "ewc_loss": 0.040593575686216354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.029678762482945e-05, + "grad_norm": 27.5869197845459, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8649209141731262, + "num_tokens": 416412958.0, + "step": 10909 + }, + { + "epoch": 1.3878641394224653, + "ewc_loss": 0.040502674877643585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.025133653660305e-05, + "grad_norm": 27.668453216552734, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8474452495574951, + "num_tokens": 416445643.0, + "step": 10910 + }, + { + "epoch": 1.3879913497010559, + "ewc_loss": 0.040605876594781876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0302937628002837e-05, + "grad_norm": 27.81515121459961, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8528432846069336, + "num_tokens": 416484891.0, + "step": 10911 + }, + { + "epoch": 
1.3881185599796464, + "ewc_loss": 0.040613096207380295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0306548321968876e-05, + "grad_norm": 27.717586517333984, + "learning_rate": 1e-06, + "loss": 0.4906, + "mean_token_accuracy": 0.852371335029602, + "num_tokens": 416520504.0, + "step": 10912 + }, + { + "epoch": 1.388245770258237, + "ewc_loss": 0.04049387201666832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.024693640123587e-05, + "grad_norm": 27.47179412841797, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8656680583953857, + "num_tokens": 416561877.0, + "step": 10913 + }, + { + "epoch": 1.3883729805368275, + "ewc_loss": 0.040514927357435226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.025746289291419e-05, + "grad_norm": 27.9548282623291, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8664920330047607, + "num_tokens": 416597676.0, + "step": 10914 + }, + { + "epoch": 1.3885001908154178, + "ewc_loss": 0.040563516318798065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.028175731538795e-05, + "grad_norm": 27.602014541625977, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8724949359893799, + "num_tokens": 416630557.0, + "step": 10915 + }, + { + "epoch": 1.3886274010940083, + "ewc_loss": 0.040478821843862534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0239411242073402e-05, + "grad_norm": 27.663490295410156, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8687162399291992, + "num_tokens": 416668604.0, + "step": 10916 + }, + { + "epoch": 1.3887546113725988, + "ewc_loss": 0.04056541994214058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0282710465835407e-05, + "grad_norm": 27.81961441040039, + "learning_rate": 1e-06, + "loss": 0.4992, + "mean_token_accuracy": 0.8455132246017456, + "num_tokens": 416710246.0, + "step": 10917 + }, + { + "epoch": 1.3888818216511893, + "ewc_loss": 0.0404435358941555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0221767044859007e-05, + "grad_norm": 27.553850173950195, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8670517802238464, + "num_tokens": 416744962.0, + "step": 10918 + }, + { + "epoch": 1.3890090319297799, + "ewc_loss": 0.04042713716626167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0213568859617226e-05, + "grad_norm": 27.532670974731445, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8672547340393066, + "num_tokens": 416779483.0, + "step": 10919 + }, + { + "epoch": 1.3891362422083704, + "ewc_loss": 0.0405702218413353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0285111531848088e-05, + "grad_norm": 27.62833023071289, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8783748149871826, + "num_tokens": 416822135.0, + "step": 10920 + }, + { + "epoch": 1.389263452486961, + "ewc_loss": 0.0405137799680233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0256889911252074e-05, + "grad_norm": 27.519332885742188, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8636094927787781, + "num_tokens": 416860383.0, + "step": 10921 + }, + { + "epoch": 1.3893906627655515, + "ewc_loss": 0.040511354804039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.025567664531991e-05, + "grad_norm": 27.549957275390625, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8611773252487183, + "num_tokens": 416896960.0, + "step": 10922 + }, + { + "epoch": 1.389517873044142, + "ewc_loss": 0.04052944481372833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0264722479623742e-05, + "grad_norm": 27.503665924072266, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8763166666030884, + "num_tokens": 416931981.0, + "step": 10923 + }, + { + "epoch": 1.3896450833227325, + "ewc_loss": 0.040535666048526764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0267832951503806e-05, + "grad_norm": 27.50832748413086, + "learning_rate": 1e-06, + "loss": 0.4012, + 
"mean_token_accuracy": 0.875057578086853, + "num_tokens": 416968179.0, + "step": 10924 + }, + { + "epoch": 1.389772293601323, + "ewc_loss": 0.04056514427065849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0282572222640738e-05, + "grad_norm": 27.679813385009766, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8703349828720093, + "num_tokens": 417008879.0, + "step": 10925 + }, + { + "epoch": 1.3898995038799136, + "ewc_loss": 0.0405556857585907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.027784285019152e-05, + "grad_norm": 27.402864456176758, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.87702476978302, + "num_tokens": 417042497.0, + "step": 10926 + }, + { + "epoch": 1.390026714158504, + "ewc_loss": 0.04049016907811165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.024508467002306e-05, + "grad_norm": 27.740192413330078, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8668162822723389, + "num_tokens": 417084396.0, + "step": 10927 + }, + { + "epoch": 1.3901539244370946, + "ewc_loss": 0.04061758518218994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.030879295489285e-05, + "grad_norm": 27.608642578125, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.878935694694519, + "num_tokens": 417121014.0, + "step": 10928 + }, + { + "epoch": 1.3902811347156852, + "ewc_loss": 0.04039350152015686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.019675048359204e-05, + "grad_norm": 27.52647590637207, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8643312454223633, + "num_tokens": 417160169.0, + "step": 10929 + }, + { + "epoch": 1.3904083449942755, + "ewc_loss": 0.04046984389424324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.023492197622545e-05, + "grad_norm": 27.780315399169922, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8495378494262695, + "num_tokens": 417195024.0, + "step": 10930 + }, + { + "epoch": 1.390535555272866, 
+ "ewc_loss": 0.040512971580028534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0256486095604487e-05, + "grad_norm": 27.455142974853516, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.872066855430603, + "num_tokens": 417231344.0, + "step": 10931 + }, + { + "epoch": 1.3906627655514565, + "ewc_loss": 0.040446341037750244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0223171304678544e-05, + "grad_norm": 27.575571060180664, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8572858572006226, + "num_tokens": 417270266.0, + "step": 10932 + }, + { + "epoch": 1.390789975830047, + "ewc_loss": 0.0405808761715889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.029043753282167e-05, + "grad_norm": 27.625471115112305, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8642597198486328, + "num_tokens": 417308292.0, + "step": 10933 + }, + { + "epoch": 1.3909171861086376, + "ewc_loss": 0.040539342910051346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.026967194979079e-05, + "grad_norm": 27.57918357849121, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.862839937210083, + "num_tokens": 417346358.0, + "step": 10934 + }, + { + "epoch": 1.391044396387228, + "ewc_loss": 0.04054491966962814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0272460460546426e-05, + "grad_norm": 27.677749633789062, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8656930923461914, + "num_tokens": 417381924.0, + "step": 10935 + }, + { + "epoch": 1.3911716066658186, + "ewc_loss": 0.04055716097354889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0278581359889358e-05, + "grad_norm": 27.58955955505371, + "learning_rate": 1e-06, + "loss": 0.5092, + "mean_token_accuracy": 0.8395326733589172, + "num_tokens": 417417779.0, + "step": 10936 + }, + { + "epoch": 1.3912988169444092, + "ewc_loss": 0.040542036294937134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0271018001949415e-05, + 
"grad_norm": 27.650419235229492, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8568446636199951, + "num_tokens": 417459076.0, + "step": 10937 + }, + { + "epoch": 1.3914260272229997, + "ewc_loss": 0.04058507829904556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0292538465582766e-05, + "grad_norm": 27.65522575378418, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.864928662776947, + "num_tokens": 417499422.0, + "step": 10938 + }, + { + "epoch": 1.39155323750159, + "ewc_loss": 0.040578074753284454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.028903691098094e-05, + "grad_norm": 27.58220100402832, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8659487962722778, + "num_tokens": 417541776.0, + "step": 10939 + }, + { + "epoch": 1.3916804477801805, + "ewc_loss": 0.04054476320743561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0272382244002074e-05, + "grad_norm": 27.563940048217773, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.85262531042099, + "num_tokens": 417576469.0, + "step": 10940 + }, + { + "epoch": 1.391807658058771, + "ewc_loss": 0.04059158265590668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0295790818636306e-05, + "grad_norm": 27.511463165283203, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8670896291732788, + "num_tokens": 417611726.0, + "step": 10941 + }, + { + "epoch": 1.3919348683373616, + "ewc_loss": 0.040577568113803864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0288784071453847e-05, + "grad_norm": 27.625015258789062, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.858397364616394, + "num_tokens": 417650135.0, + "step": 10942 + }, + { + "epoch": 1.392062078615952, + "ewc_loss": 0.04060458019375801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0302290067775175e-05, + "grad_norm": 27.57738494873047, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 
0.8468995094299316, + "num_tokens": 417690981.0, + "step": 10943 + }, + { + "epoch": 1.3921892888945426, + "ewc_loss": 0.04057728871703148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0288644009269774e-05, + "grad_norm": 27.59071159362793, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8538499474525452, + "num_tokens": 417732152.0, + "step": 10944 + }, + { + "epoch": 1.3923164991731332, + "ewc_loss": 0.040634628385305405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0317314920248464e-05, + "grad_norm": 27.550952911376953, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8547592163085938, + "num_tokens": 417771814.0, + "step": 10945 + }, + { + "epoch": 1.3924437094517237, + "ewc_loss": 0.0406157411634922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0307870727265254e-05, + "grad_norm": 27.57345199584961, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8551095724105835, + "num_tokens": 417810663.0, + "step": 10946 + }, + { + "epoch": 1.3925709197303142, + "ewc_loss": 0.040580928325653076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0290464817662723e-05, + "grad_norm": 27.519163131713867, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8692165017127991, + "num_tokens": 417845664.0, + "step": 10947 + }, + { + "epoch": 1.3926981300089047, + "ewc_loss": 0.04066230729222298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.033115379163064e-05, + "grad_norm": 27.595165252685547, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8572906851768494, + "num_tokens": 417889617.0, + "step": 10948 + }, + { + "epoch": 1.3928253402874953, + "ewc_loss": 0.040579650551080704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0289824533392675e-05, + "grad_norm": 27.421445846557617, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8614758253097534, + "num_tokens": 417930820.0, + "step": 10949 + }, + { + "epoch": 1.3929525505660858, + 
"ewc_loss": 0.04066624119877815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0333120119175874e-05, + "grad_norm": 27.586170196533203, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.866539478302002, + "num_tokens": 417966799.0, + "step": 10950 + }, + { + "epoch": 1.3930797608446763, + "ewc_loss": 0.040703900158405304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.035195029748138e-05, + "grad_norm": 27.547117233276367, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8572745323181152, + "num_tokens": 418009328.0, + "step": 10951 + }, + { + "epoch": 1.3932069711232669, + "ewc_loss": 0.04067071154713631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.033535565715283e-05, + "grad_norm": 27.541675567626953, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8667060136795044, + "num_tokens": 418048548.0, + "step": 10952 + }, + { + "epoch": 1.3933341814018574, + "ewc_loss": 0.040664996951818466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.033249802479986e-05, + "grad_norm": 27.644081115722656, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8626676201820374, + "num_tokens": 418094947.0, + "step": 10953 + }, + { + "epoch": 1.393461391680448, + "ewc_loss": 0.04065108299255371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.03255422093207e-05, + "grad_norm": 27.577646255493164, + "learning_rate": 1e-06, + "loss": 0.4844, + "mean_token_accuracy": 0.8559410572052002, + "num_tokens": 418137144.0, + "step": 10954 + }, + { + "epoch": 1.3935886019590382, + "ewc_loss": 0.04062283784151077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.031141957559157e-05, + "grad_norm": 27.57131576538086, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8866852521896362, + "num_tokens": 418171449.0, + "step": 10955 + }, + { + "epoch": 1.3937158122376287, + "ewc_loss": 0.04062182456254959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0310912077547982e-05, + 
"grad_norm": 27.47342300415039, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8667806386947632, + "num_tokens": 418209854.0, + "step": 10956 + }, + { + "epoch": 1.3938430225162193, + "ewc_loss": 0.04062962532043457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0314811990829185e-05, + "grad_norm": 27.534526824951172, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8728307485580444, + "num_tokens": 418245121.0, + "step": 10957 + }, + { + "epoch": 1.3939702327948098, + "ewc_loss": 0.040658898651599884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0329449398559518e-05, + "grad_norm": 27.50657844543457, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8688634634017944, + "num_tokens": 418284158.0, + "step": 10958 + }, + { + "epoch": 1.3940974430734003, + "ewc_loss": 0.040700558573007584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0350278646219522e-05, + "grad_norm": 27.525739669799805, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8723022937774658, + "num_tokens": 418324091.0, + "step": 10959 + }, + { + "epoch": 1.3942246533519909, + "ewc_loss": 0.040691010653972626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0345505618024617e-05, + "grad_norm": 27.60797882080078, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8652527332305908, + "num_tokens": 418364764.0, + "step": 10960 + }, + { + "epoch": 1.3943518636305814, + "ewc_loss": 0.04067472368478775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0337362002464943e-05, + "grad_norm": 27.55286407470703, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8615595698356628, + "num_tokens": 418403974.0, + "step": 10961 + }, + { + "epoch": 1.394479073909172, + "ewc_loss": 0.04066137969493866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0330689949332736e-05, + "grad_norm": 27.52508544921875, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 
0.8557558059692383, + "num_tokens": 418446341.0, + "step": 10962 + }, + { + "epoch": 1.3946062841877624, + "ewc_loss": 0.04071969538927078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0359848349471577e-05, + "grad_norm": 27.673690795898438, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8651594519615173, + "num_tokens": 418490989.0, + "step": 10963 + }, + { + "epoch": 1.3947334944663528, + "ewc_loss": 0.040615327656269073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.030766336247325e-05, + "grad_norm": 27.530139923095703, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8600127696990967, + "num_tokens": 418524633.0, + "step": 10964 + }, + { + "epoch": 1.3948607047449433, + "ewc_loss": 0.040594473481178284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0297236915212125e-05, + "grad_norm": 27.520456314086914, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8638455271720886, + "num_tokens": 418562576.0, + "step": 10965 + }, + { + "epoch": 1.3949879150235338, + "ewc_loss": 0.040651120245456696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0325560399214737e-05, + "grad_norm": 27.619136810302734, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8642476797103882, + "num_tokens": 418596537.0, + "step": 10966 + }, + { + "epoch": 1.3951151253021243, + "ewc_loss": 0.04067175090312958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0335875888122246e-05, + "grad_norm": 27.588947296142578, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8628239035606384, + "num_tokens": 418625800.0, + "step": 10967 + }, + { + "epoch": 1.3952423355807149, + "ewc_loss": 0.040682949125766754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.034147473750636e-05, + "grad_norm": 27.58749008178711, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8506794571876526, + "num_tokens": 418663818.0, + "step": 10968 + }, + { + "epoch": 1.3953695458593054, + 
"ewc_loss": 0.040593188256025314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0296594811952673e-05, + "grad_norm": 27.536182403564453, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8704127669334412, + "num_tokens": 418700607.0, + "step": 10969 + }, + { + "epoch": 1.395496756137896, + "ewc_loss": 0.04063434526324272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0317173039074987e-05, + "grad_norm": 27.533855438232422, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.858188271522522, + "num_tokens": 418739357.0, + "step": 10970 + }, + { + "epoch": 1.3956239664164865, + "ewc_loss": 0.0407504066824913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.037520425801631e-05, + "grad_norm": 27.65791893005371, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8698899149894714, + "num_tokens": 418769752.0, + "step": 10971 + }, + { + "epoch": 1.395751176695077, + "ewc_loss": 0.04072517529129982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0362587747513317e-05, + "grad_norm": 27.46631622314453, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8463618755340576, + "num_tokens": 418808196.0, + "step": 10972 + }, + { + "epoch": 1.3958783869736675, + "ewc_loss": 0.04075193777680397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.03759682335658e-05, + "grad_norm": 27.76999282836914, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8809728622436523, + "num_tokens": 418843335.0, + "step": 10973 + }, + { + "epoch": 1.396005597252258, + "ewc_loss": 0.040801189839839935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0400595531100407e-05, + "grad_norm": 27.63827896118164, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8734662532806396, + "num_tokens": 418876591.0, + "step": 10974 + }, + { + "epoch": 1.3961328075308486, + "ewc_loss": 0.040635060518980026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0317529560998082e-05, + "grad_norm": 
27.604520797729492, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8514409065246582, + "num_tokens": 418910482.0, + "step": 10975 + }, + { + "epoch": 1.396260017809439, + "ewc_loss": 0.040766775608062744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0383387891342863e-05, + "grad_norm": 27.70942497253418, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8563696146011353, + "num_tokens": 418942248.0, + "step": 10976 + }, + { + "epoch": 1.3963872280880296, + "ewc_loss": 0.04073364660143852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0366824173834175e-05, + "grad_norm": 27.59964370727539, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8690683245658875, + "num_tokens": 418977770.0, + "step": 10977 + }, + { + "epoch": 1.3965144383666201, + "ewc_loss": 0.04072944074869156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0364719603094272e-05, + "grad_norm": 27.606746673583984, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8572394847869873, + "num_tokens": 419016464.0, + "step": 10978 + }, + { + "epoch": 1.3966416486452105, + "ewc_loss": 0.04074953868985176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0374769519548863e-05, + "grad_norm": 27.738780975341797, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8554164171218872, + "num_tokens": 419053254.0, + "step": 10979 + }, + { + "epoch": 1.396768858923801, + "ewc_loss": 0.040766190737485886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0383095034048893e-05, + "grad_norm": 27.635929107666016, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8593988418579102, + "num_tokens": 419088940.0, + "step": 10980 + }, + { + "epoch": 1.3968960692023915, + "ewc_loss": 0.040702879428863525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0351439161458984e-05, + "grad_norm": 27.535858154296875, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8638913631439209, + 
"num_tokens": 419127897.0, + "step": 10981 + }, + { + "epoch": 1.397023279480982, + "ewc_loss": 0.04074414446949959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0372071958263405e-05, + "grad_norm": 27.5721492767334, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8670091032981873, + "num_tokens": 419170921.0, + "step": 10982 + }, + { + "epoch": 1.3971504897595726, + "ewc_loss": 0.0407617948949337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038089769484941e-05, + "grad_norm": 27.63819122314453, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8710047602653503, + "num_tokens": 419206032.0, + "step": 10983 + }, + { + "epoch": 1.397277700038163, + "ewc_loss": 0.04075240343809128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0376201064209454e-05, + "grad_norm": 27.613975524902344, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8795419931411743, + "num_tokens": 419244240.0, + "step": 10984 + }, + { + "epoch": 1.3974049103167536, + "ewc_loss": 0.040748633444309235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.037431659118738e-05, + "grad_norm": 27.706525802612305, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8729002475738525, + "num_tokens": 419274902.0, + "step": 10985 + }, + { + "epoch": 1.3975321205953442, + "ewc_loss": 0.0407317653298378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0365881937323138e-05, + "grad_norm": 27.620359420776367, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8767129182815552, + "num_tokens": 419312295.0, + "step": 10986 + }, + { + "epoch": 1.3976593308739347, + "ewc_loss": 0.0407433919608593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.037169542745687e-05, + "grad_norm": 27.657066345214844, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8476870656013489, + "num_tokens": 419350999.0, + "step": 10987 + }, + { + "epoch": 1.397786541152525, + "ewc_loss": 0.04075196385383606, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0375982785481028e-05, + "grad_norm": 27.6007080078125, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8660363554954529, + "num_tokens": 419386739.0, + "step": 10988 + }, + { + "epoch": 1.3979137514311155, + "ewc_loss": 0.04078233987092972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0391169528011233e-05, + "grad_norm": 27.60588264465332, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8715516328811646, + "num_tokens": 419430353.0, + "step": 10989 + }, + { + "epoch": 1.398040961709706, + "ewc_loss": 0.04073125496506691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0365627278806642e-05, + "grad_norm": 27.541296005249023, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8601747751235962, + "num_tokens": 419462910.0, + "step": 10990 + }, + { + "epoch": 1.3981681719882966, + "ewc_loss": 0.04071129113435745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0355646483949386e-05, + "grad_norm": 27.67948341369629, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8746756911277771, + "num_tokens": 419498727.0, + "step": 10991 + }, + { + "epoch": 1.398295382266887, + "ewc_loss": 0.04072976112365723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0364879674161784e-05, + "grad_norm": 27.62727165222168, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.855686366558075, + "num_tokens": 419534496.0, + "step": 10992 + }, + { + "epoch": 1.3984225925454776, + "ewc_loss": 0.040674369782209396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.03371855604928e-05, + "grad_norm": 27.607868194580078, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8611977696418762, + "num_tokens": 419575760.0, + "step": 10993 + }, + { + "epoch": 1.3985498028240682, + "ewc_loss": 0.04070485755801201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0352428691694513e-05, + "grad_norm": 27.640769958496094, + 
"learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8713353872299194, + "num_tokens": 419616822.0, + "step": 10994 + }, + { + "epoch": 1.3986770131026587, + "ewc_loss": 0.0406811386346817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0340568880783394e-05, + "grad_norm": 27.685495376586914, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8717941045761108, + "num_tokens": 419650425.0, + "step": 10995 + }, + { + "epoch": 1.3988042233812492, + "ewc_loss": 0.040699612349271774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.03498057089746e-05, + "grad_norm": 27.526351928710938, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8720294237136841, + "num_tokens": 419685501.0, + "step": 10996 + }, + { + "epoch": 1.3989314336598397, + "ewc_loss": 0.04068957641720772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.034478893619962e-05, + "grad_norm": 27.694849014282227, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8675776124000549, + "num_tokens": 419724471.0, + "step": 10997 + }, + { + "epoch": 1.3990586439384303, + "ewc_loss": 0.04079698771238327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0398494598339312e-05, + "grad_norm": 27.6623592376709, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.873609185218811, + "num_tokens": 419763683.0, + "step": 10998 + }, + { + "epoch": 1.3991858542170208, + "ewc_loss": 0.040691789239645004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0345894881756976e-05, + "grad_norm": 27.61278533935547, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8679487705230713, + "num_tokens": 419801238.0, + "step": 10999 + }, + { + "epoch": 1.3993130644956113, + "ewc_loss": 0.040701400488615036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0350700651761144e-05, + "grad_norm": 27.73838233947754, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8621085286140442, + "num_tokens": 419832787.0, 
+ "step": 11000 + }, + { + "epoch": 1.3994402747742019, + "ewc_loss": 0.04071810469031334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0359051632112823e-05, + "grad_norm": 27.778648376464844, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8546357154846191, + "num_tokens": 419864294.0, + "step": 11001 + }, + { + "epoch": 1.3995674850527924, + "ewc_loss": 0.040623970329761505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0311985281296074e-05, + "grad_norm": 27.5565185546875, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8685176372528076, + "num_tokens": 419900493.0, + "step": 11002 + }, + { + "epoch": 1.399694695331383, + "ewc_loss": 0.04070103541016579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0350516933831386e-05, + "grad_norm": 27.786191940307617, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8838786482810974, + "num_tokens": 419938917.0, + "step": 11003 + }, + { + "epoch": 1.3998219056099732, + "ewc_loss": 0.0407729372382164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038646925939247e-05, + "grad_norm": 27.770212173461914, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8808750510215759, + "num_tokens": 419978771.0, + "step": 11004 + }, + { + "epoch": 1.3999491158885637, + "ewc_loss": 0.04067632928490639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0338164176791906e-05, + "grad_norm": 27.623348236083984, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8628550171852112, + "num_tokens": 420011836.0, + "step": 11005 + }, + { + "epoch": 1.4000763261671543, + "ewc_loss": 0.04065673053264618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0328365280875005e-05, + "grad_norm": 27.886247634887695, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.84437495470047, + "num_tokens": 420051618.0, + "step": 11006 + }, + { + "epoch": 1.4002035364457448, + "ewc_loss": 0.04072108864784241, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 2.0360545022413135e-05, + "grad_norm": 27.5098819732666, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8519372940063477, + "num_tokens": 420089119.0, + "step": 11007 + }, + { + "epoch": 1.4003307467243353, + "ewc_loss": 0.040607795119285583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.030389805440791e-05, + "grad_norm": 27.9906063079834, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8760589361190796, + "num_tokens": 420123706.0, + "step": 11008 + }, + { + "epoch": 1.4004579570029259, + "ewc_loss": 0.04084254428744316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0421271983650513e-05, + "grad_norm": 27.753379821777344, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.862572193145752, + "num_tokens": 420156723.0, + "step": 11009 + }, + { + "epoch": 1.4005851672815164, + "ewc_loss": 0.040452029556035995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0226014385116287e-05, + "grad_norm": 27.547977447509766, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8652613162994385, + "num_tokens": 420198519.0, + "step": 11010 + }, + { + "epoch": 1.400712377560107, + "ewc_loss": 0.040635012090206146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0317505914135836e-05, + "grad_norm": 27.602767944335938, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8779826760292053, + "num_tokens": 420231722.0, + "step": 11011 + }, + { + "epoch": 1.4008395878386974, + "ewc_loss": 0.04059944674372673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0299723473726772e-05, + "grad_norm": 27.58942222595215, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8682997822761536, + "num_tokens": 420272059.0, + "step": 11012 + }, + { + "epoch": 1.4009667981172877, + "ewc_loss": 0.04060148075222969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0300740288803354e-05, + "grad_norm": 27.618602752685547, + "learning_rate": 1e-06, + 
"loss": 0.4147, + "mean_token_accuracy": 0.8716253042221069, + "num_tokens": 420307695.0, + "step": 11013 + }, + { + "epoch": 1.4010940083958783, + "ewc_loss": 0.04068874195218086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0344370568636805e-05, + "grad_norm": 27.642715454101562, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8754186630249023, + "num_tokens": 420345989.0, + "step": 11014 + }, + { + "epoch": 1.4012212186744688, + "ewc_loss": 0.040714994072914124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0357496396172792e-05, + "grad_norm": 27.614091873168945, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8720445036888123, + "num_tokens": 420387194.0, + "step": 11015 + }, + { + "epoch": 1.4013484289530593, + "ewc_loss": 0.040618106722831726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.030905307037756e-05, + "grad_norm": 27.518705368041992, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8810981512069702, + "num_tokens": 420427608.0, + "step": 11016 + }, + { + "epoch": 1.4014756392316499, + "ewc_loss": 0.04073743894696236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.036871956079267e-05, + "grad_norm": 27.627639770507812, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8719604015350342, + "num_tokens": 420461363.0, + "step": 11017 + }, + { + "epoch": 1.4016028495102404, + "ewc_loss": 0.04067010059952736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0335050066933036e-05, + "grad_norm": 27.447017669677734, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8478321433067322, + "num_tokens": 420503577.0, + "step": 11018 + }, + { + "epoch": 1.401730059788831, + "ewc_loss": 0.04072313383221626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0361567294457927e-05, + "grad_norm": 27.717758178710938, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8769437670707703, + "num_tokens": 420538852.0, + "step": 11019 + }, 
+ { + "epoch": 1.4018572700674214, + "ewc_loss": 0.040804293006658554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0402147129061632e-05, + "grad_norm": 27.605195999145508, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8630653023719788, + "num_tokens": 420577495.0, + "step": 11020 + }, + { + "epoch": 1.401984480346012, + "ewc_loss": 0.04070613160729408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0353065337985754e-05, + "grad_norm": 27.714462280273438, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8655586838722229, + "num_tokens": 420616123.0, + "step": 11021 + }, + { + "epoch": 1.4021116906246025, + "ewc_loss": 0.04069933295249939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0349665646790527e-05, + "grad_norm": 27.49791717529297, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8755264282226562, + "num_tokens": 420653904.0, + "step": 11022 + }, + { + "epoch": 1.402238900903193, + "ewc_loss": 0.04057956859469414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0289784515625797e-05, + "grad_norm": 27.555086135864258, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8893724679946899, + "num_tokens": 420693539.0, + "step": 11023 + }, + { + "epoch": 1.4023661111817836, + "ewc_loss": 0.040798261761665344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0399131244630553e-05, + "grad_norm": 27.594057083129883, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8707639575004578, + "num_tokens": 420729046.0, + "step": 11024 + }, + { + "epoch": 1.402493321460374, + "ewc_loss": 0.040698207914829254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.034910357906483e-05, + "grad_norm": 27.645681381225586, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.868100643157959, + "num_tokens": 420766143.0, + "step": 11025 + }, + { + "epoch": 1.4026205317389646, + "ewc_loss": 0.04076965153217316, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.0384824892971665e-05, + "grad_norm": 27.55388641357422, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8632116317749023, + "num_tokens": 420806367.0, + "step": 11026 + }, + { + "epoch": 1.4027477420175551, + "ewc_loss": 0.040727559477090836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.036377918557264e-05, + "grad_norm": 27.69060516357422, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8719468116760254, + "num_tokens": 420845081.0, + "step": 11027 + }, + { + "epoch": 1.4028749522961454, + "ewc_loss": 0.04076579213142395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0382896764203906e-05, + "grad_norm": 27.49468421936035, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8705109357833862, + "num_tokens": 420885833.0, + "step": 11028 + }, + { + "epoch": 1.403002162574736, + "ewc_loss": 0.040603019297122955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0301509721321054e-05, + "grad_norm": 27.62612533569336, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8707382082939148, + "num_tokens": 420916313.0, + "step": 11029 + }, + { + "epoch": 1.4031293728533265, + "ewc_loss": 0.04080215469002724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0401077563292347e-05, + "grad_norm": 27.570554733276367, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8490103483200073, + "num_tokens": 420957011.0, + "step": 11030 + }, + { + "epoch": 1.403256583131917, + "ewc_loss": 0.040684182196855545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0342091374914162e-05, + "grad_norm": 27.64460563659668, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8702713251113892, + "num_tokens": 420994815.0, + "step": 11031 + }, + { + "epoch": 1.4033837934105076, + "ewc_loss": 0.040817808359861374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0408904674695805e-05, + "grad_norm": 27.51663589477539, + "learning_rate": 1e-06, + 
"loss": 0.4022, + "mean_token_accuracy": 0.8731683492660522, + "num_tokens": 421033977.0, + "step": 11032 + }, + { + "epoch": 1.403511003689098, + "ewc_loss": 0.04072354733943939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0361772840260528e-05, + "grad_norm": 27.715463638305664, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8698465824127197, + "num_tokens": 421068871.0, + "step": 11033 + }, + { + "epoch": 1.4036382139676886, + "ewc_loss": 0.040820714086294174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0410356228239834e-05, + "grad_norm": 27.499794006347656, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.867047905921936, + "num_tokens": 421105680.0, + "step": 11034 + }, + { + "epoch": 1.4037654242462791, + "ewc_loss": 0.04068328067660332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0341640265542082e-05, + "grad_norm": 27.642562866210938, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.878247857093811, + "num_tokens": 421149519.0, + "step": 11035 + }, + { + "epoch": 1.4038926345248697, + "ewc_loss": 0.04088858142495155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0444291294552386e-05, + "grad_norm": 27.707595825195312, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8676252961158752, + "num_tokens": 421190195.0, + "step": 11036 + }, + { + "epoch": 1.40401984480346, + "ewc_loss": 0.04064521566033363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0322608179412782e-05, + "grad_norm": 27.51263999938965, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8604053258895874, + "num_tokens": 421218637.0, + "step": 11037 + }, + { + "epoch": 1.4041470550820505, + "ewc_loss": 0.04065508395433426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.03275412786752e-05, + "grad_norm": 27.48187828063965, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8699557781219482, + "num_tokens": 421261300.0, + "step": 11038 + }, + { + 
"epoch": 1.404274265360641, + "ewc_loss": 0.04073875769972801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0369378034956753e-05, + "grad_norm": 27.734058380126953, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8627520203590393, + "num_tokens": 421297250.0, + "step": 11039 + }, + { + "epoch": 1.4044014756392316, + "ewc_loss": 0.04074122756719589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.037061312876176e-05, + "grad_norm": 27.662994384765625, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8592694401741028, + "num_tokens": 421334213.0, + "step": 11040 + }, + { + "epoch": 1.404528685917822, + "ewc_loss": 0.04064714536070824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.032357224379666e-05, + "grad_norm": 27.654367446899414, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8740100860595703, + "num_tokens": 421368341.0, + "step": 11041 + }, + { + "epoch": 1.4046558961964126, + "ewc_loss": 0.040686704218387604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.034335193457082e-05, + "grad_norm": 27.703628540039062, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.853328287601471, + "num_tokens": 421408278.0, + "step": 11042 + }, + { + "epoch": 1.4047831064750032, + "ewc_loss": 0.040744226425886154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0372113795019686e-05, + "grad_norm": 27.732412338256836, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8576467037200928, + "num_tokens": 421442559.0, + "step": 11043 + }, + { + "epoch": 1.4049103167535937, + "ewc_loss": 0.04063628241419792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0318140741437674e-05, + "grad_norm": 27.648744583129883, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8634961247444153, + "num_tokens": 421474014.0, + "step": 11044 + }, + { + "epoch": 1.4050375270321842, + "ewc_loss": 0.04076201096177101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.038100501522422e-05, + "grad_norm": 27.721235275268555, + "learning_rate": 1e-06, + "loss": 0.5071, + "mean_token_accuracy": 0.841403067111969, + "num_tokens": 421511193.0, + "step": 11045 + }, + { + "epoch": 1.4051647373107747, + "ewc_loss": 0.04068043455481529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.034021781582851e-05, + "grad_norm": 27.64613914489746, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8676011562347412, + "num_tokens": 421547895.0, + "step": 11046 + }, + { + "epoch": 1.4052919475893653, + "ewc_loss": 0.04066232591867447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.033116288657766e-05, + "grad_norm": 27.656396865844727, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8655115365982056, + "num_tokens": 421582883.0, + "step": 11047 + }, + { + "epoch": 1.4054191578679558, + "ewc_loss": 0.040721114724874496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.036055775533896e-05, + "grad_norm": 27.574981689453125, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8685316443443298, + "num_tokens": 421619838.0, + "step": 11048 + }, + { + "epoch": 1.4055463681465463, + "ewc_loss": 0.04072127118706703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0360635971883312e-05, + "grad_norm": 27.62639808654785, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8765103816986084, + "num_tokens": 421659451.0, + "step": 11049 + }, + { + "epoch": 1.4056735784251368, + "ewc_loss": 0.04074627906084061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0373139705043286e-05, + "grad_norm": 27.499324798583984, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.863417387008667, + "num_tokens": 421704826.0, + "step": 11050 + }, + { + "epoch": 1.4058007887037274, + "ewc_loss": 0.04076061770319939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038030834228266e-05, + "grad_norm": 27.666728973388672, + "learning_rate": 1e-06, + "loss": 0.4367, + 
"mean_token_accuracy": 0.8626369833946228, + "num_tokens": 421748953.0, + "step": 11051 + }, + { + "epoch": 1.405927998982318, + "ewc_loss": 0.040839191526174545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0419594875420444e-05, + "grad_norm": 27.581239700317383, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8489947319030762, + "num_tokens": 421786614.0, + "step": 11052 + }, + { + "epoch": 1.4060552092609082, + "ewc_loss": 0.040813665837049484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0406832845765166e-05, + "grad_norm": 27.74797821044922, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8598586916923523, + "num_tokens": 421817291.0, + "step": 11053 + }, + { + "epoch": 1.4061824195394987, + "ewc_loss": 0.04085460677742958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.042730375251267e-05, + "grad_norm": 27.647924423217773, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8637937307357788, + "num_tokens": 421858200.0, + "step": 11054 + }, + { + "epoch": 1.4063096298180893, + "ewc_loss": 0.040663402527570724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0331701307441108e-05, + "grad_norm": 27.635637283325195, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8663111925125122, + "num_tokens": 421903343.0, + "step": 11055 + }, + { + "epoch": 1.4064368400966798, + "ewc_loss": 0.04080244526267052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.040122308244463e-05, + "grad_norm": 27.740991592407227, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8820099830627441, + "num_tokens": 421941426.0, + "step": 11056 + }, + { + "epoch": 1.4065640503752703, + "ewc_loss": 0.04072442278265953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0362211216706783e-05, + "grad_norm": 27.60072898864746, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.862924337387085, + "num_tokens": 421983992.0, + "step": 11057 + }, + { + "epoch": 
1.4066912606538609, + "ewc_loss": 0.040716297924518585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0358149413368665e-05, + "grad_norm": 27.71583366394043, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8740151524543762, + "num_tokens": 422015066.0, + "step": 11058 + }, + { + "epoch": 1.4068184709324514, + "ewc_loss": 0.040834929794073105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.041746483882889e-05, + "grad_norm": 27.733047485351562, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8686757683753967, + "num_tokens": 422052392.0, + "step": 11059 + }, + { + "epoch": 1.406945681211042, + "ewc_loss": 0.04064158722758293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0320792827988043e-05, + "grad_norm": 27.511579513549805, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8611751794815063, + "num_tokens": 422091453.0, + "step": 11060 + }, + { + "epoch": 1.4070728914896324, + "ewc_loss": 0.04074813425540924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0374067389639094e-05, + "grad_norm": 27.73958396911621, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8497380018234253, + "num_tokens": 422131019.0, + "step": 11061 + }, + { + "epoch": 1.4072001017682227, + "ewc_loss": 0.04071113467216492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0355568267405033e-05, + "grad_norm": 27.538694381713867, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8595016002655029, + "num_tokens": 422172139.0, + "step": 11062 + }, + { + "epoch": 1.4073273120468133, + "ewc_loss": 0.040703970938920975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0351984858280048e-05, + "grad_norm": 27.77143096923828, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8733607530593872, + "num_tokens": 422209708.0, + "step": 11063 + }, + { + "epoch": 1.4074545223254038, + "ewc_loss": 0.04084091633558273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0420458895387128e-05, + "grad_norm": 27.699552536010742, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8665720224380493, + "num_tokens": 422245091.0, + "step": 11064 + }, + { + "epoch": 1.4075817326039943, + "ewc_loss": 0.0406373031437397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.031865187746007e-05, + "grad_norm": 27.73149299621582, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8665683269500732, + "num_tokens": 422280054.0, + "step": 11065 + }, + { + "epoch": 1.4077089428825849, + "ewc_loss": 0.04077017307281494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0385086827445775e-05, + "grad_norm": 27.621257781982422, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8720280528068542, + "num_tokens": 422310036.0, + "step": 11066 + }, + { + "epoch": 1.4078361531611754, + "ewc_loss": 0.04068244248628616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0341221897979267e-05, + "grad_norm": 27.716060638427734, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8508853912353516, + "num_tokens": 422347914.0, + "step": 11067 + }, + { + "epoch": 1.407963363439766, + "ewc_loss": 0.04070331156253815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0351655621198006e-05, + "grad_norm": 27.650972366333008, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8664966821670532, + "num_tokens": 422383207.0, + "step": 11068 + }, + { + "epoch": 1.4080905737183564, + "ewc_loss": 0.04068262502551079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0341312847449444e-05, + "grad_norm": 27.69192123413086, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8697693347930908, + "num_tokens": 422414675.0, + "step": 11069 + }, + { + "epoch": 1.408217783996947, + "ewc_loss": 0.04079212248325348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0396060790517367e-05, + "grad_norm": 27.684560775756836, + "learning_rate": 1e-06, + "loss": 0.4623, + 
"mean_token_accuracy": 0.8548731803894043, + "num_tokens": 422449746.0, + "step": 11070 + }, + { + "epoch": 1.4083449942755375, + "ewc_loss": 0.04077395051717758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0386974938446656e-05, + "grad_norm": 27.691957473754883, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8590641617774963, + "num_tokens": 422487789.0, + "step": 11071 + }, + { + "epoch": 1.408472204554128, + "ewc_loss": 0.04077793285250664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038896673184354e-05, + "grad_norm": 27.697782516479492, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.871475338935852, + "num_tokens": 422521750.0, + "step": 11072 + }, + { + "epoch": 1.4085994148327186, + "ewc_loss": 0.040777795016765594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0388897610246204e-05, + "grad_norm": 27.624164581298828, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8745763301849365, + "num_tokens": 422562923.0, + "step": 11073 + }, + { + "epoch": 1.408726625111309, + "ewc_loss": 0.04078849405050278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0394247258082032e-05, + "grad_norm": 27.663482666015625, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8550755977630615, + "num_tokens": 422606832.0, + "step": 11074 + }, + { + "epoch": 1.4088538353898996, + "ewc_loss": 0.04082351550459862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0411756850080565e-05, + "grad_norm": 27.589906692504883, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8688938617706299, + "num_tokens": 422644465.0, + "step": 11075 + }, + { + "epoch": 1.4089810456684901, + "ewc_loss": 0.04078000783920288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.039000355580356e-05, + "grad_norm": 27.6427059173584, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.874316394329071, + "num_tokens": 422681204.0, + "step": 11076 + }, + { + "epoch": 
1.4091082559470804, + "ewc_loss": 0.040778957307338715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0389477867865935e-05, + "grad_norm": 27.612192153930664, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8564922213554382, + "num_tokens": 422716325.0, + "step": 11077 + }, + { + "epoch": 1.409235466225671, + "ewc_loss": 0.040897563099861145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044878237938974e-05, + "grad_norm": 27.667844772338867, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8549745082855225, + "num_tokens": 422753644.0, + "step": 11078 + }, + { + "epoch": 1.4093626765042615, + "ewc_loss": 0.04076779633760452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038389902736526e-05, + "grad_norm": 27.622333526611328, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8655171990394592, + "num_tokens": 422790893.0, + "step": 11079 + }, + { + "epoch": 1.409489886782852, + "ewc_loss": 0.04087312892079353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0436564227566123e-05, + "grad_norm": 27.64214324951172, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8584616780281067, + "num_tokens": 422829400.0, + "step": 11080 + }, + { + "epoch": 1.4096170970614426, + "ewc_loss": 0.04082382470369339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.041191328316927e-05, + "grad_norm": 27.629621505737305, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8750249147415161, + "num_tokens": 422867386.0, + "step": 11081 + }, + { + "epoch": 1.409744307340033, + "ewc_loss": 0.04082266986370087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0411334844538942e-05, + "grad_norm": 27.601991653442383, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8686697483062744, + "num_tokens": 422910736.0, + "step": 11082 + }, + { + "epoch": 1.4098715176186236, + "ewc_loss": 0.04088197648525238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0440988009795547e-05, + "grad_norm": 27.718894958496094, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8701502084732056, + "num_tokens": 422950302.0, + "step": 11083 + }, + { + "epoch": 1.4099987278972141, + "ewc_loss": 0.04091035574674606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0455177946132608e-05, + "grad_norm": 27.65983009338379, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8646581172943115, + "num_tokens": 422985892.0, + "step": 11084 + }, + { + "epoch": 1.4101259381758047, + "ewc_loss": 0.040891069918870926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044553548330441e-05, + "grad_norm": 27.7529239654541, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8580373525619507, + "num_tokens": 423020075.0, + "step": 11085 + }, + { + "epoch": 1.410253148454395, + "ewc_loss": 0.040924638509750366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046231929853093e-05, + "grad_norm": 27.636083602905273, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8686048984527588, + "num_tokens": 423058260.0, + "step": 11086 + }, + { + "epoch": 1.4103803587329855, + "ewc_loss": 0.04078908637166023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.039454375335481e-05, + "grad_norm": 27.652381896972656, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8890296816825867, + "num_tokens": 423092158.0, + "step": 11087 + }, + { + "epoch": 1.410507569011576, + "ewc_loss": 0.0408070869743824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0403544112923555e-05, + "grad_norm": 27.585586547851562, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.863542914390564, + "num_tokens": 423123999.0, + "step": 11088 + }, + { + "epoch": 1.4106347792901666, + "ewc_loss": 0.04077339544892311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0386698452057317e-05, + "grad_norm": 27.685285568237305, + "learning_rate": 1e-06, + "loss": 0.4484, + 
"mean_token_accuracy": 0.8630568981170654, + "num_tokens": 423167111.0, + "step": 11089 + }, + { + "epoch": 1.410761989568757, + "ewc_loss": 0.04090297967195511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0451489035622217e-05, + "grad_norm": 27.713199615478516, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8576465845108032, + "num_tokens": 423202060.0, + "step": 11090 + }, + { + "epoch": 1.4108891998473476, + "ewc_loss": 0.04082566872239113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0412833691807464e-05, + "grad_norm": 27.558536529541016, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8577253818511963, + "num_tokens": 423241200.0, + "step": 11091 + }, + { + "epoch": 1.4110164101259381, + "ewc_loss": 0.040881574153900146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0440787920961156e-05, + "grad_norm": 27.687885284423828, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8749114274978638, + "num_tokens": 423284713.0, + "step": 11092 + }, + { + "epoch": 1.4111436204045287, + "ewc_loss": 0.04090692847967148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0453464458114468e-05, + "grad_norm": 27.619176864624023, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8686736822128296, + "num_tokens": 423322524.0, + "step": 11093 + }, + { + "epoch": 1.4112708306831192, + "ewc_loss": 0.04082629084587097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.041314473899547e-05, + "grad_norm": 27.580503463745117, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.866712749004364, + "num_tokens": 423360551.0, + "step": 11094 + }, + { + "epoch": 1.4113980409617097, + "ewc_loss": 0.04097175970673561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0485880668275058e-05, + "grad_norm": 27.696002960205078, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8759858012199402, + "num_tokens": 423395227.0, + "step": 11095 + }, + { + "epoch": 
1.4115252512403003, + "ewc_loss": 0.040841855108737946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0420928194653243e-05, + "grad_norm": 27.56878662109375, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.8524372577667236, + "num_tokens": 423434757.0, + "step": 11096 + }, + { + "epoch": 1.4116524615188908, + "ewc_loss": 0.040876854211091995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0438426872715354e-05, + "grad_norm": 27.634366989135742, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.883563220500946, + "num_tokens": 423464599.0, + "step": 11097 + }, + { + "epoch": 1.4117796717974813, + "ewc_loss": 0.0408870093524456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0443505491130054e-05, + "grad_norm": 27.682483673095703, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8583636283874512, + "num_tokens": 423504327.0, + "step": 11098 + }, + { + "epoch": 1.4119068820760718, + "ewc_loss": 0.04096226021647453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0481129467952996e-05, + "grad_norm": 27.680734634399414, + "learning_rate": 1e-06, + "loss": 0.4977, + "mean_token_accuracy": 0.8473236560821533, + "num_tokens": 423543114.0, + "step": 11099 + }, + { + "epoch": 1.4120340923546624, + "ewc_loss": 0.04083014652132988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.041507286776323e-05, + "grad_norm": 27.630416870117188, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8542360067367554, + "num_tokens": 423581727.0, + "step": 11100 + }, + { + "epoch": 1.412161302633253, + "ewc_loss": 0.040901560336351395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0450779629754834e-05, + "grad_norm": 27.66482925415039, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8700767755508423, + "num_tokens": 423617073.0, + "step": 11101 + }, + { + "epoch": 1.4122885129118432, + "ewc_loss": 0.040863290429115295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0431645680218935e-05, + "grad_norm": 27.65515899658203, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.869547963142395, + "num_tokens": 423664922.0, + "step": 11102 + }, + { + "epoch": 1.4124157231904337, + "ewc_loss": 0.04085807874798775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0429039068403654e-05, + "grad_norm": 27.57367706298828, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8569523692131042, + "num_tokens": 423706990.0, + "step": 11103 + }, + { + "epoch": 1.4125429334690243, + "ewc_loss": 0.04085671156644821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.042835512838792e-05, + "grad_norm": 27.753250122070312, + "learning_rate": 1e-06, + "loss": 0.4903, + "mean_token_accuracy": 0.8495632410049438, + "num_tokens": 423745945.0, + "step": 11104 + }, + { + "epoch": 1.4126701437476148, + "ewc_loss": 0.04088950157165527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0444751498871483e-05, + "grad_norm": 27.70521354675293, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8723280429840088, + "num_tokens": 423780824.0, + "step": 11105 + }, + { + "epoch": 1.4127973540262053, + "ewc_loss": 0.040867023169994354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0433511963346973e-05, + "grad_norm": 27.758407592773438, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8782967925071716, + "num_tokens": 423818640.0, + "step": 11106 + }, + { + "epoch": 1.4129245643047958, + "ewc_loss": 0.040775369852781296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038768434431404e-05, + "grad_norm": 27.57297134399414, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8754177093505859, + "num_tokens": 423856657.0, + "step": 11107 + }, + { + "epoch": 1.4130517745833864, + "ewc_loss": 0.04081244766712189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0406223484314978e-05, + "grad_norm": 27.528257369995117, + "learning_rate": 1e-06, + "loss": 0.39, + 
"mean_token_accuracy": 0.8775531053543091, + "num_tokens": 423900288.0, + "step": 11108 + }, + { + "epoch": 1.413178984861977, + "ewc_loss": 0.04091109335422516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0455547200981528e-05, + "grad_norm": 27.68378257751465, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8727184534072876, + "num_tokens": 423935929.0, + "step": 11109 + }, + { + "epoch": 1.4133061951405674, + "ewc_loss": 0.040888093411922455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044404754997231e-05, + "grad_norm": 27.56153678894043, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8643613457679749, + "num_tokens": 423970719.0, + "step": 11110 + }, + { + "epoch": 1.4134334054191577, + "ewc_loss": 0.04081513732671738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0407569536473602e-05, + "grad_norm": 27.598825454711914, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8668776154518127, + "num_tokens": 424012338.0, + "step": 11111 + }, + { + "epoch": 1.4135606156977483, + "ewc_loss": 0.040890030562877655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0445015252334997e-05, + "grad_norm": 27.61574935913086, + "learning_rate": 1e-06, + "loss": 0.4878, + "mean_token_accuracy": 0.8486023545265198, + "num_tokens": 424050112.0, + "step": 11112 + }, + { + "epoch": 1.4136878259763388, + "ewc_loss": 0.04087245836853981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.043622953351587e-05, + "grad_norm": 27.6882266998291, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8666542768478394, + "num_tokens": 424092688.0, + "step": 11113 + }, + { + "epoch": 1.4138150362549293, + "ewc_loss": 0.04086218774318695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.043109452642966e-05, + "grad_norm": 27.563261032104492, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8637408018112183, + "num_tokens": 424131355.0, + "step": 11114 + }, + { + "epoch": 
1.4139422465335199, + "ewc_loss": 0.040861938148736954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0430969016160816e-05, + "grad_norm": 27.635995864868164, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8555025458335876, + "num_tokens": 424177768.0, + "step": 11115 + }, + { + "epoch": 1.4140694568121104, + "ewc_loss": 0.040884729474782944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044236498477403e-05, + "grad_norm": 27.563886642456055, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8790848851203918, + "num_tokens": 424215834.0, + "step": 11116 + }, + { + "epoch": 1.414196667090701, + "ewc_loss": 0.04081930220127106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.040965046035126e-05, + "grad_norm": 27.6122989654541, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8706172108650208, + "num_tokens": 424260002.0, + "step": 11117 + }, + { + "epoch": 1.4143238773692914, + "ewc_loss": 0.04093116521835327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046558256552089e-05, + "grad_norm": 27.64982795715332, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8637012243270874, + "num_tokens": 424299343.0, + "step": 11118 + }, + { + "epoch": 1.414451087647882, + "ewc_loss": 0.040880221873521805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0440111256903037e-05, + "grad_norm": 27.594303131103516, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.860143780708313, + "num_tokens": 424340312.0, + "step": 11119 + }, + { + "epoch": 1.4145782979264725, + "ewc_loss": 0.04086859151721001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.04342959477799e-05, + "grad_norm": 27.6884822845459, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8744862079620361, + "num_tokens": 424376658.0, + "step": 11120 + }, + { + "epoch": 1.414705508205063, + "ewc_loss": 0.04083015397191048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0415076505742036e-05, + "grad_norm": 27.594120025634766, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8776910901069641, + "num_tokens": 424415499.0, + "step": 11121 + }, + { + "epoch": 1.4148327184836536, + "ewc_loss": 0.04083079844713211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0415398466866463e-05, + "grad_norm": 27.737350463867188, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8670363426208496, + "num_tokens": 424453401.0, + "step": 11122 + }, + { + "epoch": 1.414959928762244, + "ewc_loss": 0.040841296315193176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0420648070285097e-05, + "grad_norm": 27.592790603637695, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8696670532226562, + "num_tokens": 424490654.0, + "step": 11123 + }, + { + "epoch": 1.4150871390408346, + "ewc_loss": 0.04081737622618675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0408688214956783e-05, + "grad_norm": 27.665699005126953, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8756610751152039, + "num_tokens": 424534469.0, + "step": 11124 + }, + { + "epoch": 1.4152143493194251, + "ewc_loss": 0.040850523859262466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.042526102741249e-05, + "grad_norm": 27.73973846435547, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.857741117477417, + "num_tokens": 424570187.0, + "step": 11125 + }, + { + "epoch": 1.4153415595980154, + "ewc_loss": 0.040844064205884933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0422032321221195e-05, + "grad_norm": 27.695907592773438, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8463879227638245, + "num_tokens": 424607898.0, + "step": 11126 + }, + { + "epoch": 1.415468769876606, + "ewc_loss": 0.040729835629463196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.036491787293926e-05, + "grad_norm": 27.711782455444336, + "learning_rate": 1e-06, + "loss": 0.4171, + 
"mean_token_accuracy": 0.8699653148651123, + "num_tokens": 424652156.0, + "step": 11127 + }, + { + "epoch": 1.4155959801551965, + "ewc_loss": 0.040779951959848404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0389976270962507e-05, + "grad_norm": 27.585617065429688, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.856365442276001, + "num_tokens": 424688735.0, + "step": 11128 + }, + { + "epoch": 1.415723190433787, + "ewc_loss": 0.0408056303858757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0402814698172733e-05, + "grad_norm": 27.615049362182617, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8668805360794067, + "num_tokens": 424732859.0, + "step": 11129 + }, + { + "epoch": 1.4158504007123776, + "ewc_loss": 0.04084376245737076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.04218813451007e-05, + "grad_norm": 27.680007934570312, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8579475283622742, + "num_tokens": 424769381.0, + "step": 11130 + }, + { + "epoch": 1.415977610990968, + "ewc_loss": 0.04080507904291153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0402540030772798e-05, + "grad_norm": 27.665145874023438, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8667209148406982, + "num_tokens": 424802362.0, + "step": 11131 + }, + { + "epoch": 1.4161048212695586, + "ewc_loss": 0.0407746322453022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0387316908454522e-05, + "grad_norm": 27.582687377929688, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8629461526870728, + "num_tokens": 424837897.0, + "step": 11132 + }, + { + "epoch": 1.4162320315481491, + "ewc_loss": 0.04085714370012283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0428571588126943e-05, + "grad_norm": 27.72866439819336, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8779382109642029, + "num_tokens": 424878154.0, + "step": 11133 + }, + { + "epoch": 
1.4163592418267397, + "ewc_loss": 0.040854521095752716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0427260096766986e-05, + "grad_norm": 27.58643341064453, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8719502687454224, + "num_tokens": 424915063.0, + "step": 11134 + }, + { + "epoch": 1.41648645210533, + "ewc_loss": 0.04078532010316849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0392659280332737e-05, + "grad_norm": 27.653484344482422, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8656667470932007, + "num_tokens": 424955159.0, + "step": 11135 + }, + { + "epoch": 1.4166136623839205, + "ewc_loss": 0.04084380716085434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0421903172973543e-05, + "grad_norm": 27.6319637298584, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8571372032165527, + "num_tokens": 424992573.0, + "step": 11136 + }, + { + "epoch": 1.416740872662511, + "ewc_loss": 0.04076717048883438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038358616118785e-05, + "grad_norm": 27.609228134155273, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.870320200920105, + "num_tokens": 425030452.0, + "step": 11137 + }, + { + "epoch": 1.4168680829411016, + "ewc_loss": 0.04093179106712341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.04658954316983e-05, + "grad_norm": 27.706483840942383, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8789337873458862, + "num_tokens": 425067118.0, + "step": 11138 + }, + { + "epoch": 1.416995293219692, + "ewc_loss": 0.04076801612973213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0384008166729473e-05, + "grad_norm": 27.576656341552734, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8567858934402466, + "num_tokens": 425101684.0, + "step": 11139 + }, + { + "epoch": 1.4171225034982826, + "ewc_loss": 0.04083274304866791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.041637162619736e-05, + "grad_norm": 27.67562484741211, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8750023245811462, + "num_tokens": 425143127.0, + "step": 11140 + }, + { + "epoch": 1.4172497137768731, + "ewc_loss": 0.04089593514800072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0447967472136952e-05, + "grad_norm": 27.741586685180664, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8653848171234131, + "num_tokens": 425173912.0, + "step": 11141 + }, + { + "epoch": 1.4173769240554637, + "ewc_loss": 0.04083780199289322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0418901840457693e-05, + "grad_norm": 27.622943878173828, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.87099689245224, + "num_tokens": 425210146.0, + "step": 11142 + }, + { + "epoch": 1.4175041343340542, + "ewc_loss": 0.040804196149110794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0402098016347736e-05, + "grad_norm": 27.62039566040039, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8796345591545105, + "num_tokens": 425246082.0, + "step": 11143 + }, + { + "epoch": 1.4176313446126447, + "ewc_loss": 0.0408935621380806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044678149104584e-05, + "grad_norm": 27.67658805847168, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8643218278884888, + "num_tokens": 425287470.0, + "step": 11144 + }, + { + "epoch": 1.4177585548912353, + "ewc_loss": 0.04083982855081558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0419915017555468e-05, + "grad_norm": 27.60649299621582, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8628584146499634, + "num_tokens": 425326553.0, + "step": 11145 + }, + { + "epoch": 1.4178857651698258, + "ewc_loss": 0.04080628231167793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0403140297275968e-05, + "grad_norm": 27.543804168701172, + "learning_rate": 1e-06, + "loss": 0.4516, + 
"mean_token_accuracy": 0.859429657459259, + "num_tokens": 425363105.0, + "step": 11146 + }, + { + "epoch": 1.4180129754484163, + "ewc_loss": 0.04092193767428398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0460969608393498e-05, + "grad_norm": 27.749095916748047, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8607838153839111, + "num_tokens": 425402513.0, + "step": 11147 + }, + { + "epoch": 1.4181401857270068, + "ewc_loss": 0.040922388434410095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0461193344090134e-05, + "grad_norm": 27.622453689575195, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8679811954498291, + "num_tokens": 425440132.0, + "step": 11148 + }, + { + "epoch": 1.4182673960055974, + "ewc_loss": 0.04092750698328018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0463752662180923e-05, + "grad_norm": 27.734294891357422, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8653150200843811, + "num_tokens": 425481738.0, + "step": 11149 + }, + { + "epoch": 1.418394606284188, + "ewc_loss": 0.04095557704567909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0477787984418683e-05, + "grad_norm": 27.60651206970215, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8646979928016663, + "num_tokens": 425516916.0, + "step": 11150 + }, + { + "epoch": 1.4185218165627782, + "ewc_loss": 0.04092961549758911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046480767603498e-05, + "grad_norm": 27.722326278686523, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8665785789489746, + "num_tokens": 425555951.0, + "step": 11151 + }, + { + "epoch": 1.4186490268413687, + "ewc_loss": 0.04095669463276863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0478348233154975e-05, + "grad_norm": 27.723783493041992, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8659517765045166, + "num_tokens": 425595302.0, + "step": 11152 + }, + { + "epoch": 
1.4187762371199593, + "ewc_loss": 0.04087280482053757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0436402337509207e-05, + "grad_norm": 27.667028427124023, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8703927993774414, + "num_tokens": 425623281.0, + "step": 11153 + }, + { + "epoch": 1.4189034473985498, + "ewc_loss": 0.040956977754831314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0478488295339048e-05, + "grad_norm": 27.73650360107422, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8745461106300354, + "num_tokens": 425659124.0, + "step": 11154 + }, + { + "epoch": 1.4190306576771403, + "ewc_loss": 0.040869154036045074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.043457789113745e-05, + "grad_norm": 27.577314376831055, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8546339273452759, + "num_tokens": 425695809.0, + "step": 11155 + }, + { + "epoch": 1.4191578679557308, + "ewc_loss": 0.04091481864452362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.045740984613076e-05, + "grad_norm": 27.659753799438477, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.856547474861145, + "num_tokens": 425731890.0, + "step": 11156 + }, + { + "epoch": 1.4192850782343214, + "ewc_loss": 0.04089231789112091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0446159396669827e-05, + "grad_norm": 27.535545349121094, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8748883605003357, + "num_tokens": 425770899.0, + "step": 11157 + }, + { + "epoch": 1.419412288512912, + "ewc_loss": 0.04100499302148819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.050249713647645e-05, + "grad_norm": 27.711965560913086, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8777157068252563, + "num_tokens": 425807812.0, + "step": 11158 + }, + { + "epoch": 1.4195394987915024, + "ewc_loss": 0.040955983102321625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.047799171123188e-05, + "grad_norm": 27.637399673461914, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8617392182350159, + "num_tokens": 425848997.0, + "step": 11159 + }, + { + "epoch": 1.4196667090700927, + "ewc_loss": 0.04093766584992409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0468833099585027e-05, + "grad_norm": 27.594799041748047, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8764253854751587, + "num_tokens": 425887816.0, + "step": 11160 + }, + { + "epoch": 1.4197939193486833, + "ewc_loss": 0.041041985154151917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0520992620731704e-05, + "grad_norm": 27.633453369140625, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8581410050392151, + "num_tokens": 425925488.0, + "step": 11161 + }, + { + "epoch": 1.4199211296272738, + "ewc_loss": 0.040982190519571304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0491095710895024e-05, + "grad_norm": 27.668304443359375, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8729665279388428, + "num_tokens": 425965616.0, + "step": 11162 + }, + { + "epoch": 1.4200483399058643, + "ewc_loss": 0.04096182808279991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0480914827203378e-05, + "grad_norm": 27.632299423217773, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8604477643966675, + "num_tokens": 426005308.0, + "step": 11163 + }, + { + "epoch": 1.4201755501844548, + "ewc_loss": 0.040916334837675095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0458166545722634e-05, + "grad_norm": 27.631746292114258, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8637514114379883, + "num_tokens": 426042339.0, + "step": 11164 + }, + { + "epoch": 1.4203027604630454, + "ewc_loss": 0.04096032679080963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0480163584579714e-05, + "grad_norm": 27.649333953857422, + "learning_rate": 1e-06, + "loss": 0.4345, + 
"mean_token_accuracy": 0.8664430975914001, + "num_tokens": 426081925.0, + "step": 11165 + }, + { + "epoch": 1.420429970741636, + "ewc_loss": 0.04097433388233185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0487166693783365e-05, + "grad_norm": 27.65833282470703, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8612022399902344, + "num_tokens": 426120918.0, + "step": 11166 + }, + { + "epoch": 1.4205571810202264, + "ewc_loss": 0.04102344065904617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0511719412752427e-05, + "grad_norm": 27.771188735961914, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8542224168777466, + "num_tokens": 426158711.0, + "step": 11167 + }, + { + "epoch": 1.420684391298817, + "ewc_loss": 0.04092396795749664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0461984604480676e-05, + "grad_norm": 27.6373348236084, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8626803159713745, + "num_tokens": 426197034.0, + "step": 11168 + }, + { + "epoch": 1.4208116015774075, + "ewc_loss": 0.040838781744241714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0419391148607247e-05, + "grad_norm": 27.548213958740234, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8593082427978516, + "num_tokens": 426239390.0, + "step": 11169 + }, + { + "epoch": 1.420938811855998, + "ewc_loss": 0.041023388504981995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0511693946900778e-05, + "grad_norm": 27.71653938293457, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8559654355049133, + "num_tokens": 426285488.0, + "step": 11170 + }, + { + "epoch": 1.4210660221345885, + "ewc_loss": 0.040927138179540634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0463568944251165e-05, + "grad_norm": 27.649744033813477, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8532759547233582, + "num_tokens": 426323955.0, + "step": 11171 + }, + { + "epoch": 
1.421193232413179, + "ewc_loss": 0.04093075543642044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046537701971829e-05, + "grad_norm": 27.677486419677734, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.884315013885498, + "num_tokens": 426359460.0, + "step": 11172 + }, + { + "epoch": 1.4213204426917696, + "ewc_loss": 0.04091088846325874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0455443518585525e-05, + "grad_norm": 27.541086196899414, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8585755228996277, + "num_tokens": 426397104.0, + "step": 11173 + }, + { + "epoch": 1.4214476529703601, + "ewc_loss": 0.04094268009066582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0471339666983113e-05, + "grad_norm": 27.65885353088379, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8747636079788208, + "num_tokens": 426441349.0, + "step": 11174 + }, + { + "epoch": 1.4215748632489504, + "ewc_loss": 0.04095710441470146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0478551959968172e-05, + "grad_norm": 27.681169509887695, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8738692998886108, + "num_tokens": 426478410.0, + "step": 11175 + }, + { + "epoch": 1.421702073527541, + "ewc_loss": 0.04091130942106247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0455654521356337e-05, + "grad_norm": 27.633914947509766, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8642563223838806, + "num_tokens": 426519874.0, + "step": 11176 + }, + { + "epoch": 1.4218292838061315, + "ewc_loss": 0.040920279920101166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046014014922548e-05, + "grad_norm": 27.649198532104492, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8543714284896851, + "num_tokens": 426559970.0, + "step": 11177 + }, + { + "epoch": 1.421956494084722, + "ewc_loss": 0.040964145213365555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0482073523453437e-05, + "grad_norm": 27.71673583984375, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8665456175804138, + "num_tokens": 426593897.0, + "step": 11178 + }, + { + "epoch": 1.4220837043633126, + "ewc_loss": 0.040895573794841766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0447787392186e-05, + "grad_norm": 27.622652053833008, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8660238981246948, + "num_tokens": 426633712.0, + "step": 11179 + }, + { + "epoch": 1.422210914641903, + "ewc_loss": 0.04092448949813843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0462244719965383e-05, + "grad_norm": 27.693588256835938, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8801060318946838, + "num_tokens": 426669085.0, + "step": 11180 + }, + { + "epoch": 1.4223381249204936, + "ewc_loss": 0.040983133018016815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0491566829150543e-05, + "grad_norm": 27.703218460083008, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8791414499282837, + "num_tokens": 426709260.0, + "step": 11181 + }, + { + "epoch": 1.4224653351990841, + "ewc_loss": 0.04082007706165314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0410037905094214e-05, + "grad_norm": 27.603994369506836, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8554941415786743, + "num_tokens": 426747747.0, + "step": 11182 + }, + { + "epoch": 1.4225925454776747, + "ewc_loss": 0.04087479040026665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0437395505723543e-05, + "grad_norm": 27.687166213989258, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8528426885604858, + "num_tokens": 426788185.0, + "step": 11183 + }, + { + "epoch": 1.422719755756265, + "ewc_loss": 0.04089150205254555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044575012405403e-05, + "grad_norm": 27.668018341064453, + "learning_rate": 1e-06, + "loss": 0.4354, + 
"mean_token_accuracy": 0.867434561252594, + "num_tokens": 426828663.0, + "step": 11184 + }, + { + "epoch": 1.4228469660348555, + "ewc_loss": 0.040867164731025696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.043358290393371e-05, + "grad_norm": 27.654253005981445, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8563722372055054, + "num_tokens": 426869205.0, + "step": 11185 + }, + { + "epoch": 1.422974176313446, + "ewc_loss": 0.04096312075853348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0481560568441637e-05, + "grad_norm": 27.62189292907715, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8817291259765625, + "num_tokens": 426900561.0, + "step": 11186 + }, + { + "epoch": 1.4231013865920366, + "ewc_loss": 0.04081067815423012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0405339455464855e-05, + "grad_norm": 27.66015625, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8804875612258911, + "num_tokens": 426940618.0, + "step": 11187 + }, + { + "epoch": 1.423228596870627, + "ewc_loss": 0.04093605652451515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046802910626866e-05, + "grad_norm": 27.641559600830078, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8578931093215942, + "num_tokens": 426975509.0, + "step": 11188 + }, + { + "epoch": 1.4233558071492176, + "ewc_loss": 0.04093368351459503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0466841306188144e-05, + "grad_norm": 27.67774772644043, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8743093013763428, + "num_tokens": 427011647.0, + "step": 11189 + }, + { + "epoch": 1.4234830174278081, + "ewc_loss": 0.04091080650687218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0455403500818647e-05, + "grad_norm": 27.668712615966797, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.864100992679596, + "num_tokens": 427046631.0, + "step": 11190 + }, + { + "epoch": 
1.4236102277063987, + "ewc_loss": 0.04091633856296539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0458168364712037e-05, + "grad_norm": 27.61368751525879, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8640961050987244, + "num_tokens": 427081364.0, + "step": 11191 + }, + { + "epoch": 1.4237374379849892, + "ewc_loss": 0.04094719886779785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0473598851822317e-05, + "grad_norm": 27.67780303955078, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8683139085769653, + "num_tokens": 427118667.0, + "step": 11192 + }, + { + "epoch": 1.4238646482635797, + "ewc_loss": 0.040948282927274704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0474140910664573e-05, + "grad_norm": 27.609344482421875, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8707836866378784, + "num_tokens": 427155017.0, + "step": 11193 + }, + { + "epoch": 1.4239918585421703, + "ewc_loss": 0.04094724357128143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0473622498684563e-05, + "grad_norm": 27.710222244262695, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8584118485450745, + "num_tokens": 427191741.0, + "step": 11194 + }, + { + "epoch": 1.4241190688207608, + "ewc_loss": 0.041068390011787415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.053419484582264e-05, + "grad_norm": 27.637767791748047, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8700414896011353, + "num_tokens": 427226210.0, + "step": 11195 + }, + { + "epoch": 1.4242462790993513, + "ewc_loss": 0.040924072265625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0462035536183976e-05, + "grad_norm": 27.743892669677734, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8620696067810059, + "num_tokens": 427261699.0, + "step": 11196 + }, + { + "epoch": 1.4243734893779418, + "ewc_loss": 0.04095318540930748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0476592908380553e-05, + "grad_norm": 27.566146850585938, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8561732769012451, + "num_tokens": 427297158.0, + "step": 11197 + }, + { + "epoch": 1.4245006996565324, + "ewc_loss": 0.04100973159074783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0504865460679866e-05, + "grad_norm": 27.799907684326172, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8509804010391235, + "num_tokens": 427338234.0, + "step": 11198 + }, + { + "epoch": 1.424627909935123, + "ewc_loss": 0.04093826934695244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0469135051826015e-05, + "grad_norm": 27.626686096191406, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.860564649105072, + "num_tokens": 427376727.0, + "step": 11199 + }, + { + "epoch": 1.4247551202137132, + "ewc_loss": 0.04089708626270294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0448542272788472e-05, + "grad_norm": 27.720670700073242, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8785836696624756, + "num_tokens": 427417095.0, + "step": 11200 + }, + { + "epoch": 1.4248823304923037, + "ewc_loss": 0.041029613465070724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0514806237770244e-05, + "grad_norm": 27.65505599975586, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8501188158988953, + "num_tokens": 427453243.0, + "step": 11201 + }, + { + "epoch": 1.4250095407708943, + "ewc_loss": 0.04096032679080963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0480163584579714e-05, + "grad_norm": 27.712366104125977, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.869220495223999, + "num_tokens": 427490961.0, + "step": 11202 + }, + { + "epoch": 1.4251367510494848, + "ewc_loss": 0.040973566472530365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0486782887019217e-05, + "grad_norm": 27.661359786987305, + "learning_rate": 1e-06, + "loss": 0.3934, + 
"mean_token_accuracy": 0.8764256238937378, + "num_tokens": 427521487.0, + "step": 11203 + }, + { + "epoch": 1.4252639613280753, + "ewc_loss": 0.040991879999637604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0495939679676667e-05, + "grad_norm": 27.802928924560547, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8638140559196472, + "num_tokens": 427556607.0, + "step": 11204 + }, + { + "epoch": 1.4253911716066658, + "ewc_loss": 0.04098391905426979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.049195973086171e-05, + "grad_norm": 27.719680786132812, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8671633005142212, + "num_tokens": 427598940.0, + "step": 11205 + }, + { + "epoch": 1.4255183818852564, + "ewc_loss": 0.040874697268009186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.043734821199905e-05, + "grad_norm": 27.60289192199707, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8565253019332886, + "num_tokens": 427641337.0, + "step": 11206 + }, + { + "epoch": 1.425645592163847, + "ewc_loss": 0.040952909737825394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0476454665185884e-05, + "grad_norm": 27.836400985717773, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8771324157714844, + "num_tokens": 427680235.0, + "step": 11207 + }, + { + "epoch": 1.4257728024424374, + "ewc_loss": 0.040894728153944016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0447363567654975e-05, + "grad_norm": 27.505552291870117, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8665127754211426, + "num_tokens": 427719918.0, + "step": 11208 + }, + { + "epoch": 1.4259000127210277, + "ewc_loss": 0.04096813127398491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.048406531685032e-05, + "grad_norm": 27.80001449584961, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8687236309051514, + "num_tokens": 427758042.0, + "step": 11209 + }, + { + "epoch": 
1.4260272229996183, + "ewc_loss": 0.04109898954629898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0549494365695864e-05, + "grad_norm": 27.805419921875, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8688364624977112, + "num_tokens": 427794591.0, + "step": 11210 + }, + { + "epoch": 1.4261544332782088, + "ewc_loss": 0.04085288569331169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0426443370524794e-05, + "grad_norm": 27.567142486572266, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8742407560348511, + "num_tokens": 427828190.0, + "step": 11211 + }, + { + "epoch": 1.4262816435567993, + "ewc_loss": 0.04098678007721901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0493389456532896e-05, + "grad_norm": 27.67267608642578, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8592276573181152, + "num_tokens": 427870363.0, + "step": 11212 + }, + { + "epoch": 1.4264088538353898, + "ewc_loss": 0.04096290469169617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0481451429077424e-05, + "grad_norm": 27.744298934936523, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8808351755142212, + "num_tokens": 427910037.0, + "step": 11213 + }, + { + "epoch": 1.4265360641139804, + "ewc_loss": 0.04099879786372185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.049939939752221e-05, + "grad_norm": 27.767993927001953, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8659502267837524, + "num_tokens": 427945782.0, + "step": 11214 + }, + { + "epoch": 1.426663274392571, + "ewc_loss": 0.04089749604463577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0448747818591073e-05, + "grad_norm": 27.575374603271484, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8738170862197876, + "num_tokens": 427984635.0, + "step": 11215 + }, + { + "epoch": 1.4267904846711614, + "ewc_loss": 0.04094800353050232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.04740008484805e-05, + "grad_norm": 27.6065731048584, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8817481398582458, + "num_tokens": 428024289.0, + "step": 11216 + }, + { + "epoch": 1.426917694949752, + "ewc_loss": 0.04094742611050606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.047371344815474e-05, + "grad_norm": 27.529634475708008, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8579821586608887, + "num_tokens": 428067119.0, + "step": 11217 + }, + { + "epoch": 1.4270449052283425, + "ewc_loss": 0.04091382771730423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.045691326202359e-05, + "grad_norm": 27.628501892089844, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8650678396224976, + "num_tokens": 428108150.0, + "step": 11218 + }, + { + "epoch": 1.427172115506933, + "ewc_loss": 0.04108909144997597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0544544895528816e-05, + "grad_norm": 27.776798248291016, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8765950202941895, + "num_tokens": 428150839.0, + "step": 11219 + }, + { + "epoch": 1.4272993257855235, + "ewc_loss": 0.040915943682193756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0457971913856454e-05, + "grad_norm": 27.607460021972656, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.855591356754303, + "num_tokens": 428194325.0, + "step": 11220 + }, + { + "epoch": 1.427426536064114, + "ewc_loss": 0.04093301296234131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046650661213789e-05, + "grad_norm": 27.66718101501465, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8657569885253906, + "num_tokens": 428230879.0, + "step": 11221 + }, + { + "epoch": 1.4275537463427046, + "ewc_loss": 0.04102230817079544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0511153707047924e-05, + "grad_norm": 27.752212524414062, + "learning_rate": 1e-06, + "loss": 0.3907, + 
"mean_token_accuracy": 0.8774954080581665, + "num_tokens": 428265795.0, + "step": 11222 + }, + { + "epoch": 1.4276809566212951, + "ewc_loss": 0.040926434099674225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046321787929628e-05, + "grad_norm": 27.715219497680664, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8724957704544067, + "num_tokens": 428300413.0, + "step": 11223 + }, + { + "epoch": 1.4278081668998854, + "ewc_loss": 0.04091118648648262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0455592675716616e-05, + "grad_norm": 27.648792266845703, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.887455940246582, + "num_tokens": 428339293.0, + "step": 11224 + }, + { + "epoch": 1.427935377178476, + "ewc_loss": 0.040986716747283936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0493358533713035e-05, + "grad_norm": 27.77315330505371, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8783965110778809, + "num_tokens": 428374158.0, + "step": 11225 + }, + { + "epoch": 1.4280625874570665, + "ewc_loss": 0.04092798009514809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0463990949792787e-05, + "grad_norm": 27.693984985351562, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8678323030471802, + "num_tokens": 428411715.0, + "step": 11226 + }, + { + "epoch": 1.428189797735657, + "ewc_loss": 0.040903620421886444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0451810996746644e-05, + "grad_norm": 27.596904754638672, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8549630641937256, + "num_tokens": 428450000.0, + "step": 11227 + }, + { + "epoch": 1.4283170080142475, + "ewc_loss": 0.04093993827700615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0469969967962243e-05, + "grad_norm": 27.763303756713867, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8729488849639893, + "num_tokens": 428484546.0, + "step": 11228 + }, + { + "epoch": 
1.428444218292838, + "ewc_loss": 0.040937479585409164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0468740331125446e-05, + "grad_norm": 27.60512351989746, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8680530190467834, + "num_tokens": 428527519.0, + "step": 11229 + }, + { + "epoch": 1.4285714285714286, + "ewc_loss": 0.04088560491800308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0442801542230882e-05, + "grad_norm": 27.699237823486328, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.861976683139801, + "num_tokens": 428568744.0, + "step": 11230 + }, + { + "epoch": 1.4286986388500191, + "ewc_loss": 0.04096256569027901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0481282263062894e-05, + "grad_norm": 27.642759323120117, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8623582720756531, + "num_tokens": 428613050.0, + "step": 11231 + }, + { + "epoch": 1.4288258491286097, + "ewc_loss": 0.04088392108678818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0441961169126444e-05, + "grad_norm": 27.573467254638672, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8639482855796814, + "num_tokens": 428647334.0, + "step": 11232 + }, + { + "epoch": 1.4289530594072, + "ewc_loss": 0.04097181186079979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0485906134126708e-05, + "grad_norm": 27.792022705078125, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8730351328849792, + "num_tokens": 428681015.0, + "step": 11233 + }, + { + "epoch": 1.4290802696857905, + "ewc_loss": 0.04096445441246033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0482226318563335e-05, + "grad_norm": 27.674495697021484, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8601426482200623, + "num_tokens": 428718903.0, + "step": 11234 + }, + { + "epoch": 1.429207479964381, + "ewc_loss": 0.04093749448657036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.046874760708306e-05, + "grad_norm": 27.7049560546875, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8750115036964417, + "num_tokens": 428756038.0, + "step": 11235 + }, + { + "epoch": 1.4293346902429716, + "ewc_loss": 0.040943313390016556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.047165617113933e-05, + "grad_norm": 27.638368606567383, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8837085962295532, + "num_tokens": 428793908.0, + "step": 11236 + }, + { + "epoch": 1.429461900521562, + "ewc_loss": 0.04097861051559448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0489305825321935e-05, + "grad_norm": 27.725709915161133, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.860663115978241, + "num_tokens": 428830924.0, + "step": 11237 + }, + { + "epoch": 1.4295891108001526, + "ewc_loss": 0.04097381979227066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0486910216277465e-05, + "grad_norm": 27.64737892150879, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8643775582313538, + "num_tokens": 428869083.0, + "step": 11238 + }, + { + "epoch": 1.4297163210787431, + "ewc_loss": 0.04094487056136131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0472434698604047e-05, + "grad_norm": 27.690187454223633, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8415555357933044, + "num_tokens": 428903970.0, + "step": 11239 + }, + { + "epoch": 1.4298435313573337, + "ewc_loss": 0.0410287007689476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0514349671429954e-05, + "grad_norm": 27.84219741821289, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8708940148353577, + "num_tokens": 428940710.0, + "step": 11240 + }, + { + "epoch": 1.4299707416359242, + "ewc_loss": 0.04102500528097153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0512503397185355e-05, + "grad_norm": 27.819557189941406, + "learning_rate": 1e-06, + "loss": 0.4252, + 
"mean_token_accuracy": 0.8692045211791992, + "num_tokens": 428981160.0, + "step": 11241 + }, + { + "epoch": 1.4300979519145147, + "ewc_loss": 0.040840357542037964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0420178771018982e-05, + "grad_norm": 27.58342742919922, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8674929738044739, + "num_tokens": 429022464.0, + "step": 11242 + }, + { + "epoch": 1.4302251621931052, + "ewc_loss": 0.040924131870269775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0462066459003836e-05, + "grad_norm": 27.753007888793945, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8684311509132385, + "num_tokens": 429060964.0, + "step": 11243 + }, + { + "epoch": 1.4303523724716958, + "ewc_loss": 0.0409962497651577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0498124285950325e-05, + "grad_norm": 27.698591232299805, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8682399392127991, + "num_tokens": 429101389.0, + "step": 11244 + }, + { + "epoch": 1.4304795827502863, + "ewc_loss": 0.04093896970152855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0469484297791496e-05, + "grad_norm": 27.622648239135742, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.863135814666748, + "num_tokens": 429135547.0, + "step": 11245 + }, + { + "epoch": 1.4306067930288768, + "ewc_loss": 0.04101348668336868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0506742657744326e-05, + "grad_norm": 27.838720321655273, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8703583478927612, + "num_tokens": 429177504.0, + "step": 11246 + }, + { + "epoch": 1.4307340033074674, + "ewc_loss": 0.040854312479496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0427156414370984e-05, + "grad_norm": 27.52321434020996, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8709843754768372, + "num_tokens": 429217923.0, + "step": 11247 + }, + { + "epoch": 
1.430861213586058, + "ewc_loss": 0.04093609005212784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046804547717329e-05, + "grad_norm": 27.708942413330078, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8674671649932861, + "num_tokens": 429251514.0, + "step": 11248 + }, + { + "epoch": 1.4309884238646482, + "ewc_loss": 0.04102296382188797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0511481125140563e-05, + "grad_norm": 27.75238609313965, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8520816564559937, + "num_tokens": 429291747.0, + "step": 11249 + }, + { + "epoch": 1.4311156341432387, + "ewc_loss": 0.04102786257863045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.051393130386714e-05, + "grad_norm": 27.595712661743164, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8642447590827942, + "num_tokens": 429333480.0, + "step": 11250 + }, + { + "epoch": 1.4312428444218293, + "ewc_loss": 0.040980082005262375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0490040697040968e-05, + "grad_norm": 27.773082733154297, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8794136643409729, + "num_tokens": 429366482.0, + "step": 11251 + }, + { + "epoch": 1.4313700547004198, + "ewc_loss": 0.041075751185417175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0537874661386013e-05, + "grad_norm": 27.64053726196289, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8592969179153442, + "num_tokens": 429400704.0, + "step": 11252 + }, + { + "epoch": 1.4314972649790103, + "ewc_loss": 0.040908075869083405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0454037439776585e-05, + "grad_norm": 27.709169387817383, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8552474975585938, + "num_tokens": 429437463.0, + "step": 11253 + }, + { + "epoch": 1.4316244752576008, + "ewc_loss": 0.04110340401530266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.055170261883177e-05, + "grad_norm": 27.712507247924805, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8561820983886719, + "num_tokens": 429474496.0, + "step": 11254 + }, + { + "epoch": 1.4317516855361914, + "ewc_loss": 0.0409499928355217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.047499583568424e-05, + "grad_norm": 27.765047073364258, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8537774085998535, + "num_tokens": 429514641.0, + "step": 11255 + }, + { + "epoch": 1.431878895814782, + "ewc_loss": 0.04108402878046036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0542014681268483e-05, + "grad_norm": 27.712400436401367, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8665850162506104, + "num_tokens": 429554054.0, + "step": 11256 + }, + { + "epoch": 1.4320061060933724, + "ewc_loss": 0.0409848727285862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0492436306085438e-05, + "grad_norm": 27.81096839904785, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8625229597091675, + "num_tokens": 429594004.0, + "step": 11257 + }, + { + "epoch": 1.4321333163719627, + "ewc_loss": 0.04104558750987053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0522793420241214e-05, + "grad_norm": 27.55940055847168, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8736711144447327, + "num_tokens": 429633019.0, + "step": 11258 + }, + { + "epoch": 1.4322605266505533, + "ewc_loss": 0.04100523889064789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0502619008766487e-05, + "grad_norm": 27.746212005615234, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8670644164085388, + "num_tokens": 429666869.0, + "step": 11259 + }, + { + "epoch": 1.4323877369291438, + "ewc_loss": 0.04112297669053078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0561488781822845e-05, + "grad_norm": 27.657115936279297, + "learning_rate": 1e-06, + "loss": 0.4037, + 
"mean_token_accuracy": 0.87675541639328, + "num_tokens": 429700049.0, + "step": 11260 + }, + { + "epoch": 1.4325149472077343, + "ewc_loss": 0.04093918949365616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0469595256145112e-05, + "grad_norm": 27.616531372070312, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8644516468048096, + "num_tokens": 429737517.0, + "step": 11261 + }, + { + "epoch": 1.4326421574863248, + "ewc_loss": 0.041073545813560486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0536772353807464e-05, + "grad_norm": 27.712398529052734, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8788297176361084, + "num_tokens": 429778697.0, + "step": 11262 + }, + { + "epoch": 1.4327693677649154, + "ewc_loss": 0.04104408994317055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0522045815596357e-05, + "grad_norm": 27.748371124267578, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8673927187919617, + "num_tokens": 429811883.0, + "step": 11263 + }, + { + "epoch": 1.432896578043506, + "ewc_loss": 0.041087571531534195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0543786376947537e-05, + "grad_norm": 27.75724983215332, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8721620440483093, + "num_tokens": 429851748.0, + "step": 11264 + }, + { + "epoch": 1.4330237883220964, + "ewc_loss": 0.04099087789654732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0495439457590692e-05, + "grad_norm": 27.630521774291992, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8572547435760498, + "num_tokens": 429891033.0, + "step": 11265 + }, + { + "epoch": 1.433150998600687, + "ewc_loss": 0.04100795090198517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0503975974861532e-05, + "grad_norm": 27.69396209716797, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8688760995864868, + "num_tokens": 429935980.0, + "step": 11266 + }, + { + "epoch": 
1.4332782088792775, + "ewc_loss": 0.041119977831840515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.055998811556492e-05, + "grad_norm": 27.806474685668945, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8548702001571655, + "num_tokens": 429970874.0, + "step": 11267 + }, + { + "epoch": 1.433405419157868, + "ewc_loss": 0.04104135185480118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0520676116575487e-05, + "grad_norm": 27.776079177856445, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8520311117172241, + "num_tokens": 430017909.0, + "step": 11268 + }, + { + "epoch": 1.4335326294364585, + "ewc_loss": 0.04099076986312866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0495384887908585e-05, + "grad_norm": 27.820161819458008, + "learning_rate": 1e-06, + "loss": 0.5416, + "mean_token_accuracy": 0.8401719927787781, + "num_tokens": 430050641.0, + "step": 11269 + }, + { + "epoch": 1.433659839715049, + "ewc_loss": 0.04099826514720917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.049913200607989e-05, + "grad_norm": 27.886070251464844, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8710870742797852, + "num_tokens": 430082985.0, + "step": 11270 + }, + { + "epoch": 1.4337870499936396, + "ewc_loss": 0.04100235179066658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0501176550169475e-05, + "grad_norm": 27.68263816833496, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8666610717773438, + "num_tokens": 430122340.0, + "step": 11271 + }, + { + "epoch": 1.4339142602722301, + "ewc_loss": 0.04103616252541542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0518080418696627e-05, + "grad_norm": 27.97223663330078, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8523492813110352, + "num_tokens": 430162140.0, + "step": 11272 + }, + { + "epoch": 1.4340414705508204, + "ewc_loss": 0.0410207062959671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0510353351710364e-05, + "grad_norm": 27.687471389770508, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8773002028465271, + "num_tokens": 430201672.0, + "step": 11273 + }, + { + "epoch": 1.434168680829411, + "ewc_loss": 0.04087122157216072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0435611077118665e-05, + "grad_norm": 27.8468074798584, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 0.8565856218338013, + "num_tokens": 430239760.0, + "step": 11274 + }, + { + "epoch": 1.4342958911080015, + "ewc_loss": 0.04101041331887245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0505207430687733e-05, + "grad_norm": 27.645870208740234, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.849240779876709, + "num_tokens": 430280527.0, + "step": 11275 + }, + { + "epoch": 1.434423101386592, + "ewc_loss": 0.04093442112207413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0467210561037064e-05, + "grad_norm": 28.269298553466797, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8620477318763733, + "num_tokens": 430312788.0, + "step": 11276 + }, + { + "epoch": 1.4345503116651825, + "ewc_loss": 0.041053734719753265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0526867956505157e-05, + "grad_norm": 27.753955841064453, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8732305765151978, + "num_tokens": 430348044.0, + "step": 11277 + }, + { + "epoch": 1.434677521943773, + "ewc_loss": 0.040771253407001495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.038562706729863e-05, + "grad_norm": 27.814523696899414, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8824684023857117, + "num_tokens": 430385676.0, + "step": 11278 + }, + { + "epoch": 1.4348047322223636, + "ewc_loss": 0.04100862145423889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0504310668911785e-05, + "grad_norm": 27.73370933532715, + "learning_rate": 1e-06, + "loss": 0.4385, + 
"mean_token_accuracy": 0.8633326888084412, + "num_tokens": 430416976.0, + "step": 11279 + }, + { + "epoch": 1.4349319425009541, + "ewc_loss": 0.04088539630174637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044269785983488e-05, + "grad_norm": 27.839941024780273, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8774893283843994, + "num_tokens": 430460002.0, + "step": 11280 + }, + { + "epoch": 1.4350591527795447, + "ewc_loss": 0.04088655486702919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.044327811745461e-05, + "grad_norm": 27.653352737426758, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8450660705566406, + "num_tokens": 430503172.0, + "step": 11281 + }, + { + "epoch": 1.435186363058135, + "ewc_loss": 0.040935780853033066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.046789086307399e-05, + "grad_norm": 27.89091682434082, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8535382747650146, + "num_tokens": 430541221.0, + "step": 11282 + }, + { + "epoch": 1.4353135733367255, + "ewc_loss": 0.04090951755642891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0454759578569792e-05, + "grad_norm": 27.656970977783203, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8590492606163025, + "num_tokens": 430579527.0, + "step": 11283 + }, + { + "epoch": 1.435440783615316, + "ewc_loss": 0.040852803736925125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0426401533768512e-05, + "grad_norm": 27.911222457885742, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8675417900085449, + "num_tokens": 430617263.0, + "step": 11284 + }, + { + "epoch": 1.4355679938939065, + "ewc_loss": 0.04106210917234421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.053105527011212e-05, + "grad_norm": 27.71183204650879, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8789581060409546, + "num_tokens": 430649961.0, + "step": 11285 + }, + { + "epoch": 
1.435695204172497, + "ewc_loss": 0.04085351526737213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0426758055691607e-05, + "grad_norm": 27.85830307006836, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8610438704490662, + "num_tokens": 430682599.0, + "step": 11286 + }, + { + "epoch": 1.4358224144510876, + "ewc_loss": 0.04110652208328247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0553261492750607e-05, + "grad_norm": 27.79488182067871, + "learning_rate": 1e-06, + "loss": 0.52, + "mean_token_accuracy": 0.8383578062057495, + "num_tokens": 430721014.0, + "step": 11287 + }, + { + "epoch": 1.4359496247296781, + "ewc_loss": 0.040894173085689545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0447087081265636e-05, + "grad_norm": 27.72170639038086, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8640934228897095, + "num_tokens": 430757014.0, + "step": 11288 + }, + { + "epoch": 1.4360768350082687, + "ewc_loss": 0.041007157415151596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.050357943517156e-05, + "grad_norm": 27.65930938720703, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.865759551525116, + "num_tokens": 430796169.0, + "step": 11289 + }, + { + "epoch": 1.4362040452868592, + "ewc_loss": 0.040985483676195145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0492741896305233e-05, + "grad_norm": 27.763330459594727, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8646291494369507, + "num_tokens": 430832726.0, + "step": 11290 + }, + { + "epoch": 1.4363312555654497, + "ewc_loss": 0.041112542152404785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0556271920213476e-05, + "grad_norm": 27.738908767700195, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8537507057189941, + "num_tokens": 430866343.0, + "step": 11291 + }, + { + "epoch": 1.4364584658440402, + "ewc_loss": 0.041052915155887604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0526456864899956e-05, + "grad_norm": 27.625185012817383, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8719677925109863, + "num_tokens": 430901622.0, + "step": 11292 + }, + { + "epoch": 1.4365856761226308, + "ewc_loss": 0.04112115502357483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.056057746813167e-05, + "grad_norm": 27.836837768554688, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.879372239112854, + "num_tokens": 430937499.0, + "step": 11293 + }, + { + "epoch": 1.4367128864012213, + "ewc_loss": 0.041151393204927444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0575696908053942e-05, + "grad_norm": 27.71879768371582, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8530449867248535, + "num_tokens": 430974048.0, + "step": 11294 + }, + { + "epoch": 1.4368400966798118, + "ewc_loss": 0.04106162115931511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.053080970654264e-05, + "grad_norm": 27.781455993652344, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8736734390258789, + "num_tokens": 431010927.0, + "step": 11295 + }, + { + "epoch": 1.4369673069584024, + "ewc_loss": 0.04111073911190033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0555369701469317e-05, + "grad_norm": 27.717897415161133, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8502812385559082, + "num_tokens": 431051731.0, + "step": 11296 + }, + { + "epoch": 1.4370945172369929, + "ewc_loss": 0.041064679622650146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0532339476631023e-05, + "grad_norm": 27.761014938354492, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8807669878005981, + "num_tokens": 431087876.0, + "step": 11297 + }, + { + "epoch": 1.4372217275155832, + "ewc_loss": 0.04108112305402756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.054056130873505e-05, + "grad_norm": 27.719440460205078, + "learning_rate": 1e-06, + "loss": 0.4533, + 
"mean_token_accuracy": 0.8596780896186829, + "num_tokens": 431126217.0, + "step": 11298 + }, + { + "epoch": 1.4373489377941737, + "ewc_loss": 0.04105450585484505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.052725358225871e-05, + "grad_norm": 27.639354705810547, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8664356470108032, + "num_tokens": 431168049.0, + "step": 11299 + }, + { + "epoch": 1.4374761480727642, + "ewc_loss": 0.041111111640930176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.055555523838848e-05, + "grad_norm": 27.678050994873047, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8639161586761475, + "num_tokens": 431211143.0, + "step": 11300 + }, + { + "epoch": 1.4376033583513548, + "ewc_loss": 0.04110020399093628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0550101908156648e-05, + "grad_norm": 27.674184799194336, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8735731840133667, + "num_tokens": 431255747.0, + "step": 11301 + }, + { + "epoch": 1.4377305686299453, + "ewc_loss": 0.041147880256175995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0573939764290117e-05, + "grad_norm": 27.676727294921875, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8683688640594482, + "num_tokens": 431296618.0, + "step": 11302 + }, + { + "epoch": 1.4378577789085358, + "ewc_loss": 0.04110823944211006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.055412005574908e-05, + "grad_norm": 27.694429397583008, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8680250644683838, + "num_tokens": 431336493.0, + "step": 11303 + }, + { + "epoch": 1.4379849891871264, + "ewc_loss": 0.04115380719304085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0576902898028493e-05, + "grad_norm": 27.759414672851562, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8639639616012573, + "num_tokens": 431376200.0, + "step": 11304 + }, + { + "epoch": 
1.438112199465717, + "ewc_loss": 0.04104229807853699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.052114905382041e-05, + "grad_norm": 27.624753952026367, + "learning_rate": 1e-06, + "loss": 0.501, + "mean_token_accuracy": 0.846199631690979, + "num_tokens": 431414015.0, + "step": 11305 + }, + { + "epoch": 1.4382394097443074, + "ewc_loss": 0.04109165072441101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0545825464068912e-05, + "grad_norm": 27.858144760131836, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8612618446350098, + "num_tokens": 431453498.0, + "step": 11306 + }, + { + "epoch": 1.4383666200228977, + "ewc_loss": 0.04110938683152199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0554693037411198e-05, + "grad_norm": 27.685890197753906, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8645989894866943, + "num_tokens": 431491126.0, + "step": 11307 + }, + { + "epoch": 1.4384938303014883, + "ewc_loss": 0.04113401845097542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0567009414662607e-05, + "grad_norm": 27.757442474365234, + "learning_rate": 1e-06, + "loss": 0.3596, + "mean_token_accuracy": 0.8881239891052246, + "num_tokens": 431528549.0, + "step": 11308 + }, + { + "epoch": 1.4386210405800788, + "ewc_loss": 0.041072096675634384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0536048396024853e-05, + "grad_norm": 27.85386085510254, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8720678091049194, + "num_tokens": 431569435.0, + "step": 11309 + }, + { + "epoch": 1.4387482508586693, + "ewc_loss": 0.04106379300355911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0531895643216558e-05, + "grad_norm": 27.736886978149414, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8706271648406982, + "num_tokens": 431606125.0, + "step": 11310 + }, + { + "epoch": 1.4388754611372598, + "ewc_loss": 0.04105987772345543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0529938410618342e-05, + "grad_norm": 27.793472290039062, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8610517382621765, + "num_tokens": 431643517.0, + "step": 11311 + }, + { + "epoch": 1.4390026714158504, + "ewc_loss": 0.04099581018090248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.04979060072219e-05, + "grad_norm": 27.612865447998047, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8753944039344788, + "num_tokens": 431682058.0, + "step": 11312 + }, + { + "epoch": 1.439129881694441, + "ewc_loss": 0.041050054132938385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.052502713922877e-05, + "grad_norm": 27.753475189208984, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8773415684700012, + "num_tokens": 431721241.0, + "step": 11313 + }, + { + "epoch": 1.4392570919730314, + "ewc_loss": 0.041100744158029556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0550372937577777e-05, + "grad_norm": 27.744714736938477, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8779851198196411, + "num_tokens": 431757809.0, + "step": 11314 + }, + { + "epoch": 1.439384302251622, + "ewc_loss": 0.04096253216266632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0481265892158262e-05, + "grad_norm": 27.69472312927246, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8548532724380493, + "num_tokens": 431795931.0, + "step": 11315 + }, + { + "epoch": 1.4395115125302125, + "ewc_loss": 0.04103850945830345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0519255485851318e-05, + "grad_norm": 27.89145851135254, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8756008744239807, + "num_tokens": 431830517.0, + "step": 11316 + }, + { + "epoch": 1.439638722808803, + "ewc_loss": 0.04103695601224899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0518478777376004e-05, + "grad_norm": 27.75333595275879, + "learning_rate": 1e-06, + "loss": 0.4304, + 
"mean_token_accuracy": 0.8674623966217041, + "num_tokens": 431866047.0, + "step": 11317 + }, + { + "epoch": 1.4397659330873935, + "ewc_loss": 0.04100778326392174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.050389230134897e-05, + "grad_norm": 27.85769271850586, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8545833230018616, + "num_tokens": 431901602.0, + "step": 11318 + }, + { + "epoch": 1.439893143365984, + "ewc_loss": 0.04108026996254921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.054013566521462e-05, + "grad_norm": 27.740995407104492, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8617568016052246, + "num_tokens": 431934989.0, + "step": 11319 + }, + { + "epoch": 1.4400203536445746, + "ewc_loss": 0.040876273065805435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0438135834410787e-05, + "grad_norm": 27.628005981445312, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8708699941635132, + "num_tokens": 431968664.0, + "step": 11320 + }, + { + "epoch": 1.4401475639231651, + "ewc_loss": 0.04109182208776474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.054591095657088e-05, + "grad_norm": 27.774723052978516, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8767422437667847, + "num_tokens": 432008025.0, + "step": 11321 + }, + { + "epoch": 1.4402747742017554, + "ewc_loss": 0.041076067835092545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0538034732453525e-05, + "grad_norm": 27.85660171508789, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8691925406455994, + "num_tokens": 432043526.0, + "step": 11322 + }, + { + "epoch": 1.440401984480346, + "ewc_loss": 0.0410282164812088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0514107745839283e-05, + "grad_norm": 27.853782653808594, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8676828145980835, + "num_tokens": 432085175.0, + "step": 11323 + }, + { + "epoch": 
1.4405291947589365, + "ewc_loss": 0.04104039445519447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0520197722362354e-05, + "grad_norm": 27.814645767211914, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8570883870124817, + "num_tokens": 432122379.0, + "step": 11324 + }, + { + "epoch": 1.440656405037527, + "ewc_loss": 0.041019901633262634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.050995135505218e-05, + "grad_norm": 27.788965225219727, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8683205842971802, + "num_tokens": 432157690.0, + "step": 11325 + }, + { + "epoch": 1.4407836153161175, + "ewc_loss": 0.04105565696954727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.052782838291023e-05, + "grad_norm": 27.938690185546875, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8742977380752563, + "num_tokens": 432188885.0, + "step": 11326 + }, + { + "epoch": 1.440910825594708, + "ewc_loss": 0.04102614149451256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0513070921879262e-05, + "grad_norm": 27.76187515258789, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8666912317276001, + "num_tokens": 432228804.0, + "step": 11327 + }, + { + "epoch": 1.4410380358732986, + "ewc_loss": 0.04097621142864227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0488105292315595e-05, + "grad_norm": 27.879491806030273, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8483008146286011, + "num_tokens": 432264150.0, + "step": 11328 + }, + { + "epoch": 1.4411652461518891, + "ewc_loss": 0.04097852110862732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0489260350586846e-05, + "grad_norm": 27.68109130859375, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8587501645088196, + "num_tokens": 432306759.0, + "step": 11329 + }, + { + "epoch": 1.4412924564304797, + "ewc_loss": 0.041014302521944046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0507151930360124e-05, + "grad_norm": 27.8193359375, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.876240611076355, + "num_tokens": 432346928.0, + "step": 11330 + }, + { + "epoch": 1.44141966670907, + "ewc_loss": 0.041030265390872955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.051513183687348e-05, + "grad_norm": 27.827184677124023, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8731244802474976, + "num_tokens": 432386630.0, + "step": 11331 + }, + { + "epoch": 1.4415468769876605, + "ewc_loss": 0.04110568389296532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.055284130619839e-05, + "grad_norm": 27.82655906677246, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8539186120033264, + "num_tokens": 432427331.0, + "step": 11332 + }, + { + "epoch": 1.441674087266251, + "ewc_loss": 0.040994882583618164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0497442164923996e-05, + "grad_norm": 27.890975952148438, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8584607839584351, + "num_tokens": 432462901.0, + "step": 11333 + }, + { + "epoch": 1.4418012975448415, + "ewc_loss": 0.04108187183737755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.054093602055218e-05, + "grad_norm": 27.83814239501953, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8619806170463562, + "num_tokens": 432494232.0, + "step": 11334 + }, + { + "epoch": 1.441928507823432, + "ewc_loss": 0.04095388948917389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0476943973335437e-05, + "grad_norm": 27.863794326782227, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8785791993141174, + "num_tokens": 432534276.0, + "step": 11335 + }, + { + "epoch": 1.4420557181020226, + "ewc_loss": 0.04104917123913765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0524585124803707e-05, + "grad_norm": 27.74102783203125, + "learning_rate": 1e-06, + "loss": 0.4504, + 
"mean_token_accuracy": 0.8573809862136841, + "num_tokens": 432577337.0, + "step": 11336 + }, + { + "epoch": 1.4421829283806131, + "ewc_loss": 0.040984146296978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.049207250820473e-05, + "grad_norm": 27.724369049072266, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.856920599937439, + "num_tokens": 432614290.0, + "step": 11337 + }, + { + "epoch": 1.4423101386592037, + "ewc_loss": 0.041070062667131424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.053503158094827e-05, + "grad_norm": 27.770431518554688, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8671107292175293, + "num_tokens": 432648558.0, + "step": 11338 + }, + { + "epoch": 1.4424373489377942, + "ewc_loss": 0.041002169251441956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0501083781709895e-05, + "grad_norm": 27.812000274658203, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8398288488388062, + "num_tokens": 432684243.0, + "step": 11339 + }, + { + "epoch": 1.4425645592163847, + "ewc_loss": 0.041061971336603165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0530986148514785e-05, + "grad_norm": 27.842731475830078, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8649446964263916, + "num_tokens": 432719355.0, + "step": 11340 + }, + { + "epoch": 1.4426917694949752, + "ewc_loss": 0.04097378998994827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0486895664362237e-05, + "grad_norm": 27.655994415283203, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8657860159873962, + "num_tokens": 432758675.0, + "step": 11341 + }, + { + "epoch": 1.4428189797735658, + "ewc_loss": 0.04107256233692169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0536281226668507e-05, + "grad_norm": 27.80381202697754, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8628278374671936, + "num_tokens": 432801677.0, + "step": 11342 + }, + { + "epoch": 
1.4429461900521563, + "ewc_loss": 0.04116234928369522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.058117388514802e-05, + "grad_norm": 27.75259017944336, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8633809089660645, + "num_tokens": 432836299.0, + "step": 11343 + }, + { + "epoch": 1.4430734003307468, + "ewc_loss": 0.04109065607190132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0545328879961744e-05, + "grad_norm": 27.763690948486328, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8644118905067444, + "num_tokens": 432868838.0, + "step": 11344 + }, + { + "epoch": 1.4432006106093374, + "ewc_loss": 0.04113118350505829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0565592421917245e-05, + "grad_norm": 27.77872085571289, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8673189878463745, + "num_tokens": 432904895.0, + "step": 11345 + }, + { + "epoch": 1.4433278208879279, + "ewc_loss": 0.041107144206762314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0553572539938614e-05, + "grad_norm": 27.677785873413086, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8600603342056274, + "num_tokens": 432944785.0, + "step": 11346 + }, + { + "epoch": 1.4434550311665182, + "ewc_loss": 0.041108131408691406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0554065486066975e-05, + "grad_norm": 27.839590072631836, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8548802733421326, + "num_tokens": 432975645.0, + "step": 11347 + }, + { + "epoch": 1.4435822414451087, + "ewc_loss": 0.041191939264535904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.059596954495646e-05, + "grad_norm": 27.76157569885254, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8604215979576111, + "num_tokens": 433016887.0, + "step": 11348 + }, + { + "epoch": 1.4437094517236992, + "ewc_loss": 0.04111139476299286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0555697119561955e-05, + "grad_norm": 27.70731544494629, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8680312037467957, + "num_tokens": 433053666.0, + "step": 11349 + }, + { + "epoch": 1.4438366620022898, + "ewc_loss": 0.041212428361177444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.060621409327723e-05, + "grad_norm": 27.744131088256836, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.859870433807373, + "num_tokens": 433101131.0, + "step": 11350 + }, + { + "epoch": 1.4439638722808803, + "ewc_loss": 0.04118873551487923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0594367015291937e-05, + "grad_norm": 27.77579116821289, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8730829954147339, + "num_tokens": 433132535.0, + "step": 11351 + }, + { + "epoch": 1.4440910825594708, + "ewc_loss": 0.04124930500984192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0624653188860975e-05, + "grad_norm": 27.827577590942383, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8507610559463501, + "num_tokens": 433168229.0, + "step": 11352 + }, + { + "epoch": 1.4442182928380614, + "ewc_loss": 0.04118376225233078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.059188045677729e-05, + "grad_norm": 27.75233268737793, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8707435727119446, + "num_tokens": 433206094.0, + "step": 11353 + }, + { + "epoch": 1.4443455031166519, + "ewc_loss": 0.04114281386137009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.057140773104038e-05, + "grad_norm": 27.719322204589844, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8531577587127686, + "num_tokens": 433241038.0, + "step": 11354 + }, + { + "epoch": 1.4444727133952424, + "ewc_loss": 0.04117871820926666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0589359337463975e-05, + "grad_norm": 27.80903434753418, + "learning_rate": 1e-06, + "loss": 0.477, + 
"mean_token_accuracy": 0.8571840524673462, + "num_tokens": 433274674.0, + "step": 11355 + }, + { + "epoch": 1.4445999236738327, + "ewc_loss": 0.041149210184812546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0574605514411815e-05, + "grad_norm": 27.777687072753906, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8674508333206177, + "num_tokens": 433314511.0, + "step": 11356 + }, + { + "epoch": 1.4447271339524232, + "ewc_loss": 0.04115460067987442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.057730125670787e-05, + "grad_norm": 27.75617027282715, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8669707775115967, + "num_tokens": 433349338.0, + "step": 11357 + }, + { + "epoch": 1.4448543442310138, + "ewc_loss": 0.04110332205891609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0551660782075487e-05, + "grad_norm": 27.743896484375, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.875786542892456, + "num_tokens": 433382609.0, + "step": 11358 + }, + { + "epoch": 1.4449815545096043, + "ewc_loss": 0.04121212661266327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0606063117156737e-05, + "grad_norm": 27.894855499267578, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8804683089256287, + "num_tokens": 433411553.0, + "step": 11359 + }, + { + "epoch": 1.4451087647881948, + "ewc_loss": 0.04121106490492821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.06055319722509e-05, + "grad_norm": 27.750091552734375, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8631526827812195, + "num_tokens": 433450571.0, + "step": 11360 + }, + { + "epoch": 1.4452359750667854, + "ewc_loss": 0.04111998900771141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0559995391522534e-05, + "grad_norm": 27.77657127380371, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.861213207244873, + "num_tokens": 433486994.0, + "step": 11361 + }, + { + "epoch": 
1.445363185345376, + "ewc_loss": 0.04119565710425377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0597828552126884e-05, + "grad_norm": 27.739398956298828, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.878021776676178, + "num_tokens": 433521407.0, + "step": 11362 + }, + { + "epoch": 1.4454903956239664, + "ewc_loss": 0.04121199622750282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.060599763353821e-05, + "grad_norm": 27.796600341796875, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.876396119594574, + "num_tokens": 433561338.0, + "step": 11363 + }, + { + "epoch": 1.445617605902557, + "ewc_loss": 0.04120627045631409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0603134544217028e-05, + "grad_norm": 27.666553497314453, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8736082911491394, + "num_tokens": 433598892.0, + "step": 11364 + }, + { + "epoch": 1.4457448161811475, + "ewc_loss": 0.041220083832740784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.061004124698229e-05, + "grad_norm": 27.8439884185791, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8552929162979126, + "num_tokens": 433641423.0, + "step": 11365 + }, + { + "epoch": 1.445872026459738, + "ewc_loss": 0.04123653098940849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0618264898075722e-05, + "grad_norm": 27.8758487701416, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8691561222076416, + "num_tokens": 433677385.0, + "step": 11366 + }, + { + "epoch": 1.4459992367383285, + "ewc_loss": 0.04115554317831993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0577772374963388e-05, + "grad_norm": 27.758901596069336, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8641500473022461, + "num_tokens": 433716901.0, + "step": 11367 + }, + { + "epoch": 1.446126447016919, + "ewc_loss": 0.04118099808692932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0590499843819998e-05, + "grad_norm": 27.844465255737305, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8609142303466797, + "num_tokens": 433755099.0, + "step": 11368 + }, + { + "epoch": 1.4462536572955096, + "ewc_loss": 0.041187599301338196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.059379949059803e-05, + "grad_norm": 27.763141632080078, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8616480827331543, + "num_tokens": 433794012.0, + "step": 11369 + }, + { + "epoch": 1.4463808675741001, + "ewc_loss": 0.041162554174661636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.058127756754402e-05, + "grad_norm": 27.81029510498047, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8686383366584778, + "num_tokens": 433831965.0, + "step": 11370 + }, + { + "epoch": 1.4465080778526904, + "ewc_loss": 0.041144371032714844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.05721862585051e-05, + "grad_norm": 27.78347396850586, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8662827014923096, + "num_tokens": 433870819.0, + "step": 11371 + }, + { + "epoch": 1.446635288131281, + "ewc_loss": 0.04124310240149498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.062155181192793e-05, + "grad_norm": 27.758378982543945, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8602960705757141, + "num_tokens": 433909197.0, + "step": 11372 + }, + { + "epoch": 1.4467624984098715, + "ewc_loss": 0.04119515419006348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0597577531589195e-05, + "grad_norm": 27.795120239257812, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8724757432937622, + "num_tokens": 433946026.0, + "step": 11373 + }, + { + "epoch": 1.446889708688462, + "ewc_loss": 0.041101183742284775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0550591216306202e-05, + "grad_norm": 27.716819763183594, + "learning_rate": 1e-06, + "loss": 0.3698, + 
"mean_token_accuracy": 0.8872185945510864, + "num_tokens": 433986239.0, + "step": 11374 + }, + { + "epoch": 1.4470169189670525, + "ewc_loss": 0.0412386916577816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.061934537778143e-05, + "grad_norm": 27.93999671936035, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8710222244262695, + "num_tokens": 434022557.0, + "step": 11375 + }, + { + "epoch": 1.447144129245643, + "ewc_loss": 0.0411224402487278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.056121957139112e-05, + "grad_norm": 27.738924026489258, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8697062730789185, + "num_tokens": 434059258.0, + "step": 11376 + }, + { + "epoch": 1.4472713395242336, + "ewc_loss": 0.04107672721147537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0538363969535567e-05, + "grad_norm": 27.828536987304688, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8521420955657959, + "num_tokens": 434094872.0, + "step": 11377 + }, + { + "epoch": 1.4473985498028241, + "ewc_loss": 0.04113903269171715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0569515982060693e-05, + "grad_norm": 27.76607322692871, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8430624604225159, + "num_tokens": 434134842.0, + "step": 11378 + }, + { + "epoch": 1.4475257600814146, + "ewc_loss": 0.041130077093839645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0565037630149163e-05, + "grad_norm": 27.855939865112305, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8770566582679749, + "num_tokens": 434173152.0, + "step": 11379 + }, + { + "epoch": 1.447652970360005, + "ewc_loss": 0.04118206351995468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0591030988725834e-05, + "grad_norm": 27.801040649414062, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8679988384246826, + "num_tokens": 434208318.0, + "step": 11380 + }, + { + "epoch": 
1.4477801806385955, + "ewc_loss": 0.041035205125808716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0517602024483494e-05, + "grad_norm": 27.948806762695312, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8631856441497803, + "num_tokens": 434243693.0, + "step": 11381 + }, + { + "epoch": 1.447907390917186, + "ewc_loss": 0.041068099439144135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0534049326670356e-05, + "grad_norm": 27.725427627563477, + "learning_rate": 1e-06, + "loss": 0.4848, + "mean_token_accuracy": 0.853758692741394, + "num_tokens": 434283172.0, + "step": 11382 + }, + { + "epoch": 1.4480346011957765, + "ewc_loss": 0.040996573865413666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.049828617600724e-05, + "grad_norm": 27.925607681274414, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8667064905166626, + "num_tokens": 434318789.0, + "step": 11383 + }, + { + "epoch": 1.448161811474367, + "ewc_loss": 0.041227247565984726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0613624656107277e-05, + "grad_norm": 27.966960906982422, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8585905432701111, + "num_tokens": 434358943.0, + "step": 11384 + }, + { + "epoch": 1.4482890217529576, + "ewc_loss": 0.040937818586826324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0468909497139975e-05, + "grad_norm": 27.736562728881836, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.866681694984436, + "num_tokens": 434398264.0, + "step": 11385 + }, + { + "epoch": 1.4484162320315481, + "ewc_loss": 0.04103119671344757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0515597498160787e-05, + "grad_norm": 27.877981185913086, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8756217956542969, + "num_tokens": 434433602.0, + "step": 11386 + }, + { + "epoch": 1.4485434423101387, + "ewc_loss": 0.04113951325416565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.056975608866196e-05, + "grad_norm": 27.938060760498047, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8594821691513062, + "num_tokens": 434480001.0, + "step": 11387 + }, + { + "epoch": 1.4486706525887292, + "ewc_loss": 0.04105990007519722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0529949324554764e-05, + "grad_norm": 27.798120498657227, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8806272745132446, + "num_tokens": 434519285.0, + "step": 11388 + }, + { + "epoch": 1.4487978628673197, + "ewc_loss": 0.04104284569621086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0521423721220344e-05, + "grad_norm": 27.78924560546875, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.865583062171936, + "num_tokens": 434556118.0, + "step": 11389 + }, + { + "epoch": 1.4489250731459102, + "ewc_loss": 0.041141800582408905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.057090023299679e-05, + "grad_norm": 27.953338623046875, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8503199219703674, + "num_tokens": 434595350.0, + "step": 11390 + }, + { + "epoch": 1.4490522834245008, + "ewc_loss": 0.041110385209321976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0555193259497173e-05, + "grad_norm": 27.824127197265625, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8635786771774292, + "num_tokens": 434632988.0, + "step": 11391 + }, + { + "epoch": 1.4491794937030913, + "ewc_loss": 0.04097048565745354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0485242202994414e-05, + "grad_norm": 27.8262939453125, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8732045292854309, + "num_tokens": 434673040.0, + "step": 11392 + }, + { + "epoch": 1.4493067039816818, + "ewc_loss": 0.041076552122831345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0538276658044197e-05, + "grad_norm": 27.731361389160156, + "learning_rate": 1e-06, + "loss": 0.3858, + 
"mean_token_accuracy": 0.8813265562057495, + "num_tokens": 434712201.0, + "step": 11393 + }, + { + "epoch": 1.4494339142602723, + "ewc_loss": 0.04100275784730911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.050137845799327e-05, + "grad_norm": 27.83609390258789, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8798651695251465, + "num_tokens": 434756676.0, + "step": 11394 + }, + { + "epoch": 1.4495611245388629, + "ewc_loss": 0.04113929718732834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0569648768287152e-05, + "grad_norm": 27.777057647705078, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8691344261169434, + "num_tokens": 434797890.0, + "step": 11395 + }, + { + "epoch": 1.4496883348174532, + "ewc_loss": 0.04107679799199104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0538398530334234e-05, + "grad_norm": 27.919260025024414, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8785223364830017, + "num_tokens": 434836933.0, + "step": 11396 + }, + { + "epoch": 1.4498155450960437, + "ewc_loss": 0.041008226573467255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0504114218056202e-05, + "grad_norm": 27.755046844482422, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8796941637992859, + "num_tokens": 434868098.0, + "step": 11397 + }, + { + "epoch": 1.4499427553746342, + "ewc_loss": 0.041042767465114594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0521383703453466e-05, + "grad_norm": 27.805620193481445, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.860417902469635, + "num_tokens": 434907239.0, + "step": 11398 + }, + { + "epoch": 1.4500699656532248, + "ewc_loss": 0.041066691279411316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0533345377771184e-05, + "grad_norm": 27.723751068115234, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8722099661827087, + "num_tokens": 434945672.0, + "step": 11399 + }, + { + "epoch": 
1.4501971759318153, + "ewc_loss": 0.04104262962937355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.052131458185613e-05, + "grad_norm": 27.77950096130371, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8565175533294678, + "num_tokens": 434981693.0, + "step": 11400 + }, + { + "epoch": 1.4503243862104058, + "ewc_loss": 0.04116375744342804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.058187783404719e-05, + "grad_norm": 27.761943817138672, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8880265355110168, + "num_tokens": 435012452.0, + "step": 11401 + }, + { + "epoch": 1.4504515964889964, + "ewc_loss": 0.04105120897293091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0525603758869693e-05, + "grad_norm": 27.825326919555664, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8594367504119873, + "num_tokens": 435052711.0, + "step": 11402 + }, + { + "epoch": 1.4505788067675869, + "ewc_loss": 0.04116678237915039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.058339123323094e-05, + "grad_norm": 27.757675170898438, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8638536334037781, + "num_tokens": 435092766.0, + "step": 11403 + }, + { + "epoch": 1.4507060170461774, + "ewc_loss": 0.04106132313609123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0530662368400954e-05, + "grad_norm": 27.82901954650879, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8515831232070923, + "num_tokens": 435132093.0, + "step": 11404 + }, + { + "epoch": 1.4508332273247677, + "ewc_loss": 0.041131794452667236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.056589801213704e-05, + "grad_norm": 27.66121482849121, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8624743223190308, + "num_tokens": 435169071.0, + "step": 11405 + }, + { + "epoch": 1.4509604376033582, + "ewc_loss": 0.04109147563576698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0545738152577542e-05, + "grad_norm": 27.922351837158203, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8758520483970642, + "num_tokens": 435202665.0, + "step": 11406 + }, + { + "epoch": 1.4510876478819488, + "ewc_loss": 0.04122472181916237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0612360458471812e-05, + "grad_norm": 27.815526962280273, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8805702328681946, + "num_tokens": 435234719.0, + "step": 11407 + }, + { + "epoch": 1.4512148581605393, + "ewc_loss": 0.041065946221351624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0532972484943457e-05, + "grad_norm": 27.815128326416016, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.880731463432312, + "num_tokens": 435267090.0, + "step": 11408 + }, + { + "epoch": 1.4513420684391298, + "ewc_loss": 0.041147056967020035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0573528672684915e-05, + "grad_norm": 27.70587730407715, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8740899562835693, + "num_tokens": 435307243.0, + "step": 11409 + }, + { + "epoch": 1.4514692787177204, + "ewc_loss": 0.041155390441417694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.057769597740844e-05, + "grad_norm": 27.69394302368164, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8734548687934875, + "num_tokens": 435347507.0, + "step": 11410 + }, + { + "epoch": 1.4515964889963109, + "ewc_loss": 0.04121876135468483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0609380953828804e-05, + "grad_norm": 27.785451889038086, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8569003939628601, + "num_tokens": 435383499.0, + "step": 11411 + }, + { + "epoch": 1.4517236992749014, + "ewc_loss": 0.04123888164758682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0619439965230413e-05, + "grad_norm": 27.73554229736328, + "learning_rate": 1e-06, + "loss": 0.4349, + 
"mean_token_accuracy": 0.8666645288467407, + "num_tokens": 435423500.0, + "step": 11412 + }, + { + "epoch": 1.451850909553492, + "ewc_loss": 0.04124797508120537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0623987438739277e-05, + "grad_norm": 27.82761573791504, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8705506324768066, + "num_tokens": 435457075.0, + "step": 11413 + }, + { + "epoch": 1.4519781198320825, + "ewc_loss": 0.0412704236805439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.063521242234856e-05, + "grad_norm": 27.74049949645996, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8652580976486206, + "num_tokens": 435498933.0, + "step": 11414 + }, + { + "epoch": 1.452105330110673, + "ewc_loss": 0.04121045768260956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.060522820102051e-05, + "grad_norm": 27.709026336669922, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8581334948539734, + "num_tokens": 435535949.0, + "step": 11415 + }, + { + "epoch": 1.4522325403892635, + "ewc_loss": 0.04131164029240608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.065581975330133e-05, + "grad_norm": 27.7924861907959, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.859359085559845, + "num_tokens": 435565252.0, + "step": 11416 + }, + { + "epoch": 1.452359750667854, + "ewc_loss": 0.04130549728870392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0652749299188145e-05, + "grad_norm": 27.84364891052246, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8477490544319153, + "num_tokens": 435604663.0, + "step": 11417 + }, + { + "epoch": 1.4524869609464446, + "ewc_loss": 0.041245073080062866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0622535885195248e-05, + "grad_norm": 27.832250595092773, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.860986053943634, + "num_tokens": 435640883.0, + "step": 11418 + }, + { + "epoch": 
1.4526141712250351, + "ewc_loss": 0.041272517293691635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0636258341255598e-05, + "grad_norm": 27.727689743041992, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.869397759437561, + "num_tokens": 435682787.0, + "step": 11419 + }, + { + "epoch": 1.4527413815036254, + "ewc_loss": 0.041335225105285645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0667612261604518e-05, + "grad_norm": 27.94502830505371, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8639082908630371, + "num_tokens": 435719982.0, + "step": 11420 + }, + { + "epoch": 1.452868591782216, + "ewc_loss": 0.04132300987839699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.066150409518741e-05, + "grad_norm": 27.778642654418945, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8486095666885376, + "num_tokens": 435755153.0, + "step": 11421 + }, + { + "epoch": 1.4529958020608065, + "ewc_loss": 0.041234999895095825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.061749910353683e-05, + "grad_norm": 27.842926025390625, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8752511143684387, + "num_tokens": 435797513.0, + "step": 11422 + }, + { + "epoch": 1.453123012339397, + "ewc_loss": 0.04129372164607048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0646861230488867e-05, + "grad_norm": 27.782894134521484, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8653792142868042, + "num_tokens": 435836835.0, + "step": 11423 + }, + { + "epoch": 1.4532502226179875, + "ewc_loss": 0.04126106575131416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0630532162613235e-05, + "grad_norm": 27.82752799987793, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8572773933410645, + "num_tokens": 435875124.0, + "step": 11424 + }, + { + "epoch": 1.453377432896578, + "ewc_loss": 0.041332706809043884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0666353520937264e-05, + "grad_norm": 27.86143684387207, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8774702548980713, + "num_tokens": 435914993.0, + "step": 11425 + }, + { + "epoch": 1.4535046431751686, + "ewc_loss": 0.04120504856109619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0602523363777436e-05, + "grad_norm": 27.797269821166992, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8737077713012695, + "num_tokens": 435950916.0, + "step": 11426 + }, + { + "epoch": 1.4536318534537591, + "ewc_loss": 0.041288554668426514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0644276446546428e-05, + "grad_norm": 27.816715240478516, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.861055850982666, + "num_tokens": 435991986.0, + "step": 11427 + }, + { + "epoch": 1.4537590637323496, + "ewc_loss": 0.04127456620335579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0637282432289794e-05, + "grad_norm": 27.814865112304688, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8741795420646667, + "num_tokens": 436024603.0, + "step": 11428 + }, + { + "epoch": 1.45388627401094, + "ewc_loss": 0.041233502328395844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0616751498891972e-05, + "grad_norm": 27.81503677368164, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8615918159484863, + "num_tokens": 436065540.0, + "step": 11429 + }, + { + "epoch": 1.4540134842895305, + "ewc_loss": 0.04126506298780441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0632531231967732e-05, + "grad_norm": 27.85407066345215, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8525362014770508, + "num_tokens": 436104394.0, + "step": 11430 + }, + { + "epoch": 1.454140694568121, + "ewc_loss": 0.04127303883433342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.063652027572971e-05, + "grad_norm": 27.89684295654297, + "learning_rate": 1e-06, + "loss": 0.4912, + 
"mean_token_accuracy": 0.8533926606178284, + "num_tokens": 436139977.0, + "step": 11431 + }, + { + "epoch": 1.4542679048467115, + "ewc_loss": 0.041169144213199615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0584571757353842e-05, + "grad_norm": 27.794979095458984, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8810243606567383, + "num_tokens": 436178047.0, + "step": 11432 + }, + { + "epoch": 1.454395115125302, + "ewc_loss": 0.041159745305776596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0579873307724483e-05, + "grad_norm": 27.75441551208496, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.864072322845459, + "num_tokens": 436218331.0, + "step": 11433 + }, + { + "epoch": 1.4545223254038926, + "ewc_loss": 0.041213758289813995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.060687984339893e-05, + "grad_norm": 27.83713722229004, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8632969856262207, + "num_tokens": 436255663.0, + "step": 11434 + }, + { + "epoch": 1.4546495356824831, + "ewc_loss": 0.04118284955620766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0591423890437e-05, + "grad_norm": 27.77231216430664, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8654810190200806, + "num_tokens": 436301328.0, + "step": 11435 + }, + { + "epoch": 1.4547767459610736, + "ewc_loss": 0.04121864214539528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0609320927178487e-05, + "grad_norm": 28.102025985717773, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8751250505447388, + "num_tokens": 436336487.0, + "step": 11436 + }, + { + "epoch": 1.4549039562396642, + "ewc_loss": 0.04129006713628769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0645033146138303e-05, + "grad_norm": 27.729278564453125, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.868855357170105, + "num_tokens": 436377629.0, + "step": 11437 + }, + { + "epoch": 
1.4550311665182547, + "ewc_loss": 0.04102880135178566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0514400603133254e-05, + "grad_norm": 27.856752395629883, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8804224729537964, + "num_tokens": 436421590.0, + "step": 11438 + }, + { + "epoch": 1.4551583767968452, + "ewc_loss": 0.041173629462718964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0586814571288414e-05, + "grad_norm": 27.74872589111328, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8589697480201721, + "num_tokens": 436461172.0, + "step": 11439 + }, + { + "epoch": 1.4552855870754358, + "ewc_loss": 0.041053805500268936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0526902517303824e-05, + "grad_norm": 28.027610778808594, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.874895453453064, + "num_tokens": 436497536.0, + "step": 11440 + }, + { + "epoch": 1.4554127973540263, + "ewc_loss": 0.04112647473812103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0563236830639653e-05, + "grad_norm": 27.812942504882812, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8632425665855408, + "num_tokens": 436543906.0, + "step": 11441 + }, + { + "epoch": 1.4555400076326168, + "ewc_loss": 0.04106492921710014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0532464986899868e-05, + "grad_norm": 27.83142852783203, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8673021793365479, + "num_tokens": 436579462.0, + "step": 11442 + }, + { + "epoch": 1.4556672179112073, + "ewc_loss": 0.04117630794644356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0588153347489424e-05, + "grad_norm": 27.909744262695312, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8596476316452026, + "num_tokens": 436618840.0, + "step": 11443 + }, + { + "epoch": 1.4557944281897979, + "ewc_loss": 0.04102353751659393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0511768525466323e-05, + "grad_norm": 27.665754318237305, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8551624417304993, + "num_tokens": 436654176.0, + "step": 11444 + }, + { + "epoch": 1.4559216384683882, + "ewc_loss": 0.04106734320521355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.053367097687442e-05, + "grad_norm": 27.873947143554688, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8788658380508423, + "num_tokens": 436685507.0, + "step": 11445 + }, + { + "epoch": 1.4560488487469787, + "ewc_loss": 0.04117425158619881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.058712561847642e-05, + "grad_norm": 27.686656951904297, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8740671873092651, + "num_tokens": 436729881.0, + "step": 11446 + }, + { + "epoch": 1.4561760590255692, + "ewc_loss": 0.041089851409196854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0544925064314157e-05, + "grad_norm": 27.728437423706055, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8655632734298706, + "num_tokens": 436772355.0, + "step": 11447 + }, + { + "epoch": 1.4563032693041598, + "ewc_loss": 0.04117034748196602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0585173842846416e-05, + "grad_norm": 27.77767562866211, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.859699547290802, + "num_tokens": 436808701.0, + "step": 11448 + }, + { + "epoch": 1.4564304795827503, + "ewc_loss": 0.041075173765420914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0537587261060253e-05, + "grad_norm": 27.686302185058594, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.876030683517456, + "num_tokens": 436845441.0, + "step": 11449 + }, + { + "epoch": 1.4565576898613408, + "ewc_loss": 0.04120920971035957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0604604287655093e-05, + "grad_norm": 27.87104034423828, + "learning_rate": 1e-06, + "loss": 0.4153, + 
"mean_token_accuracy": 0.8693495392799377, + "num_tokens": 436885316.0, + "step": 11450 + }, + { + "epoch": 1.4566849001399313, + "ewc_loss": 0.04124071076512337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0620354916900396e-05, + "grad_norm": 27.903982162475586, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8605449795722961, + "num_tokens": 436918315.0, + "step": 11451 + }, + { + "epoch": 1.4568121104185219, + "ewc_loss": 0.041163451969623566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0581726857926697e-05, + "grad_norm": 27.831636428833008, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8574582934379578, + "num_tokens": 436956039.0, + "step": 11452 + }, + { + "epoch": 1.4569393206971124, + "ewc_loss": 0.04119841009378433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.059920552710537e-05, + "grad_norm": 27.996822357177734, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.8510013818740845, + "num_tokens": 436991274.0, + "step": 11453 + }, + { + "epoch": 1.4570665309757027, + "ewc_loss": 0.041138529777526855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0569264961523004e-05, + "grad_norm": 27.821910858154297, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8773852586746216, + "num_tokens": 437029963.0, + "step": 11454 + }, + { + "epoch": 1.4571937412542932, + "ewc_loss": 0.04107994958758354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.053997559414711e-05, + "grad_norm": 27.87754249572754, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8584623336791992, + "num_tokens": 437067030.0, + "step": 11455 + }, + { + "epoch": 1.4573209515328838, + "ewc_loss": 0.04116328805685043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0581643184414133e-05, + "grad_norm": 27.718862533569336, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8732307553291321, + "num_tokens": 437103363.0, + "step": 11456 + }, + { + "epoch": 
1.4574481618114743, + "ewc_loss": 0.041086237877607346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0543118807836436e-05, + "grad_norm": 27.838314056396484, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8730549216270447, + "num_tokens": 437131574.0, + "step": 11457 + }, + { + "epoch": 1.4575753720900648, + "ewc_loss": 0.04128849878907204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0644249161705375e-05, + "grad_norm": 27.826974868774414, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8591848611831665, + "num_tokens": 437169537.0, + "step": 11458 + }, + { + "epoch": 1.4577025823686554, + "ewc_loss": 0.0411812998354435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0590650819940493e-05, + "grad_norm": 27.749895095825195, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8717588186264038, + "num_tokens": 437208025.0, + "step": 11459 + }, + { + "epoch": 1.4578297926472459, + "ewc_loss": 0.04129815101623535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0649074940592982e-05, + "grad_norm": 28.078577041625977, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8777731657028198, + "num_tokens": 437246642.0, + "step": 11460 + }, + { + "epoch": 1.4579570029258364, + "ewc_loss": 0.04122910648584366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0614552340703085e-05, + "grad_norm": 27.825021743774414, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8642058372497559, + "num_tokens": 437285274.0, + "step": 11461 + }, + { + "epoch": 1.458084213204427, + "ewc_loss": 0.04111252352595329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0556261006277055e-05, + "grad_norm": 27.90770721435547, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.8840785026550293, + "num_tokens": 437325865.0, + "step": 11462 + }, + { + "epoch": 1.4582114234830175, + "ewc_loss": 0.04127555713057518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0637779016396962e-05, + "grad_norm": 27.873470306396484, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8711581230163574, + "num_tokens": 437364093.0, + "step": 11463 + }, + { + "epoch": 1.458338633761608, + "ewc_loss": 0.0411994606256485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0599731215042993e-05, + "grad_norm": 28.095169067382812, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8536139726638794, + "num_tokens": 437402413.0, + "step": 11464 + }, + { + "epoch": 1.4584658440401985, + "ewc_loss": 0.04119999706745148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0599998606485315e-05, + "grad_norm": 27.90863609313965, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8531183004379272, + "num_tokens": 437442583.0, + "step": 11465 + }, + { + "epoch": 1.458593054318789, + "ewc_loss": 0.041120562702417374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.056028097285889e-05, + "grad_norm": 27.901472091674805, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8659077286720276, + "num_tokens": 437485682.0, + "step": 11466 + }, + { + "epoch": 1.4587202645973796, + "ewc_loss": 0.041253045201301575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.062652310996782e-05, + "grad_norm": 28.0330753326416, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8565424084663391, + "num_tokens": 437524859.0, + "step": 11467 + }, + { + "epoch": 1.45884747487597, + "ewc_loss": 0.041100915521383286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0550458430079743e-05, + "grad_norm": 27.861967086791992, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8584169149398804, + "num_tokens": 437563590.0, + "step": 11468 + }, + { + "epoch": 1.4589746851545604, + "ewc_loss": 0.04114983230829239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.057491656159982e-05, + "grad_norm": 27.79534339904785, + "learning_rate": 1e-06, + "loss": 0.3918, + 
"mean_token_accuracy": 0.8819591999053955, + "num_tokens": 437601429.0, + "step": 11469 + }, + { + "epoch": 1.459101895433151, + "ewc_loss": 0.04122398793697357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.06119948416017e-05, + "grad_norm": 27.95469093322754, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8537246584892273, + "num_tokens": 437639144.0, + "step": 11470 + }, + { + "epoch": 1.4592291057117415, + "ewc_loss": 0.04115724191069603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0578621843014844e-05, + "grad_norm": 27.871898651123047, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8737194538116455, + "num_tokens": 437680846.0, + "step": 11471 + }, + { + "epoch": 1.459356315990332, + "ewc_loss": 0.041087135672569275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.054356809821911e-05, + "grad_norm": 27.796550750732422, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.867746114730835, + "num_tokens": 437722156.0, + "step": 11472 + }, + { + "epoch": 1.4594835262689225, + "ewc_loss": 0.041166357696056366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0583178411470726e-05, + "grad_norm": 27.943029403686523, + "learning_rate": 1e-06, + "loss": 0.5135, + "mean_token_accuracy": 0.8410122990608215, + "num_tokens": 437766905.0, + "step": 11473 + }, + { + "epoch": 1.459610736547513, + "ewc_loss": 0.041110191494226456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.055509503406938e-05, + "grad_norm": 27.807968139648438, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8658323287963867, + "num_tokens": 437801425.0, + "step": 11474 + }, + { + "epoch": 1.4597379468261036, + "ewc_loss": 0.04115597531199455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0577987015713006e-05, + "grad_norm": 27.81814193725586, + "learning_rate": 1e-06, + "loss": 0.4721, + "mean_token_accuracy": 0.8551567792892456, + "num_tokens": 437838126.0, + "step": 11475 + }, + { + "epoch": 
1.4598651571046941, + "ewc_loss": 0.0412038154900074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0601908545359038e-05, + "grad_norm": 27.954145431518555, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8610364198684692, + "num_tokens": 437876597.0, + "step": 11476 + }, + { + "epoch": 1.4599923673832846, + "ewc_loss": 0.041202764958143234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0601382857421413e-05, + "grad_norm": 27.8878173828125, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8775084018707275, + "num_tokens": 437911852.0, + "step": 11477 + }, + { + "epoch": 1.460119577661875, + "ewc_loss": 0.041085995733737946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0542998754535802e-05, + "grad_norm": 27.839630126953125, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8656830787658691, + "num_tokens": 437953418.0, + "step": 11478 + }, + { + "epoch": 1.4602467879404655, + "ewc_loss": 0.041166484355926514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.058324207609985e-05, + "grad_norm": 27.793689727783203, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.871587872505188, + "num_tokens": 437989561.0, + "step": 11479 + }, + { + "epoch": 1.460373998219056, + "ewc_loss": 0.041161950677633286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0580975615303032e-05, + "grad_norm": 27.941913604736328, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8742331266403198, + "num_tokens": 438033464.0, + "step": 11480 + }, + { + "epoch": 1.4605012084976465, + "ewc_loss": 0.04121654853224754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.060827500827145e-05, + "grad_norm": 27.766530990600586, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8694857358932495, + "num_tokens": 438068676.0, + "step": 11481 + }, + { + "epoch": 1.460628418776237, + "ewc_loss": 0.041176680475473404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.058834070339799e-05, + "grad_norm": 27.864158630371094, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.847590982913971, + "num_tokens": 438107867.0, + "step": 11482 + }, + { + "epoch": 1.4607556290548276, + "ewc_loss": 0.041266292333602905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.063314605038613e-05, + "grad_norm": 27.834636688232422, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8820592164993286, + "num_tokens": 438144256.0, + "step": 11483 + }, + { + "epoch": 1.4608828393334181, + "ewc_loss": 0.04115770757198334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0578852854669094e-05, + "grad_norm": 27.81368064880371, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8766728639602661, + "num_tokens": 438183458.0, + "step": 11484 + }, + { + "epoch": 1.4610100496120086, + "ewc_loss": 0.04123000428080559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.061500163108576e-05, + "grad_norm": 27.813400268554688, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8538913726806641, + "num_tokens": 438216025.0, + "step": 11485 + }, + { + "epoch": 1.4611372598905992, + "ewc_loss": 0.04127335548400879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0636678527807817e-05, + "grad_norm": 27.812332153320312, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.876512885093689, + "num_tokens": 438258185.0, + "step": 11486 + }, + { + "epoch": 1.4612644701691897, + "ewc_loss": 0.04123912379145622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.061956183752045e-05, + "grad_norm": 27.82826805114746, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8613842129707336, + "num_tokens": 438298189.0, + "step": 11487 + }, + { + "epoch": 1.4613916804477802, + "ewc_loss": 0.04122622311115265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0613111701095477e-05, + "grad_norm": 27.885509490966797, + "learning_rate": 1e-06, + "loss": 0.4632, + 
"mean_token_accuracy": 0.8552765846252441, + "num_tokens": 438336593.0, + "step": 11488 + }, + { + "epoch": 1.4615188907263708, + "ewc_loss": 0.041287872940301895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0643936295527965e-05, + "grad_norm": 27.93621253967285, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8722730875015259, + "num_tokens": 438374110.0, + "step": 11489 + }, + { + "epoch": 1.4616461010049613, + "ewc_loss": 0.04122275114059448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0611376385204494e-05, + "grad_norm": 27.794586181640625, + "learning_rate": 1e-06, + "loss": 0.4813, + "mean_token_accuracy": 0.8501230478286743, + "num_tokens": 438413062.0, + "step": 11490 + }, + { + "epoch": 1.4617733112835518, + "ewc_loss": 0.041325852274894714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0662926544900984e-05, + "grad_norm": 28.042024612426758, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8685154318809509, + "num_tokens": 438452055.0, + "step": 11491 + }, + { + "epoch": 1.4619005215621423, + "ewc_loss": 0.041254136711359024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0627068806788884e-05, + "grad_norm": 27.78270149230957, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8681422472000122, + "num_tokens": 438494224.0, + "step": 11492 + }, + { + "epoch": 1.4620277318407329, + "ewc_loss": 0.041255075484514236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0627538106055e-05, + "grad_norm": 28.021257400512695, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8628695011138916, + "num_tokens": 438529790.0, + "step": 11493 + }, + { + "epoch": 1.4621549421193232, + "ewc_loss": 0.0412675216794014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.063376086880453e-05, + "grad_norm": 27.829673767089844, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8697553277015686, + "num_tokens": 438571371.0, + "step": 11494 + }, + { + "epoch": 
1.4622821523979137, + "ewc_loss": 0.041174083948135376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0587041944963858e-05, + "grad_norm": 27.901321411132812, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.851902961730957, + "num_tokens": 438612383.0, + "step": 11495 + }, + { + "epoch": 1.4624093626765042, + "ewc_loss": 0.0412810742855072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0640536604332738e-05, + "grad_norm": 27.873178482055664, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8562456369400024, + "num_tokens": 438653443.0, + "step": 11496 + }, + { + "epoch": 1.4625365729550948, + "ewc_loss": 0.041183073073625565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.059153666778002e-05, + "grad_norm": 27.862092971801758, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8533056378364563, + "num_tokens": 438689565.0, + "step": 11497 + }, + { + "epoch": 1.4626637832336853, + "ewc_loss": 0.041266072541475296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0633036911021918e-05, + "grad_norm": 27.82713508605957, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8685187101364136, + "num_tokens": 438727753.0, + "step": 11498 + }, + { + "epoch": 1.4627909935122758, + "ewc_loss": 0.041224099695682526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0612049411283806e-05, + "grad_norm": 27.870325088500977, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.863448441028595, + "num_tokens": 438764667.0, + "step": 11499 + }, + { + "epoch": 1.4629182037908663, + "ewc_loss": 0.04128140211105347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0640700313379057e-05, + "grad_norm": 27.773630142211914, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.871699333190918, + "num_tokens": 438801739.0, + "step": 11500 + }, + { + "epoch": 1.4630454140694569, + "ewc_loss": 0.0411975122988224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0598756236722693e-05, + "grad_norm": 27.799474716186523, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.867565929889679, + "num_tokens": 438840028.0, + "step": 11501 + }, + { + "epoch": 1.4631726243480474, + "ewc_loss": 0.04128799960017204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.064399996015709e-05, + "grad_norm": 27.818492889404297, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8634277582168579, + "num_tokens": 438880632.0, + "step": 11502 + }, + { + "epoch": 1.4632998346266377, + "ewc_loss": 0.04118816554546356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0594083252944984e-05, + "grad_norm": 27.791168212890625, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8630144596099854, + "num_tokens": 438923040.0, + "step": 11503 + }, + { + "epoch": 1.4634270449052282, + "ewc_loss": 0.04129019379615784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0645096810767427e-05, + "grad_norm": 27.897825241088867, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8654001951217651, + "num_tokens": 438958597.0, + "step": 11504 + }, + { + "epoch": 1.4635542551838188, + "ewc_loss": 0.04133642092347145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0668210709118284e-05, + "grad_norm": 27.828489303588867, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8543651103973389, + "num_tokens": 438997909.0, + "step": 11505 + }, + { + "epoch": 1.4636814654624093, + "ewc_loss": 0.041242364794015884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.062118255707901e-05, + "grad_norm": 27.854602813720703, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8737263679504395, + "num_tokens": 439030084.0, + "step": 11506 + }, + { + "epoch": 1.4638086757409998, + "ewc_loss": 0.04128102958202362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0640514776459895e-05, + "grad_norm": 27.751750946044922, + "learning_rate": 1e-06, + "loss": 0.4318, + 
"mean_token_accuracy": 0.8685582876205444, + "num_tokens": 439067770.0, + "step": 11507 + }, + { + "epoch": 1.4639358860195903, + "ewc_loss": 0.04131539165973663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0657695131376386e-05, + "grad_norm": 27.88076400756836, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8675137162208557, + "num_tokens": 439102476.0, + "step": 11508 + }, + { + "epoch": 1.4640630962981809, + "ewc_loss": 0.041323404759168625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0661702365032397e-05, + "grad_norm": 27.71262550354004, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8759818077087402, + "num_tokens": 439138660.0, + "step": 11509 + }, + { + "epoch": 1.4641903065767714, + "ewc_loss": 0.04129274562001228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0646371922339313e-05, + "grad_norm": 27.79286003112793, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8738359212875366, + "num_tokens": 439181690.0, + "step": 11510 + }, + { + "epoch": 1.464317516855362, + "ewc_loss": 0.04131660982966423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0658304492826574e-05, + "grad_norm": 27.778705596923828, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8705786466598511, + "num_tokens": 439218825.0, + "step": 11511 + }, + { + "epoch": 1.4644447271339525, + "ewc_loss": 0.04127000272274017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0635001419577748e-05, + "grad_norm": 27.794395446777344, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.871479868888855, + "num_tokens": 439253505.0, + "step": 11512 + }, + { + "epoch": 1.464571937412543, + "ewc_loss": 0.04137440770864487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.068720459647011e-05, + "grad_norm": 27.772926330566406, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8603162169456482, + "num_tokens": 439292764.0, + "step": 11513 + }, + { + "epoch": 
1.4646991476911335, + "ewc_loss": 0.04132329300045967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0661645976360887e-05, + "grad_norm": 27.756999969482422, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8748990893363953, + "num_tokens": 439335269.0, + "step": 11514 + }, + { + "epoch": 1.464826357969724, + "ewc_loss": 0.04136000946164131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0680005036410876e-05, + "grad_norm": 27.85146713256836, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8669443130493164, + "num_tokens": 439371977.0, + "step": 11515 + }, + { + "epoch": 1.4649535682483146, + "ewc_loss": 0.04132849723100662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.066424895019736e-05, + "grad_norm": 27.829849243164062, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8658344149589539, + "num_tokens": 439412710.0, + "step": 11516 + }, + { + "epoch": 1.465080778526905, + "ewc_loss": 0.04133094847202301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.066547494905535e-05, + "grad_norm": 27.769702911376953, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8657349348068237, + "num_tokens": 439449637.0, + "step": 11517 + }, + { + "epoch": 1.4652079888054954, + "ewc_loss": 0.04132666811347008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0663333998527378e-05, + "grad_norm": 27.764785766601562, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8601823449134827, + "num_tokens": 439495143.0, + "step": 11518 + }, + { + "epoch": 1.465335199084086, + "ewc_loss": 0.04137255251407623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0686276911874302e-05, + "grad_norm": 27.947622299194336, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8534997701644897, + "num_tokens": 439532721.0, + "step": 11519 + }, + { + "epoch": 1.4654624093626765, + "ewc_loss": 0.04133081063628197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0665405827458017e-05, + "grad_norm": 27.80011558532715, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8756140470504761, + "num_tokens": 439567938.0, + "step": 11520 + }, + { + "epoch": 1.465589619641267, + "ewc_loss": 0.04126439243555069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.063219653791748e-05, + "grad_norm": 27.861656188964844, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8739305138587952, + "num_tokens": 439595720.0, + "step": 11521 + }, + { + "epoch": 1.4657168299198575, + "ewc_loss": 0.041330426931381226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.066521301458124e-05, + "grad_norm": 27.747249603271484, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8662916421890259, + "num_tokens": 439634091.0, + "step": 11522 + }, + { + "epoch": 1.465844040198448, + "ewc_loss": 0.041333477944135666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0666739146690816e-05, + "grad_norm": 27.908954620361328, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8551636934280396, + "num_tokens": 439671978.0, + "step": 11523 + }, + { + "epoch": 1.4659712504770386, + "ewc_loss": 0.041402097791433334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0701048924820498e-05, + "grad_norm": 27.864601135253906, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8610793352127075, + "num_tokens": 439707748.0, + "step": 11524 + }, + { + "epoch": 1.466098460755629, + "ewc_loss": 0.04127177596092224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0635887267417274e-05, + "grad_norm": 27.755271911621094, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8736516833305359, + "num_tokens": 439744060.0, + "step": 11525 + }, + { + "epoch": 1.4662256710342196, + "ewc_loss": 0.04133617877960205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0668088836828247e-05, + "grad_norm": 27.878074645996094, + "learning_rate": 1e-06, + "loss": 0.4623, + 
"mean_token_accuracy": 0.8577742576599121, + "num_tokens": 439789277.0, + "step": 11526 + }, + { + "epoch": 1.46635288131281, + "ewc_loss": 0.041375648230314255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.068782487185672e-05, + "grad_norm": 27.79608154296875, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8833322525024414, + "num_tokens": 439827524.0, + "step": 11527 + }, + { + "epoch": 1.4664800915914005, + "ewc_loss": 0.041334621608257294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.066731030936353e-05, + "grad_norm": 27.774394989013672, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8626307249069214, + "num_tokens": 439865181.0, + "step": 11528 + }, + { + "epoch": 1.466607301869991, + "ewc_loss": 0.04139544069766998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.069772017421201e-05, + "grad_norm": 27.862485885620117, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8591829538345337, + "num_tokens": 439900071.0, + "step": 11529 + }, + { + "epoch": 1.4667345121485815, + "ewc_loss": 0.04134928807616234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0674644474638626e-05, + "grad_norm": 27.726640701293945, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8654606342315674, + "num_tokens": 439940839.0, + "step": 11530 + }, + { + "epoch": 1.466861722427172, + "ewc_loss": 0.041373711079359055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.068685535050463e-05, + "grad_norm": 27.918193817138672, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8632876873016357, + "num_tokens": 439978908.0, + "step": 11531 + }, + { + "epoch": 1.4669889327057626, + "ewc_loss": 0.041426002979278564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0713001504191197e-05, + "grad_norm": 27.714134216308594, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8653521537780762, + "num_tokens": 440013448.0, + "step": 11532 + }, + { + "epoch": 
1.467116142984353, + "ewc_loss": 0.04133144021034241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.066572051262483e-05, + "grad_norm": 27.759044647216797, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8693939447402954, + "num_tokens": 440048377.0, + "step": 11533 + }, + { + "epoch": 1.4672433532629436, + "ewc_loss": 0.041444774717092514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0722387489513494e-05, + "grad_norm": 27.727291107177734, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.860985279083252, + "num_tokens": 440089806.0, + "step": 11534 + }, + { + "epoch": 1.4673705635415342, + "ewc_loss": 0.041407469660043716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0703735572169535e-05, + "grad_norm": 27.805477142333984, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8709394335746765, + "num_tokens": 440124209.0, + "step": 11535 + }, + { + "epoch": 1.4674977738201247, + "ewc_loss": 0.041502274572849274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0751136617036536e-05, + "grad_norm": 27.876310348510742, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8662571310997009, + "num_tokens": 440158349.0, + "step": 11536 + }, + { + "epoch": 1.4676249840987152, + "ewc_loss": 0.041447803378105164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0723900888697244e-05, + "grad_norm": 27.87424659729004, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8678696751594543, + "num_tokens": 440195220.0, + "step": 11537 + }, + { + "epoch": 1.4677521943773058, + "ewc_loss": 0.04135914146900177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0679570297943428e-05, + "grad_norm": 27.828176498413086, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8662668466567993, + "num_tokens": 440235158.0, + "step": 11538 + }, + { + "epoch": 1.4678794046558963, + "ewc_loss": 0.04132380336523056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0661902453866787e-05, + "grad_norm": 27.771350860595703, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8613872528076172, + "num_tokens": 440271879.0, + "step": 11539 + }, + { + "epoch": 1.4680066149344868, + "ewc_loss": 0.041384678333997726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.069233960355632e-05, + "grad_norm": 27.77294921875, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8650184869766235, + "num_tokens": 440306454.0, + "step": 11540 + }, + { + "epoch": 1.4681338252130773, + "ewc_loss": 0.04137539491057396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.068769754259847e-05, + "grad_norm": 27.876327514648438, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8542278409004211, + "num_tokens": 440341185.0, + "step": 11541 + }, + { + "epoch": 1.4682610354916679, + "ewc_loss": 0.04138325899839401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0691630197688937e-05, + "grad_norm": 27.747005462646484, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8649826049804688, + "num_tokens": 440379168.0, + "step": 11542 + }, + { + "epoch": 1.4683882457702582, + "ewc_loss": 0.04141717404127121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.070858681690879e-05, + "grad_norm": 27.909242630004883, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8683159947395325, + "num_tokens": 440417945.0, + "step": 11543 + }, + { + "epoch": 1.4685154560488487, + "ewc_loss": 0.04145585745573044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0727928131236695e-05, + "grad_norm": 27.77204704284668, + "learning_rate": 1e-06, + "loss": 0.4861, + "mean_token_accuracy": 0.8504209518432617, + "num_tokens": 440457864.0, + "step": 11544 + }, + { + "epoch": 1.4686426663274392, + "ewc_loss": 0.04140055552124977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0700277673313394e-05, + "grad_norm": 27.929590225219727, + "learning_rate": 1e-06, + "loss": 0.4589, + 
"mean_token_accuracy": 0.8611913919448853, + "num_tokens": 440491709.0, + "step": 11545 + }, + { + "epoch": 1.4687698766060298, + "ewc_loss": 0.04146356135606766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0731780750793405e-05, + "grad_norm": 27.785316467285156, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8718702793121338, + "num_tokens": 440533570.0, + "step": 11546 + }, + { + "epoch": 1.4688970868846203, + "ewc_loss": 0.04145580530166626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0727902665385045e-05, + "grad_norm": 27.906251907348633, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.871566116809845, + "num_tokens": 440568360.0, + "step": 11547 + }, + { + "epoch": 1.4690242971632108, + "ewc_loss": 0.041503045707941055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0751522242790088e-05, + "grad_norm": 27.840171813964844, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8756261467933655, + "num_tokens": 440606848.0, + "step": 11548 + }, + { + "epoch": 1.4691515074418013, + "ewc_loss": 0.041382014751434326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0691008103312925e-05, + "grad_norm": 27.926668167114258, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8793973326683044, + "num_tokens": 440638254.0, + "step": 11549 + }, + { + "epoch": 1.4692787177203919, + "ewc_loss": 0.04142914339900017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0714571292046458e-05, + "grad_norm": 27.912559509277344, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.872143030166626, + "num_tokens": 440674122.0, + "step": 11550 + }, + { + "epoch": 1.4694059279989824, + "ewc_loss": 0.04135439172387123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.06771965167718e-05, + "grad_norm": 27.83170509338379, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8811924457550049, + "num_tokens": 440718362.0, + "step": 11551 + }, + { + "epoch": 
1.4695331382775727, + "ewc_loss": 0.041373033076524734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.068651701847557e-05, + "grad_norm": 27.847623825073242, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8737499713897705, + "num_tokens": 440755971.0, + "step": 11552 + }, + { + "epoch": 1.4696603485561632, + "ewc_loss": 0.04145611450076103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0728057279484347e-05, + "grad_norm": 27.92765998840332, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8636282086372375, + "num_tokens": 440794328.0, + "step": 11553 + }, + { + "epoch": 1.4697875588347538, + "ewc_loss": 0.04131627827882767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.065813896479085e-05, + "grad_norm": 27.84311294555664, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8539218902587891, + "num_tokens": 440833383.0, + "step": 11554 + }, + { + "epoch": 1.4699147691133443, + "ewc_loss": 0.0413590744137764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0679537556134164e-05, + "grad_norm": 27.81235122680664, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8724347949028015, + "num_tokens": 440875502.0, + "step": 11555 + }, + { + "epoch": 1.4700419793919348, + "ewc_loss": 0.04130640998482704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0653205865528435e-05, + "grad_norm": 27.799419403076172, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.870116114616394, + "num_tokens": 440915259.0, + "step": 11556 + }, + { + "epoch": 1.4701691896705253, + "ewc_loss": 0.04139529913663864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.069764923362527e-05, + "grad_norm": 27.83868408203125, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.862288773059845, + "num_tokens": 440951660.0, + "step": 11557 + }, + { + "epoch": 1.4702963999491159, + "ewc_loss": 0.041361790150403976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.068089452222921e-05, + "grad_norm": 27.7742862701416, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8666964173316956, + "num_tokens": 440992533.0, + "step": 11558 + }, + { + "epoch": 1.4704236102277064, + "ewc_loss": 0.04131002351641655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0655012122006156e-05, + "grad_norm": 27.82982063293457, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.85544753074646, + "num_tokens": 441027360.0, + "step": 11559 + }, + { + "epoch": 1.470550820506297, + "ewc_loss": 0.0413832850754261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0691642930614762e-05, + "grad_norm": 27.806560516357422, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8725550770759583, + "num_tokens": 441071746.0, + "step": 11560 + }, + { + "epoch": 1.4706780307848875, + "ewc_loss": 0.04131036624312401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.065518310701009e-05, + "grad_norm": 27.849506378173828, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8655496835708618, + "num_tokens": 441105378.0, + "step": 11561 + }, + { + "epoch": 1.470805241063478, + "ewc_loss": 0.041408345103263855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0704172129626386e-05, + "grad_norm": 27.961469650268555, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8666788339614868, + "num_tokens": 441150000.0, + "step": 11562 + }, + { + "epoch": 1.4709324513420685, + "ewc_loss": 0.041349053382873535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0674526240327395e-05, + "grad_norm": 27.699453353881836, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8590587377548218, + "num_tokens": 441188426.0, + "step": 11563 + }, + { + "epoch": 1.471059661620659, + "ewc_loss": 0.041233908385038376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0616953406715766e-05, + "grad_norm": 27.898971557617188, + "learning_rate": 1e-06, + "loss": 0.4133, + 
"mean_token_accuracy": 0.8735643625259399, + "num_tokens": 441226028.0, + "step": 11564 + }, + { + "epoch": 1.4711868718992496, + "ewc_loss": 0.04137580469250679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0687903088401072e-05, + "grad_norm": 27.852405548095703, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8673507571220398, + "num_tokens": 441265788.0, + "step": 11565 + }, + { + "epoch": 1.47131408217784, + "ewc_loss": 0.04136558994650841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0682795366155915e-05, + "grad_norm": 27.914365768432617, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8749317526817322, + "num_tokens": 441297280.0, + "step": 11566 + }, + { + "epoch": 1.4714412924564304, + "ewc_loss": 0.041374411433935165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0687206415459514e-05, + "grad_norm": 27.868005752563477, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8653002381324768, + "num_tokens": 441331229.0, + "step": 11567 + }, + { + "epoch": 1.471568502735021, + "ewc_loss": 0.04128173738718033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0640869479393587e-05, + "grad_norm": 27.795249938964844, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8700816035270691, + "num_tokens": 441371064.0, + "step": 11568 + }, + { + "epoch": 1.4716957130136115, + "ewc_loss": 0.04133078083395958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0665391275542788e-05, + "grad_norm": 27.779273986816406, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8479920029640198, + "num_tokens": 441405706.0, + "step": 11569 + }, + { + "epoch": 1.471822923292202, + "ewc_loss": 0.04137076437473297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0685381969087757e-05, + "grad_norm": 27.82457733154297, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8578422665596008, + "num_tokens": 441445966.0, + "step": 11570 + }, + { + "epoch": 
1.4719501335707925, + "ewc_loss": 0.04139562323689461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0697811123682186e-05, + "grad_norm": 27.937225341796875, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8502783179283142, + "num_tokens": 441478916.0, + "step": 11571 + }, + { + "epoch": 1.472077343849383, + "ewc_loss": 0.0413716658949852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0685833078459837e-05, + "grad_norm": 27.749271392822266, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8631813526153564, + "num_tokens": 441519245.0, + "step": 11572 + }, + { + "epoch": 1.4722045541279736, + "ewc_loss": 0.04140521213412285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0702605979749933e-05, + "grad_norm": 27.86355209350586, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8582487106323242, + "num_tokens": 441557485.0, + "step": 11573 + }, + { + "epoch": 1.472331764406564, + "ewc_loss": 0.04145405441522598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.072702773148194e-05, + "grad_norm": 27.872886657714844, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.876545250415802, + "num_tokens": 441596042.0, + "step": 11574 + }, + { + "epoch": 1.4724589746851546, + "ewc_loss": 0.04134348779916763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.067174318653997e-05, + "grad_norm": 27.84416389465332, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8761723041534424, + "num_tokens": 441634391.0, + "step": 11575 + }, + { + "epoch": 1.472586184963745, + "ewc_loss": 0.041325557976961136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0662779206759296e-05, + "grad_norm": 27.794273376464844, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8580825924873352, + "num_tokens": 441672201.0, + "step": 11576 + }, + { + "epoch": 1.4727133952423355, + "ewc_loss": 0.041420694440603256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0710347598651424e-05, + "grad_norm": 27.90366554260254, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8635107278823853, + "num_tokens": 441713350.0, + "step": 11577 + }, + { + "epoch": 1.472840605520926, + "ewc_loss": 0.041417188942432404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0708594092866406e-05, + "grad_norm": 27.829578399658203, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8660156726837158, + "num_tokens": 441754677.0, + "step": 11578 + }, + { + "epoch": 1.4729678157995165, + "ewc_loss": 0.041322559118270874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0661280359490775e-05, + "grad_norm": 27.85144805908203, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8557464480400085, + "num_tokens": 441797037.0, + "step": 11579 + }, + { + "epoch": 1.473095026078107, + "ewc_loss": 0.041424237191677094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0712119294330478e-05, + "grad_norm": 27.951635360717773, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8632231950759888, + "num_tokens": 441831661.0, + "step": 11580 + }, + { + "epoch": 1.4732222363566976, + "ewc_loss": 0.041342753916978836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0671377569669858e-05, + "grad_norm": 27.66912269592285, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8689279556274414, + "num_tokens": 441876933.0, + "step": 11581 + }, + { + "epoch": 1.473349446635288, + "ewc_loss": 0.04140666127204895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0703329937532544e-05, + "grad_norm": 27.959821701049805, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8881657123565674, + "num_tokens": 441918015.0, + "step": 11582 + }, + { + "epoch": 1.4734766569138786, + "ewc_loss": 0.041457828134298325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0728914023493417e-05, + "grad_norm": 27.85981559753418, + "learning_rate": 1e-06, + "loss": 0.4451, + 
"mean_token_accuracy": 0.859502911567688, + "num_tokens": 441956573.0, + "step": 11583 + }, + { + "epoch": 1.4736038671924692, + "ewc_loss": 0.041381560266017914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.069078072963748e-05, + "grad_norm": 27.884302139282227, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8801746368408203, + "num_tokens": 442002064.0, + "step": 11584 + }, + { + "epoch": 1.4737310774710597, + "ewc_loss": 0.041392043232917786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0696021238109097e-05, + "grad_norm": 27.923667907714844, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8734027147293091, + "num_tokens": 442040122.0, + "step": 11585 + }, + { + "epoch": 1.4738582877496502, + "ewc_loss": 0.041376058012247086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0688028598669916e-05, + "grad_norm": 27.909168243408203, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8688398599624634, + "num_tokens": 442079698.0, + "step": 11586 + }, + { + "epoch": 1.4739854980282407, + "ewc_loss": 0.04135310277342796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0676550775533542e-05, + "grad_norm": 27.833345413208008, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8790608644485474, + "num_tokens": 442118252.0, + "step": 11587 + }, + { + "epoch": 1.4741127083068313, + "ewc_loss": 0.041299931704998016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.064996624540072e-05, + "grad_norm": 27.78829002380371, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8702068328857422, + "num_tokens": 442154675.0, + "step": 11588 + }, + { + "epoch": 1.4742399185854218, + "ewc_loss": 0.04133208468556404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0666042473749258e-05, + "grad_norm": 27.744239807128906, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.872946560382843, + "num_tokens": 442189449.0, + "step": 11589 + }, + { + "epoch": 
1.4743671288640123, + "ewc_loss": 0.041378285735845566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.068914363917429e-05, + "grad_norm": 28.00533103942871, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8690928220748901, + "num_tokens": 442222844.0, + "step": 11590 + }, + { + "epoch": 1.4744943391426026, + "ewc_loss": 0.041480373591184616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074018630082719e-05, + "grad_norm": 27.82680320739746, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8732963800430298, + "num_tokens": 442258599.0, + "step": 11591 + }, + { + "epoch": 1.4746215494211932, + "ewc_loss": 0.04126625135540962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0633126041502692e-05, + "grad_norm": 27.89317512512207, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8740208148956299, + "num_tokens": 442294775.0, + "step": 11592 + }, + { + "epoch": 1.4747487596997837, + "ewc_loss": 0.04142600670456886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.07130033231806e-05, + "grad_norm": 27.939029693603516, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8772259950637817, + "num_tokens": 442330313.0, + "step": 11593 + }, + { + "epoch": 1.4748759699783742, + "ewc_loss": 0.04131130129098892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.06556505872868e-05, + "grad_norm": 27.728994369506836, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8651630282402039, + "num_tokens": 442364988.0, + "step": 11594 + }, + { + "epoch": 1.4750031802569648, + "ewc_loss": 0.04134949669241905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0674748157034628e-05, + "grad_norm": 27.887250900268555, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8686235547065735, + "num_tokens": 442406927.0, + "step": 11595 + }, + { + "epoch": 1.4751303905355553, + "ewc_loss": 0.041446611285209656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0723306079162285e-05, + "grad_norm": 27.823894500732422, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8698601722717285, + "num_tokens": 442442092.0, + "step": 11596 + }, + { + "epoch": 1.4752576008141458, + "ewc_loss": 0.04144228994846344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0721145119750872e-05, + "grad_norm": 27.914161682128906, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8734211921691895, + "num_tokens": 442483013.0, + "step": 11597 + }, + { + "epoch": 1.4753848110927363, + "ewc_loss": 0.04139763116836548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0698815205832943e-05, + "grad_norm": 27.82293128967285, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8734132647514343, + "num_tokens": 442525052.0, + "step": 11598 + }, + { + "epoch": 1.4755120213713269, + "ewc_loss": 0.04139731079339981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.069865513476543e-05, + "grad_norm": 27.85157585144043, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8581178188323975, + "num_tokens": 442562543.0, + "step": 11599 + }, + { + "epoch": 1.4756392316499174, + "ewc_loss": 0.0414382740855217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0719136955449358e-05, + "grad_norm": 27.951305389404297, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.871422529220581, + "num_tokens": 442604976.0, + "step": 11600 + }, + { + "epoch": 1.4757664419285077, + "ewc_loss": 0.04138099029660225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0690495148301125e-05, + "grad_norm": 27.824443817138672, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8790538311004639, + "num_tokens": 442642271.0, + "step": 11601 + }, + { + "epoch": 1.4758936522070982, + "ewc_loss": 0.04136402904987335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0682015019701794e-05, + "grad_norm": 27.913848876953125, + "learning_rate": 1e-06, + "loss": 0.4445, + 
"mean_token_accuracy": 0.863296627998352, + "num_tokens": 442680508.0, + "step": 11602 + }, + { + "epoch": 1.4760208624856888, + "ewc_loss": 0.041420962661504745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0710482203867286e-05, + "grad_norm": 27.70836639404297, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8556398749351501, + "num_tokens": 442718391.0, + "step": 11603 + }, + { + "epoch": 1.4761480727642793, + "ewc_loss": 0.041490621864795685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074531039397698e-05, + "grad_norm": 28.022714614868164, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8767521381378174, + "num_tokens": 442757688.0, + "step": 11604 + }, + { + "epoch": 1.4762752830428698, + "ewc_loss": 0.041432347148656845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.071617382171098e-05, + "grad_norm": 27.799732208251953, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8796072006225586, + "num_tokens": 442796704.0, + "step": 11605 + }, + { + "epoch": 1.4764024933214603, + "ewc_loss": 0.04147829860448837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073914947686717e-05, + "grad_norm": 28.010684967041016, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.876786470413208, + "num_tokens": 442838791.0, + "step": 11606 + }, + { + "epoch": 1.4765297036000509, + "ewc_loss": 0.04138699173927307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0693496480816975e-05, + "grad_norm": 27.901718139648438, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8676310777664185, + "num_tokens": 442882368.0, + "step": 11607 + }, + { + "epoch": 1.4766569138786414, + "ewc_loss": 0.04141637310385704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.070818663924001e-05, + "grad_norm": 27.9420108795166, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8671389818191528, + "num_tokens": 442923768.0, + "step": 11608 + }, + { + "epoch": 
1.476784124157232, + "ewc_loss": 0.04137096181511879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.068548019451555e-05, + "grad_norm": 27.84906005859375, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8816701173782349, + "num_tokens": 442955656.0, + "step": 11609 + }, + { + "epoch": 1.4769113344358225, + "ewc_loss": 0.04140956327319145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0704781491076574e-05, + "grad_norm": 27.959609985351562, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8581450581550598, + "num_tokens": 442997323.0, + "step": 11610 + }, + { + "epoch": 1.477038544714413, + "ewc_loss": 0.04137963429093361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0689816665253602e-05, + "grad_norm": 27.821208953857422, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.852171778678894, + "num_tokens": 443033454.0, + "step": 11611 + }, + { + "epoch": 1.4771657549930035, + "ewc_loss": 0.04138459265232086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0692295947810635e-05, + "grad_norm": 27.897815704345703, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8667031526565552, + "num_tokens": 443077902.0, + "step": 11612 + }, + { + "epoch": 1.477292965271594, + "ewc_loss": 0.041391149163246155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0695573766715825e-05, + "grad_norm": 27.9481143951416, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8591314554214478, + "num_tokens": 443116515.0, + "step": 11613 + }, + { + "epoch": 1.4774201755501846, + "ewc_loss": 0.04136040061712265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0680199668277055e-05, + "grad_norm": 27.870174407958984, + "learning_rate": 1e-06, + "loss": 0.3446, + "mean_token_accuracy": 0.8919215202331543, + "num_tokens": 443154124.0, + "step": 11614 + }, + { + "epoch": 1.477547385828775, + "ewc_loss": 0.04125199094414711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0625995603040792e-05, + "grad_norm": 27.883193969726562, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8763843774795532, + "num_tokens": 443195852.0, + "step": 11615 + }, + { + "epoch": 1.4776745961073654, + "ewc_loss": 0.041353050619363785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0676525309681892e-05, + "grad_norm": 27.837833404541016, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8605086803436279, + "num_tokens": 443229850.0, + "step": 11616 + }, + { + "epoch": 1.477801806385956, + "ewc_loss": 0.04128287360072136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0641437004087493e-05, + "grad_norm": 27.885143280029297, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8586289286613464, + "num_tokens": 443274481.0, + "step": 11617 + }, + { + "epoch": 1.4779290166645465, + "ewc_loss": 0.04135710000991821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.067854984488804e-05, + "grad_norm": 27.927398681640625, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8731708526611328, + "num_tokens": 443311737.0, + "step": 11618 + }, + { + "epoch": 1.478056226943137, + "ewc_loss": 0.04133015498518944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0665076590375975e-05, + "grad_norm": 27.9760684967041, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8584360480308533, + "num_tokens": 443345310.0, + "step": 11619 + }, + { + "epoch": 1.4781834372217275, + "ewc_loss": 0.041379012167453766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0689505618065596e-05, + "grad_norm": 27.926149368286133, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8660951852798462, + "num_tokens": 443392869.0, + "step": 11620 + }, + { + "epoch": 1.478310647500318, + "ewc_loss": 0.04125198349356651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0625991965061985e-05, + "grad_norm": 27.90563201904297, + "learning_rate": 1e-06, + "loss": 0.4339, + 
"mean_token_accuracy": 0.8652569055557251, + "num_tokens": 443432842.0, + "step": 11621 + }, + { + "epoch": 1.4784378577789086, + "ewc_loss": 0.04128880798816681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0644403775804676e-05, + "grad_norm": 27.93402099609375, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8689491152763367, + "num_tokens": 443465954.0, + "step": 11622 + }, + { + "epoch": 1.478565068057499, + "ewc_loss": 0.04130520299077034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0652601961046457e-05, + "grad_norm": 27.997480392456055, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8581902384757996, + "num_tokens": 443508506.0, + "step": 11623 + }, + { + "epoch": 1.4786922783360896, + "ewc_loss": 0.0412299782037735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0614988898159936e-05, + "grad_norm": 27.861955642700195, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8652648329734802, + "num_tokens": 443543575.0, + "step": 11624 + }, + { + "epoch": 1.47881948861468, + "ewc_loss": 0.041284218430519104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0642110030166805e-05, + "grad_norm": 27.957136154174805, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.869879961013794, + "num_tokens": 443578504.0, + "step": 11625 + }, + { + "epoch": 1.4789466988932705, + "ewc_loss": 0.0413556843996048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.067784225801006e-05, + "grad_norm": 27.951311111450195, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8796355128288269, + "num_tokens": 443614647.0, + "step": 11626 + }, + { + "epoch": 1.479073909171861, + "ewc_loss": 0.04120485112071037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0602425138349645e-05, + "grad_norm": 27.83965301513672, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8612465858459473, + "num_tokens": 443649244.0, + "step": 11627 + }, + { + "epoch": 
1.4792011194504515, + "ewc_loss": 0.041374415159225464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0687208234448917e-05, + "grad_norm": 27.965532302856445, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8625774383544922, + "num_tokens": 443688374.0, + "step": 11628 + }, + { + "epoch": 1.479328329729042, + "ewc_loss": 0.04136636108160019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0683180991909467e-05, + "grad_norm": 27.99825668334961, + "learning_rate": 1e-06, + "loss": 0.5263, + "mean_token_accuracy": 0.8362643122673035, + "num_tokens": 443730929.0, + "step": 11629 + }, + { + "epoch": 1.4794555400076326, + "ewc_loss": 0.04128928855061531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0644643882405944e-05, + "grad_norm": 27.759225845336914, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8607196807861328, + "num_tokens": 443763067.0, + "step": 11630 + }, + { + "epoch": 1.479582750286223, + "ewc_loss": 0.04127984121441841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0639919966924936e-05, + "grad_norm": 27.964162826538086, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.85544353723526, + "num_tokens": 443801273.0, + "step": 11631 + }, + { + "epoch": 1.4797099605648136, + "ewc_loss": 0.04142221435904503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0711107936222106e-05, + "grad_norm": 27.91413116455078, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8539050817489624, + "num_tokens": 443840394.0, + "step": 11632 + }, + { + "epoch": 1.4798371708434042, + "ewc_loss": 0.04130641743540764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.065320950350724e-05, + "grad_norm": 27.833866119384766, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8621786832809448, + "num_tokens": 443877486.0, + "step": 11633 + }, + { + "epoch": 1.4799643811219947, + "ewc_loss": 0.041362229734659195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.068111461994704e-05, + "grad_norm": 27.79010772705078, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.866558313369751, + "num_tokens": 443913763.0, + "step": 11634 + }, + { + "epoch": 1.4800915914005852, + "ewc_loss": 0.041398786008358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.069939364446327e-05, + "grad_norm": 27.90033721923828, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8739101886749268, + "num_tokens": 443946840.0, + "step": 11635 + }, + { + "epoch": 1.4802188016791757, + "ewc_loss": 0.0414850227534771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0742510969284922e-05, + "grad_norm": 27.87484359741211, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8519406318664551, + "num_tokens": 443992439.0, + "step": 11636 + }, + { + "epoch": 1.4803460119577663, + "ewc_loss": 0.0414862260222435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0743113054777496e-05, + "grad_norm": 27.996129989624023, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8759259581565857, + "num_tokens": 444029348.0, + "step": 11637 + }, + { + "epoch": 1.4804732222363568, + "ewc_loss": 0.041388705372810364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0694353224826045e-05, + "grad_norm": 27.70098876953125, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8737455606460571, + "num_tokens": 444068386.0, + "step": 11638 + }, + { + "epoch": 1.4806004325149473, + "ewc_loss": 0.041498493403196335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0749246687046252e-05, + "grad_norm": 28.035995483398438, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8552387952804565, + "num_tokens": 444110307.0, + "step": 11639 + }, + { + "epoch": 1.4807276427935376, + "ewc_loss": 0.04153569042682648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.076784585369751e-05, + "grad_norm": 27.78767967224121, + "learning_rate": 1e-06, + "loss": 0.4281, + 
"mean_token_accuracy": 0.8665996789932251, + "num_tokens": 444153919.0, + "step": 11640 + }, + { + "epoch": 1.4808548530721282, + "ewc_loss": 0.0413861945271492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.06930981221376e-05, + "grad_norm": 27.890583038330078, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8681674003601074, + "num_tokens": 444190575.0, + "step": 11641 + }, + { + "epoch": 1.4809820633507187, + "ewc_loss": 0.04148949310183525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074474650726188e-05, + "grad_norm": 27.866539001464844, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8600517511367798, + "num_tokens": 444226108.0, + "step": 11642 + }, + { + "epoch": 1.4811092736293092, + "ewc_loss": 0.04137065261602402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0685325580416247e-05, + "grad_norm": 27.753807067871094, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8636955618858337, + "num_tokens": 444268162.0, + "step": 11643 + }, + { + "epoch": 1.4812364839078997, + "ewc_loss": 0.04153303802013397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0766519810422324e-05, + "grad_norm": 28.080764770507812, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8521981239318848, + "num_tokens": 444309273.0, + "step": 11644 + }, + { + "epoch": 1.4813636941864903, + "ewc_loss": 0.041498079895973206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0749039322254248e-05, + "grad_norm": 27.893089294433594, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8689305186271667, + "num_tokens": 444338697.0, + "step": 11645 + }, + { + "epoch": 1.4814909044650808, + "ewc_loss": 0.041404448449611664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0702223991975188e-05, + "grad_norm": 27.842437744140625, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8765551447868347, + "num_tokens": 444377954.0, + "step": 11646 + }, + { + "epoch": 
1.4816181147436713, + "ewc_loss": 0.04141823202371597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0709116142825224e-05, + "grad_norm": 27.911724090576172, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8672225475311279, + "num_tokens": 444416996.0, + "step": 11647 + }, + { + "epoch": 1.4817453250222619, + "ewc_loss": 0.041428714990615845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.071435665129684e-05, + "grad_norm": 27.840147018432617, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8720008134841919, + "num_tokens": 444455550.0, + "step": 11648 + }, + { + "epoch": 1.4818725353008524, + "ewc_loss": 0.04146553575992584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073276846203953e-05, + "grad_norm": 27.867321014404297, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8625683784484863, + "num_tokens": 444493979.0, + "step": 11649 + }, + { + "epoch": 1.4819997455794427, + "ewc_loss": 0.04142405092716217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0712024706881493e-05, + "grad_norm": 27.840557098388672, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.86471027135849, + "num_tokens": 444540622.0, + "step": 11650 + }, + { + "epoch": 1.4821269558580332, + "ewc_loss": 0.04149011895060539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074505937343929e-05, + "grad_norm": 27.94261360168457, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8633652925491333, + "num_tokens": 444577772.0, + "step": 11651 + }, + { + "epoch": 1.4822541661366238, + "ewc_loss": 0.04147106036543846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0735529687954113e-05, + "grad_norm": 27.90146255493164, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.853029727935791, + "num_tokens": 444615523.0, + "step": 11652 + }, + { + "epoch": 1.4823813764152143, + "ewc_loss": 0.04139016941189766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.069508445856627e-05, + "grad_norm": 27.95223617553711, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8648511171340942, + "num_tokens": 444659785.0, + "step": 11653 + }, + { + "epoch": 1.4825085866938048, + "ewc_loss": 0.04148035869002342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0740179024869576e-05, + "grad_norm": 27.910783767700195, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8626627922058105, + "num_tokens": 444700806.0, + "step": 11654 + }, + { + "epoch": 1.4826357969723953, + "ewc_loss": 0.04145963117480278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0729816242237575e-05, + "grad_norm": 27.9757080078125, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8677756786346436, + "num_tokens": 444734320.0, + "step": 11655 + }, + { + "epoch": 1.4827630072509859, + "ewc_loss": 0.04142656549811363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0713283447548747e-05, + "grad_norm": 27.905654907226562, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.859926700592041, + "num_tokens": 444770215.0, + "step": 11656 + }, + { + "epoch": 1.4828902175295764, + "ewc_loss": 0.04136853292584419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0684266928583384e-05, + "grad_norm": 27.882417678833008, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8652058839797974, + "num_tokens": 444805432.0, + "step": 11657 + }, + { + "epoch": 1.483017427808167, + "ewc_loss": 0.04156116768717766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.078058423649054e-05, + "grad_norm": 28.04022216796875, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8652939200401306, + "num_tokens": 444832398.0, + "step": 11658 + }, + { + "epoch": 1.4831446380867574, + "ewc_loss": 0.04146822541952133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073411269520875e-05, + "grad_norm": 27.927471160888672, + "learning_rate": 1e-06, + "loss": 0.4132, + 
"mean_token_accuracy": 0.8704150915145874, + "num_tokens": 444876128.0, + "step": 11659 + }, + { + "epoch": 1.483271848365348, + "ewc_loss": 0.041448283940553665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0724142814287916e-05, + "grad_norm": 27.977785110473633, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8617535829544067, + "num_tokens": 444915188.0, + "step": 11660 + }, + { + "epoch": 1.4833990586439385, + "ewc_loss": 0.04141228646039963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.070614391413983e-05, + "grad_norm": 27.912673950195312, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8713741302490234, + "num_tokens": 444953528.0, + "step": 11661 + }, + { + "epoch": 1.483526268922529, + "ewc_loss": 0.04143296927213669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0716484868898988e-05, + "grad_norm": 27.836822509765625, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8596314191818237, + "num_tokens": 444996469.0, + "step": 11662 + }, + { + "epoch": 1.4836534792011196, + "ewc_loss": 0.04138926416635513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0694631530204788e-05, + "grad_norm": 27.80535888671875, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.883012056350708, + "num_tokens": 445034298.0, + "step": 11663 + }, + { + "epoch": 1.48378068947971, + "ewc_loss": 0.0415043830871582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0752191630890593e-05, + "grad_norm": 27.969493865966797, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8649884462356567, + "num_tokens": 445079010.0, + "step": 11664 + }, + { + "epoch": 1.4839078997583004, + "ewc_loss": 0.041517164558172226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.075858174066525e-05, + "grad_norm": 27.77505111694336, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8793351054191589, + "num_tokens": 445116009.0, + "step": 11665 + }, + { + "epoch": 
1.484035110036891, + "ewc_loss": 0.041446503251791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.072325150948018e-05, + "grad_norm": 27.91118621826172, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8790756464004517, + "num_tokens": 445156313.0, + "step": 11666 + }, + { + "epoch": 1.4841623203154815, + "ewc_loss": 0.04155018553137779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0775092707481235e-05, + "grad_norm": 27.874094009399414, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8672138452529907, + "num_tokens": 445187424.0, + "step": 11667 + }, + { + "epoch": 1.484289530594072, + "ewc_loss": 0.041491106152534485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074555231956765e-05, + "grad_norm": 27.883737564086914, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.867508590221405, + "num_tokens": 445233448.0, + "step": 11668 + }, + { + "epoch": 1.4844167408726625, + "ewc_loss": 0.04153028130531311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0765141016454436e-05, + "grad_norm": 27.938053131103516, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8761447668075562, + "num_tokens": 445273663.0, + "step": 11669 + }, + { + "epoch": 1.484543951151253, + "ewc_loss": 0.04144890233874321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.072445204248652e-05, + "grad_norm": 27.88741683959961, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8721930980682373, + "num_tokens": 445313790.0, + "step": 11670 + }, + { + "epoch": 1.4846711614298436, + "ewc_loss": 0.04146164655685425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073082396236714e-05, + "grad_norm": 27.89348793029785, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8769601583480835, + "num_tokens": 445351818.0, + "step": 11671 + }, + { + "epoch": 1.484798371708434, + "ewc_loss": 0.04146122932434082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0730614778585732e-05, + "grad_norm": 27.884126663208008, + "learning_rate": 1e-06, + "loss": 0.5139, + "mean_token_accuracy": 0.840368390083313, + "num_tokens": 445391763.0, + "step": 11672 + }, + { + "epoch": 1.4849255819870246, + "ewc_loss": 0.04139192774891853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0695963030448183e-05, + "grad_norm": 27.734949111938477, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8601693511009216, + "num_tokens": 445426723.0, + "step": 11673 + }, + { + "epoch": 1.485052792265615, + "ewc_loss": 0.04143708944320679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.07185439649038e-05, + "grad_norm": 27.93988609313965, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8710517883300781, + "num_tokens": 445465635.0, + "step": 11674 + }, + { + "epoch": 1.4851800025442055, + "ewc_loss": 0.041495274752378464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0747636881424114e-05, + "grad_norm": 27.81884765625, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8673539161682129, + "num_tokens": 445502819.0, + "step": 11675 + }, + { + "epoch": 1.485307212822796, + "ewc_loss": 0.04149575158953667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074787516903598e-05, + "grad_norm": 27.94033432006836, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8759242296218872, + "num_tokens": 445541861.0, + "step": 11676 + }, + { + "epoch": 1.4854344231013865, + "ewc_loss": 0.04146144911646843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0730723917949945e-05, + "grad_norm": 27.873628616333008, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8817949295043945, + "num_tokens": 445578058.0, + "step": 11677 + }, + { + "epoch": 1.485561633379977, + "ewc_loss": 0.04150386154651642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0751931515405886e-05, + "grad_norm": 27.921308517456055, + "learning_rate": 1e-06, + "loss": 0.4748, + 
"mean_token_accuracy": 0.8515116572380066, + "num_tokens": 445612896.0, + "step": 11678 + }, + { + "epoch": 1.4856888436585676, + "ewc_loss": 0.04144642502069473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0723213310702704e-05, + "grad_norm": 27.79631233215332, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8597186803817749, + "num_tokens": 445647849.0, + "step": 11679 + }, + { + "epoch": 1.485816053937158, + "ewc_loss": 0.04149288311600685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0746441805385984e-05, + "grad_norm": 27.888803482055664, + "learning_rate": 1e-06, + "loss": 0.4729, + "mean_token_accuracy": 0.851733386516571, + "num_tokens": 445689199.0, + "step": 11680 + }, + { + "epoch": 1.4859432642157486, + "ewc_loss": 0.04154534637928009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.077267345157452e-05, + "grad_norm": 27.895597457885742, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8642182350158691, + "num_tokens": 445727869.0, + "step": 11681 + }, + { + "epoch": 1.4860704744943392, + "ewc_loss": 0.041440315544605255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0720157408504747e-05, + "grad_norm": 27.80839729309082, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8703206777572632, + "num_tokens": 445770105.0, + "step": 11682 + }, + { + "epoch": 1.4861976847729297, + "ewc_loss": 0.04150133579969406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.075066731777042e-05, + "grad_norm": 27.961471557617188, + "learning_rate": 1e-06, + "loss": 0.4961, + "mean_token_accuracy": 0.8455552458763123, + "num_tokens": 445812950.0, + "step": 11683 + }, + { + "epoch": 1.4863248950515202, + "ewc_loss": 0.04151557385921478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.07577868422959e-05, + "grad_norm": 27.76211929321289, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.863603949546814, + "num_tokens": 445850811.0, + "step": 11684 + }, + { + "epoch": 
1.4864521053301107, + "ewc_loss": 0.0414794385433197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073971882055048e-05, + "grad_norm": 27.89988136291504, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8704507946968079, + "num_tokens": 445891967.0, + "step": 11685 + }, + { + "epoch": 1.4865793156087013, + "ewc_loss": 0.04160783439874649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0803916413569823e-05, + "grad_norm": 27.870935440063477, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8674931526184082, + "num_tokens": 445927936.0, + "step": 11686 + }, + { + "epoch": 1.4867065258872918, + "ewc_loss": 0.04156837612390518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0784187654498965e-05, + "grad_norm": 27.8819522857666, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8782472610473633, + "num_tokens": 445962481.0, + "step": 11687 + }, + { + "epoch": 1.4868337361658823, + "ewc_loss": 0.041537582874298096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0768791728187352e-05, + "grad_norm": 27.9356689453125, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8801438808441162, + "num_tokens": 446001445.0, + "step": 11688 + }, + { + "epoch": 1.4869609464444726, + "ewc_loss": 0.0415274053812027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.076370219583623e-05, + "grad_norm": 27.927968978881836, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8849222660064697, + "num_tokens": 446037904.0, + "step": 11689 + }, + { + "epoch": 1.4870881567230632, + "ewc_loss": 0.04144783318042755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0723917259601876e-05, + "grad_norm": 27.865148544311523, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8753305673599243, + "num_tokens": 446077583.0, + "step": 11690 + }, + { + "epoch": 1.4872153670016537, + "ewc_loss": 0.04142840579152107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0714202037197538e-05, + "grad_norm": 27.86823081970215, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8645423650741577, + "num_tokens": 446110123.0, + "step": 11691 + }, + { + "epoch": 1.4873425772802442, + "ewc_loss": 0.04149891063570976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074945587082766e-05, + "grad_norm": 27.996313095092773, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8642914295196533, + "num_tokens": 446149832.0, + "step": 11692 + }, + { + "epoch": 1.4874697875588347, + "ewc_loss": 0.04147619009017944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0738094463013113e-05, + "grad_norm": 27.799880981445312, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8725907802581787, + "num_tokens": 446184211.0, + "step": 11693 + }, + { + "epoch": 1.4875969978374253, + "ewc_loss": 0.04146955907344818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0734780264319852e-05, + "grad_norm": 27.966501235961914, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8601132035255432, + "num_tokens": 446229620.0, + "step": 11694 + }, + { + "epoch": 1.4877242081160158, + "ewc_loss": 0.04154627397656441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0773137293872423e-05, + "grad_norm": 27.847684860229492, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8686308860778809, + "num_tokens": 446267134.0, + "step": 11695 + }, + { + "epoch": 1.4878514183946063, + "ewc_loss": 0.04145818203687668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.072909046546556e-05, + "grad_norm": 27.979738235473633, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8542611598968506, + "num_tokens": 446303001.0, + "step": 11696 + }, + { + "epoch": 1.4879786286731969, + "ewc_loss": 0.04149308428168297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074654184980318e-05, + "grad_norm": 27.915672302246094, + "learning_rate": 1e-06, + "loss": 0.4359, + 
"mean_token_accuracy": 0.8649888038635254, + "num_tokens": 446334286.0, + "step": 11697 + }, + { + "epoch": 1.4881058389517874, + "ewc_loss": 0.04149118438363075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074559233733453e-05, + "grad_norm": 28.00263214111328, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8848456144332886, + "num_tokens": 446369547.0, + "step": 11698 + }, + { + "epoch": 1.4882330492303777, + "ewc_loss": 0.04150320589542389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0751602278323844e-05, + "grad_norm": 27.87112045288086, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.85125732421875, + "num_tokens": 446408179.0, + "step": 11699 + }, + { + "epoch": 1.4883602595089682, + "ewc_loss": 0.0414799265563488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0739962565130554e-05, + "grad_norm": 27.989566802978516, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8682490587234497, + "num_tokens": 446448533.0, + "step": 11700 + }, + { + "epoch": 1.4884874697875587, + "ewc_loss": 0.04152606427669525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0763032807735726e-05, + "grad_norm": 28.067523956298828, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8644504547119141, + "num_tokens": 446489445.0, + "step": 11701 + }, + { + "epoch": 1.4886146800661493, + "ewc_loss": 0.04144192114472389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0720961401821114e-05, + "grad_norm": 27.89203453063965, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.879497230052948, + "num_tokens": 446526530.0, + "step": 11702 + }, + { + "epoch": 1.4887418903447398, + "ewc_loss": 0.04142800346016884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0714001948363148e-05, + "grad_norm": 28.0640926361084, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8665785193443298, + "num_tokens": 446558315.0, + "step": 11703 + }, + { + "epoch": 
1.4888691006233303, + "ewc_loss": 0.041477322578430176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073866198770702e-05, + "grad_norm": 27.977420806884766, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8697271943092346, + "num_tokens": 446598548.0, + "step": 11704 + }, + { + "epoch": 1.4889963109019209, + "ewc_loss": 0.04140277951955795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.070138907583896e-05, + "grad_norm": 28.073169708251953, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8623643517494202, + "num_tokens": 446632111.0, + "step": 11705 + }, + { + "epoch": 1.4891235211805114, + "ewc_loss": 0.04142704978585243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.071352537313942e-05, + "grad_norm": 27.914142608642578, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8746103644371033, + "num_tokens": 446669296.0, + "step": 11706 + }, + { + "epoch": 1.489250731459102, + "ewc_loss": 0.04143780097365379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0718900486826897e-05, + "grad_norm": 27.995662689208984, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8683758974075317, + "num_tokens": 446707891.0, + "step": 11707 + }, + { + "epoch": 1.4893779417376924, + "ewc_loss": 0.04146675392985344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0733377823489718e-05, + "grad_norm": 28.06196403503418, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8547611236572266, + "num_tokens": 446749455.0, + "step": 11708 + }, + { + "epoch": 1.489505152016283, + "ewc_loss": 0.04133305326104164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.06665263249306e-05, + "grad_norm": 27.966066360473633, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8788644075393677, + "num_tokens": 446787001.0, + "step": 11709 + }, + { + "epoch": 1.4896323622948735, + "ewc_loss": 0.04139327630400658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.06966378755169e-05, + "grad_norm": 27.931243896484375, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8670490384101868, + "num_tokens": 446827262.0, + "step": 11710 + }, + { + "epoch": 1.489759572573464, + "ewc_loss": 0.04140928015112877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0704639609903097e-05, + "grad_norm": 27.96786880493164, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8823651075363159, + "num_tokens": 446860514.0, + "step": 11711 + }, + { + "epoch": 1.4898867828520546, + "ewc_loss": 0.041453856974840164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0726927687064745e-05, + "grad_norm": 27.957895278930664, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8579052686691284, + "num_tokens": 446902605.0, + "step": 11712 + }, + { + "epoch": 1.490013993130645, + "ewc_loss": 0.041352368891239166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.067618515866343e-05, + "grad_norm": 27.8751220703125, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8730796575546265, + "num_tokens": 446938638.0, + "step": 11713 + }, + { + "epoch": 1.4901412034092354, + "ewc_loss": 0.04141027480363846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.070513801299967e-05, + "grad_norm": 27.945323944091797, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.871698796749115, + "num_tokens": 446979286.0, + "step": 11714 + }, + { + "epoch": 1.490268413687826, + "ewc_loss": 0.041445616632699966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0722807676065713e-05, + "grad_norm": 28.11087989807129, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8521429300308228, + "num_tokens": 447017405.0, + "step": 11715 + }, + { + "epoch": 1.4903956239664164, + "ewc_loss": 0.04135092347860336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0675461200880818e-05, + "grad_norm": 27.867015838623047, + "learning_rate": 1e-06, + "loss": 0.4299, + 
"mean_token_accuracy": 0.86990886926651, + "num_tokens": 447056733.0, + "step": 11716 + }, + { + "epoch": 1.490522834245007, + "ewc_loss": 0.041364505887031555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0682253307313658e-05, + "grad_norm": 28.018264770507812, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8620137572288513, + "num_tokens": 447094493.0, + "step": 11717 + }, + { + "epoch": 1.4906500445235975, + "ewc_loss": 0.041462335735559464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073116775136441e-05, + "grad_norm": 27.85752296447754, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8825862407684326, + "num_tokens": 447134039.0, + "step": 11718 + }, + { + "epoch": 1.490777254802188, + "ewc_loss": 0.0414721742272377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0736086298711598e-05, + "grad_norm": 28.007104873657227, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8763000965118408, + "num_tokens": 447168320.0, + "step": 11719 + }, + { + "epoch": 1.4909044650807786, + "ewc_loss": 0.04153536632657051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0767683963640593e-05, + "grad_norm": 27.969297409057617, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8512210845947266, + "num_tokens": 447205640.0, + "step": 11720 + }, + { + "epoch": 1.491031675359369, + "ewc_loss": 0.04144586622714996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0722933186334558e-05, + "grad_norm": 27.93409538269043, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8807868361473083, + "num_tokens": 447239831.0, + "step": 11721 + }, + { + "epoch": 1.4911588856379596, + "ewc_loss": 0.04149822145700455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0749110262840986e-05, + "grad_norm": 27.87620735168457, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8519906997680664, + "num_tokens": 447281100.0, + "step": 11722 + }, + { + "epoch": 
1.49128609591655, + "ewc_loss": 0.041530054062604904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0765026420122012e-05, + "grad_norm": 28.00250816345215, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8480627536773682, + "num_tokens": 447316137.0, + "step": 11723 + }, + { + "epoch": 1.4914133061951405, + "ewc_loss": 0.0415705181658268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0785259039257653e-05, + "grad_norm": 27.932790756225586, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8605395555496216, + "num_tokens": 447357922.0, + "step": 11724 + }, + { + "epoch": 1.491540516473731, + "ewc_loss": 0.04145650565624237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0728251911350526e-05, + "grad_norm": 27.910951614379883, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8632078170776367, + "num_tokens": 447394905.0, + "step": 11725 + }, + { + "epoch": 1.4916677267523215, + "ewc_loss": 0.04152141511440277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0760708139277995e-05, + "grad_norm": 27.953344345092773, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8766626119613647, + "num_tokens": 447435379.0, + "step": 11726 + }, + { + "epoch": 1.491794937030912, + "ewc_loss": 0.04156668856739998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.078334364341572e-05, + "grad_norm": 27.983911514282227, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8576653599739075, + "num_tokens": 447473058.0, + "step": 11727 + }, + { + "epoch": 1.4919221473095026, + "ewc_loss": 0.04154177010059357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0770885384990834e-05, + "grad_norm": 28.029279708862305, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8767135143280029, + "num_tokens": 447510966.0, + "step": 11728 + }, + { + "epoch": 1.492049357588093, + "ewc_loss": 0.04157061129808426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0785306332982145e-05, + "grad_norm": 27.97707176208496, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8695369958877563, + "num_tokens": 447554747.0, + "step": 11729 + }, + { + "epoch": 1.4921765678666836, + "ewc_loss": 0.04151849448680878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0759247490786947e-05, + "grad_norm": 27.8507137298584, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8691869974136353, + "num_tokens": 447594426.0, + "step": 11730 + }, + { + "epoch": 1.4923037781452742, + "ewc_loss": 0.0415889173746109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0794459487660788e-05, + "grad_norm": 28.010175704956055, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8592091798782349, + "num_tokens": 447634786.0, + "step": 11731 + }, + { + "epoch": 1.4924309884238647, + "ewc_loss": 0.04154246672987938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.077123281196691e-05, + "grad_norm": 27.922250747680664, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8648303747177124, + "num_tokens": 447668682.0, + "step": 11732 + }, + { + "epoch": 1.4925581987024552, + "ewc_loss": 0.0415625125169754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.078125544358045e-05, + "grad_norm": 27.876127243041992, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8574908971786499, + "num_tokens": 447705566.0, + "step": 11733 + }, + { + "epoch": 1.4926854089810457, + "ewc_loss": 0.04152596741914749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.076298369502183e-05, + "grad_norm": 27.861745834350586, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8642362356185913, + "num_tokens": 447746385.0, + "step": 11734 + }, + { + "epoch": 1.4928126192596363, + "ewc_loss": 0.04155774414539337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0778872567461804e-05, + "grad_norm": 27.947477340698242, + "learning_rate": 1e-06, + "loss": 0.4146, + 
"mean_token_accuracy": 0.8704161643981934, + "num_tokens": 447790483.0, + "step": 11735 + }, + { + "epoch": 1.4929398295382268, + "ewc_loss": 0.041642457246780396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0821229554712772e-05, + "grad_norm": 27.93718719482422, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8633595705032349, + "num_tokens": 447826872.0, + "step": 11736 + }, + { + "epoch": 1.4930670398168173, + "ewc_loss": 0.041557539254426956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.07787688850658e-05, + "grad_norm": 27.966693878173828, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8714228868484497, + "num_tokens": 447864071.0, + "step": 11737 + }, + { + "epoch": 1.4931942500954076, + "ewc_loss": 0.04154089838266373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0770448827533983e-05, + "grad_norm": 27.854961395263672, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8505957126617432, + "num_tokens": 447903320.0, + "step": 11738 + }, + { + "epoch": 1.4933214603739982, + "ewc_loss": 0.04154987260699272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.077493627439253e-05, + "grad_norm": 27.9457950592041, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8813086152076721, + "num_tokens": 447940253.0, + "step": 11739 + }, + { + "epoch": 1.4934486706525887, + "ewc_loss": 0.04162849113345146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.081424645439256e-05, + "grad_norm": 27.951974868774414, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.872529923915863, + "num_tokens": 447978277.0, + "step": 11740 + }, + { + "epoch": 1.4935758809311792, + "ewc_loss": 0.041478440165519714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0739220417453907e-05, + "grad_norm": 27.839828491210938, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8563113212585449, + "num_tokens": 448018798.0, + "step": 11741 + }, + { + "epoch": 
1.4937030912097697, + "ewc_loss": 0.041535165160894394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0767582100233994e-05, + "grad_norm": 27.810882568359375, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8566006422042847, + "num_tokens": 448057469.0, + "step": 11742 + }, + { + "epoch": 1.4938303014883603, + "ewc_loss": 0.04154210165143013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0771050913026556e-05, + "grad_norm": 27.928829193115234, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8528800010681152, + "num_tokens": 448089032.0, + "step": 11743 + }, + { + "epoch": 1.4939575117669508, + "ewc_loss": 0.04162153601646423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.081076854665298e-05, + "grad_norm": 27.927207946777344, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8620309233665466, + "num_tokens": 448122438.0, + "step": 11744 + }, + { + "epoch": 1.4940847220455413, + "ewc_loss": 0.041525840759277344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0762920030392706e-05, + "grad_norm": 28.008197784423828, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8653413653373718, + "num_tokens": 448166844.0, + "step": 11745 + }, + { + "epoch": 1.4942119323241319, + "ewc_loss": 0.041591525077819824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0795761884073727e-05, + "grad_norm": 27.97086524963379, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8567632436752319, + "num_tokens": 448197766.0, + "step": 11746 + }, + { + "epoch": 1.4943391426027224, + "ewc_loss": 0.041539084166288376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0769542970811017e-05, + "grad_norm": 27.893449783325195, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8533457517623901, + "num_tokens": 448240850.0, + "step": 11747 + }, + { + "epoch": 1.4944663528813127, + "ewc_loss": 0.04159628227353096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0798141122213565e-05, + "grad_norm": 28.048532485961914, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8747704029083252, + "num_tokens": 448274187.0, + "step": 11748 + }, + { + "epoch": 1.4945935631599032, + "ewc_loss": 0.041588108986616135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0794053853023797e-05, + "grad_norm": 27.864097595214844, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.871903657913208, + "num_tokens": 448311069.0, + "step": 11749 + }, + { + "epoch": 1.4947207734384937, + "ewc_loss": 0.04162292927503586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0811465219594538e-05, + "grad_norm": 28.048383712768555, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8661108016967773, + "num_tokens": 448351536.0, + "step": 11750 + }, + { + "epoch": 1.4948479837170843, + "ewc_loss": 0.04159039258956909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0795196178369224e-05, + "grad_norm": 27.84699821472168, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8634768724441528, + "num_tokens": 448385108.0, + "step": 11751 + }, + { + "epoch": 1.4949751939956748, + "ewc_loss": 0.04159421846270561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0797109755221754e-05, + "grad_norm": 28.060937881469727, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8767914175987244, + "num_tokens": 448423272.0, + "step": 11752 + }, + { + "epoch": 1.4951024042742653, + "ewc_loss": 0.041634101420640945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0817051336052828e-05, + "grad_norm": 27.80605697631836, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8712015151977539, + "num_tokens": 448462904.0, + "step": 11753 + }, + { + "epoch": 1.4952296145528559, + "ewc_loss": 0.041631173342466354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0815587049582973e-05, + "grad_norm": 27.894269943237305, + "learning_rate": 1e-06, + "loss": 0.458, + 
"mean_token_accuracy": 0.8635905981063843, + "num_tokens": 448503257.0, + "step": 11754 + }, + { + "epoch": 1.4953568248314464, + "ewc_loss": 0.04173917695879936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0869589206995443e-05, + "grad_norm": 27.963102340698242, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8715169429779053, + "num_tokens": 448535299.0, + "step": 11755 + }, + { + "epoch": 1.495484035110037, + "ewc_loss": 0.041670527309179306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.083526305796113e-05, + "grad_norm": 28.036884307861328, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8735734224319458, + "num_tokens": 448568878.0, + "step": 11756 + }, + { + "epoch": 1.4956112453886274, + "ewc_loss": 0.04168170690536499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0840852812398225e-05, + "grad_norm": 27.950946807861328, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8844355940818787, + "num_tokens": 448604959.0, + "step": 11757 + }, + { + "epoch": 1.495738455667218, + "ewc_loss": 0.041556861251592636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0778430553036742e-05, + "grad_norm": 28.01780128479004, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8780966401100159, + "num_tokens": 448638487.0, + "step": 11758 + }, + { + "epoch": 1.4958656659458085, + "ewc_loss": 0.04157628118991852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.078814031847287e-05, + "grad_norm": 27.9329891204834, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8723867535591125, + "num_tokens": 448677212.0, + "step": 11759 + }, + { + "epoch": 1.495992876224399, + "ewc_loss": 0.04158473014831543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0792365830857307e-05, + "grad_norm": 27.99650001525879, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.861342191696167, + "num_tokens": 448719654.0, + "step": 11760 + }, + { + "epoch": 
1.4961200865029896, + "ewc_loss": 0.04159080982208252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0795405362150632e-05, + "grad_norm": 27.868633270263672, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8700400590896606, + "num_tokens": 448760783.0, + "step": 11761 + }, + { + "epoch": 1.49624729678158, + "ewc_loss": 0.04157290980219841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0786454115295783e-05, + "grad_norm": 28.095111846923828, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8570520877838135, + "num_tokens": 448802843.0, + "step": 11762 + }, + { + "epoch": 1.4963745070601704, + "ewc_loss": 0.04161379113793373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.080689591821283e-05, + "grad_norm": 28.000965118408203, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8629817962646484, + "num_tokens": 448838430.0, + "step": 11763 + }, + { + "epoch": 1.496501717338761, + "ewc_loss": 0.04143959656357765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.071979906759225e-05, + "grad_norm": 27.902061462402344, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8580252528190613, + "num_tokens": 448880450.0, + "step": 11764 + }, + { + "epoch": 1.4966289276173514, + "ewc_loss": 0.041630279272794724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08151395781897e-05, + "grad_norm": 28.026826858520508, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8679715991020203, + "num_tokens": 448919352.0, + "step": 11765 + }, + { + "epoch": 1.496756137895942, + "ewc_loss": 0.04158623889088631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0793118892470375e-05, + "grad_norm": 27.959762573242188, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8420044779777527, + "num_tokens": 448965402.0, + "step": 11766 + }, + { + "epoch": 1.4968833481745325, + "ewc_loss": 0.041615694761276245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0807847249670886e-05, + "grad_norm": 28.052623748779297, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8775984644889832, + "num_tokens": 448996983.0, + "step": 11767 + }, + { + "epoch": 1.497010558453123, + "ewc_loss": 0.041589073836803436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.079453770420514e-05, + "grad_norm": 27.993478775024414, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8519506454467773, + "num_tokens": 449036555.0, + "step": 11768 + }, + { + "epoch": 1.4971377687317136, + "ewc_loss": 0.04154237359762192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0771187337231822e-05, + "grad_norm": 28.006608963012695, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8777734041213989, + "num_tokens": 449070545.0, + "step": 11769 + }, + { + "epoch": 1.497264979010304, + "ewc_loss": 0.04152081161737442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0760406187037006e-05, + "grad_norm": 27.957754135131836, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8509306311607361, + "num_tokens": 449108859.0, + "step": 11770 + }, + { + "epoch": 1.4973921892888946, + "ewc_loss": 0.041581351310014725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0790675989701413e-05, + "grad_norm": 28.13882827758789, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8641629219055176, + "num_tokens": 449145333.0, + "step": 11771 + }, + { + "epoch": 1.497519399567485, + "ewc_loss": 0.04154649376869202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0773246433236636e-05, + "grad_norm": 28.022634506225586, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.86055588722229, + "num_tokens": 449181292.0, + "step": 11772 + }, + { + "epoch": 1.4976466098460754, + "ewc_loss": 0.041486941277980804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0743471395689994e-05, + "grad_norm": 27.953115463256836, + "learning_rate": 1e-06, + "loss": 0.4023, + 
"mean_token_accuracy": 0.8693677186965942, + "num_tokens": 449213940.0, + "step": 11773 + }, + { + "epoch": 1.497773820124666, + "ewc_loss": 0.041554491966962814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0777246390935034e-05, + "grad_norm": 28.05194091796875, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8767002820968628, + "num_tokens": 449252487.0, + "step": 11774 + }, + { + "epoch": 1.4979010304032565, + "ewc_loss": 0.0415082611143589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0754130673594773e-05, + "grad_norm": 27.971715927124023, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8591386079788208, + "num_tokens": 449287268.0, + "step": 11775 + }, + { + "epoch": 1.498028240681847, + "ewc_loss": 0.041548632085323334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.077431599900592e-05, + "grad_norm": 27.978078842163086, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8632310032844543, + "num_tokens": 449326975.0, + "step": 11776 + }, + { + "epoch": 1.4981554509604376, + "ewc_loss": 0.04148920997977257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0744604626088403e-05, + "grad_norm": 27.92473030090332, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8651885986328125, + "num_tokens": 449366061.0, + "step": 11777 + }, + { + "epoch": 1.498282661239028, + "ewc_loss": 0.041488733142614365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.074436633847654e-05, + "grad_norm": 28.082927703857422, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8739026784896851, + "num_tokens": 449401763.0, + "step": 11778 + }, + { + "epoch": 1.4984098715176186, + "ewc_loss": 0.04157344624400139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0786723325727507e-05, + "grad_norm": 28.017982482910156, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8857738375663757, + "num_tokens": 449436605.0, + "step": 11779 + }, + { + "epoch": 
1.4985370817962091, + "ewc_loss": 0.041488587856292725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0744293578900397e-05, + "grad_norm": 28.040531158447266, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8612458109855652, + "num_tokens": 449474523.0, + "step": 11780 + }, + { + "epoch": 1.4986642920747997, + "ewc_loss": 0.04153813049197197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0769064576597884e-05, + "grad_norm": 28.00417137145996, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8786450028419495, + "num_tokens": 449513301.0, + "step": 11781 + }, + { + "epoch": 1.4987915023533902, + "ewc_loss": 0.04154280945658684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0771405615960248e-05, + "grad_norm": 27.959331512451172, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8635293245315552, + "num_tokens": 449547826.0, + "step": 11782 + }, + { + "epoch": 1.4989187126319807, + "ewc_loss": 0.041459642350673676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0729821699205786e-05, + "grad_norm": 27.986759185791016, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8668816089630127, + "num_tokens": 449584818.0, + "step": 11783 + }, + { + "epoch": 1.4990459229105713, + "ewc_loss": 0.041585445404052734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0792722352780402e-05, + "grad_norm": 27.89278221130371, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8482829928398132, + "num_tokens": 449625368.0, + "step": 11784 + }, + { + "epoch": 1.4991731331891618, + "ewc_loss": 0.04148110747337341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0740553736686707e-05, + "grad_norm": 28.055370330810547, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8540962934494019, + "num_tokens": 449666320.0, + "step": 11785 + }, + { + "epoch": 1.4993003434677523, + "ewc_loss": 0.041631393134593964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0815696188947186e-05, + "grad_norm": 27.914331436157227, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8578945398330688, + "num_tokens": 449705993.0, + "step": 11786 + }, + { + "epoch": 1.4994275537463426, + "ewc_loss": 0.04154709354043007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.077354656648822e-05, + "grad_norm": 28.144039154052734, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8618533611297607, + "num_tokens": 449747088.0, + "step": 11787 + }, + { + "epoch": 1.4995547640249332, + "ewc_loss": 0.04167482256889343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0837411284446716e-05, + "grad_norm": 28.09081268310547, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8684530258178711, + "num_tokens": 449789840.0, + "step": 11788 + }, + { + "epoch": 1.4996819743035237, + "ewc_loss": 0.041457585990428925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.072879215120338e-05, + "grad_norm": 28.031539916992188, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8654775619506836, + "num_tokens": 449830599.0, + "step": 11789 + }, + { + "epoch": 1.4998091845821142, + "ewc_loss": 0.041470546275377274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0735273210448213e-05, + "grad_norm": 28.032224655151367, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8706032037734985, + "num_tokens": 449864476.0, + "step": 11790 + }, + { + "epoch": 1.4999363948607047, + "ewc_loss": 0.04146483913064003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073241921607405e-05, + "grad_norm": 28.017887115478516, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8502002954483032, + "num_tokens": 449903791.0, + "step": 11791 + }, + { + "epoch": 1.5000636051392953, + "ewc_loss": 0.04153283312916756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.076641612802632e-05, + "grad_norm": 28.03972053527832, + "learning_rate": 1e-06, + "loss": 0.4188, + 
"mean_token_accuracy": 0.8666784763336182, + "num_tokens": 449945478.0, + "step": 11792 + }, + { + "epoch": 1.5001908154178858, + "ewc_loss": 0.041478049010038376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0739023966598324e-05, + "grad_norm": 27.921602249145508, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8637403249740601, + "num_tokens": 449986175.0, + "step": 11793 + }, + { + "epoch": 1.5003180256964763, + "ewc_loss": 0.041469085961580276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.073454379569739e-05, + "grad_norm": 27.999378204345703, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8631365895271301, + "num_tokens": 450025537.0, + "step": 11794 + }, + { + "epoch": 1.5004452359750666, + "ewc_loss": 0.041546352207660675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0773175492649898e-05, + "grad_norm": 27.938846588134766, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.852783203125, + "num_tokens": 450068269.0, + "step": 11795 + }, + { + "epoch": 1.5005724462536572, + "ewc_loss": 0.041506800800561905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.075340125884395e-05, + "grad_norm": 27.955995559692383, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8814728260040283, + "num_tokens": 450106234.0, + "step": 11796 + }, + { + "epoch": 1.5006996565322477, + "ewc_loss": 0.04156390205025673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0781950297532603e-05, + "grad_norm": 27.95952606201172, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8764384984970093, + "num_tokens": 450141916.0, + "step": 11797 + }, + { + "epoch": 1.5008268668108382, + "ewc_loss": 0.04149813950061798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0749070245074108e-05, + "grad_norm": 28.044694900512695, + "learning_rate": 1e-06, + "loss": 0.4625, + "mean_token_accuracy": 0.8522616624832153, + "num_tokens": 450175427.0, + "step": 11798 + }, + { + "epoch": 
1.5009540770894287, + "ewc_loss": 0.041546959429979324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.077347926388029e-05, + "grad_norm": 27.972871780395508, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8682565689086914, + "num_tokens": 450211104.0, + "step": 11799 + }, + { + "epoch": 1.5010812873680193, + "ewc_loss": 0.041544847190380096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0772424250026233e-05, + "grad_norm": 28.076923370361328, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8579044342041016, + "num_tokens": 450244844.0, + "step": 11800 + }, + { + "epoch": 1.5012084976466098, + "ewc_loss": 0.041580624878406525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0790312191820703e-05, + "grad_norm": 28.085206985473633, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8656291365623474, + "num_tokens": 450286094.0, + "step": 11801 + }, + { + "epoch": 1.5013357079252003, + "ewc_loss": 0.04155522957444191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0777615645783953e-05, + "grad_norm": 27.985206604003906, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8568254113197327, + "num_tokens": 450324938.0, + "step": 11802 + }, + { + "epoch": 1.5014629182037909, + "ewc_loss": 0.041559673845767975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0779836631845683e-05, + "grad_norm": 28.017030715942383, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8661538362503052, + "num_tokens": 450368572.0, + "step": 11803 + }, + { + "epoch": 1.5015901284823814, + "ewc_loss": 0.04158174991607666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.07908742595464e-05, + "grad_norm": 27.981426239013672, + "learning_rate": 1e-06, + "loss": 0.506, + "mean_token_accuracy": 0.8457891345024109, + "num_tokens": 450410926.0, + "step": 11804 + }, + { + "epoch": 1.501717338760972, + "ewc_loss": 0.04148516803979874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0742583728861064e-05, + "grad_norm": 27.878128051757812, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8687427639961243, + "num_tokens": 450450206.0, + "step": 11805 + }, + { + "epoch": 1.5018445490395624, + "ewc_loss": 0.041538599878549576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0769299226230942e-05, + "grad_norm": 27.939722061157227, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8461974859237671, + "num_tokens": 450488998.0, + "step": 11806 + }, + { + "epoch": 1.501971759318153, + "ewc_loss": 0.04166504368185997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0832521840929985e-05, + "grad_norm": 27.989437103271484, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8770928382873535, + "num_tokens": 450530476.0, + "step": 11807 + }, + { + "epoch": 1.5020989695967435, + "ewc_loss": 0.04158053547143936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.079026853607502e-05, + "grad_norm": 27.85423469543457, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8729463815689087, + "num_tokens": 450560655.0, + "step": 11808 + }, + { + "epoch": 1.502226179875334, + "ewc_loss": 0.04169774800539017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0848874555667862e-05, + "grad_norm": 27.968219757080078, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8681032657623291, + "num_tokens": 450601816.0, + "step": 11809 + }, + { + "epoch": 1.5023533901539246, + "ewc_loss": 0.041741982102394104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0870991647825576e-05, + "grad_norm": 27.92138671875, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8695313930511475, + "num_tokens": 450634182.0, + "step": 11810 + }, + { + "epoch": 1.502480600432515, + "ewc_loss": 0.04161987826228142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0809939087484963e-05, + "grad_norm": 27.914335250854492, + "learning_rate": 1e-06, + "loss": 0.4499, + 
"mean_token_accuracy": 0.8609975576400757, + "num_tokens": 450672705.0, + "step": 11811 + }, + { + "epoch": 1.5026078107111056, + "ewc_loss": 0.04173075780272484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0865378246526234e-05, + "grad_norm": 28.032962799072266, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8543457388877869, + "num_tokens": 450705636.0, + "step": 11812 + }, + { + "epoch": 1.502735020989696, + "ewc_loss": 0.04174318537116051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0871591914328746e-05, + "grad_norm": 27.915803909301758, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8575832843780518, + "num_tokens": 450736165.0, + "step": 11813 + }, + { + "epoch": 1.5028622312682864, + "ewc_loss": 0.04171324893832207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.085662526951637e-05, + "grad_norm": 27.911893844604492, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.868255615234375, + "num_tokens": 450776939.0, + "step": 11814 + }, + { + "epoch": 1.502989441546877, + "ewc_loss": 0.04179852828383446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.089926420012489e-05, + "grad_norm": 27.921449661254883, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8706722259521484, + "num_tokens": 450811994.0, + "step": 11815 + }, + { + "epoch": 1.5031166518254675, + "ewc_loss": 0.041794124990701675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08970632229466e-05, + "grad_norm": 27.934921264648438, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.867011547088623, + "num_tokens": 450853688.0, + "step": 11816 + }, + { + "epoch": 1.503243862104058, + "ewc_loss": 0.04174404218792915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0872021195827983e-05, + "grad_norm": 27.926803588867188, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8654000759124756, + "num_tokens": 450892100.0, + "step": 11817 + }, + { + "epoch": 
1.5033710723826486, + "ewc_loss": 0.04174158349633217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0870791558991186e-05, + "grad_norm": 27.96448516845703, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8825711011886597, + "num_tokens": 450936383.0, + "step": 11818 + }, + { + "epoch": 1.503498282661239, + "ewc_loss": 0.041798971593379974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0899486116832122e-05, + "grad_norm": 28.04531478881836, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8741752505302429, + "num_tokens": 450965525.0, + "step": 11819 + }, + { + "epoch": 1.5036254929398294, + "ewc_loss": 0.041825320571660995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0912661057082005e-05, + "grad_norm": 27.941604614257812, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8531404137611389, + "num_tokens": 451005874.0, + "step": 11820 + }, + { + "epoch": 1.50375270321842, + "ewc_loss": 0.04170941561460495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0854708054685034e-05, + "grad_norm": 28.090087890625, + "learning_rate": 1e-06, + "loss": 0.4976, + "mean_token_accuracy": 0.8468479514122009, + "num_tokens": 451040668.0, + "step": 11821 + }, + { + "epoch": 1.5038799134970104, + "ewc_loss": 0.04180455580353737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0902278265566565e-05, + "grad_norm": 28.006240844726562, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8641559481620789, + "num_tokens": 451082747.0, + "step": 11822 + }, + { + "epoch": 1.504007123775601, + "ewc_loss": 0.0416722372174263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0836117982980795e-05, + "grad_norm": 27.94040298461914, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8722464442253113, + "num_tokens": 451120344.0, + "step": 11823 + }, + { + "epoch": 1.5041343340541915, + "ewc_loss": 0.04175538197159767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0877690985798836e-05, + "grad_norm": 28.0517635345459, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8599193096160889, + "num_tokens": 451156564.0, + "step": 11824 + }, + { + "epoch": 1.504261544332782, + "ewc_loss": 0.04177797958254814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0888990547973663e-05, + "grad_norm": 28.103004455566406, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8612725734710693, + "num_tokens": 451197252.0, + "step": 11825 + }, + { + "epoch": 1.5043887546113726, + "ewc_loss": 0.041694507002830505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0847253836109303e-05, + "grad_norm": 27.930191040039062, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8550475835800171, + "num_tokens": 451236150.0, + "step": 11826 + }, + { + "epoch": 1.504515964889963, + "ewc_loss": 0.041636958718299866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.081847924273461e-05, + "grad_norm": 27.955223083496094, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8621945381164551, + "num_tokens": 451275751.0, + "step": 11827 + }, + { + "epoch": 1.5046431751685536, + "ewc_loss": 0.041653815656900406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0826908439630643e-05, + "grad_norm": 27.919557571411133, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8860666751861572, + "num_tokens": 451316778.0, + "step": 11828 + }, + { + "epoch": 1.5047703854471441, + "ewc_loss": 0.04169059172272682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0845296603511088e-05, + "grad_norm": 27.9450626373291, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8647554516792297, + "num_tokens": 451359921.0, + "step": 11829 + }, + { + "epoch": 1.5048975957257347, + "ewc_loss": 0.04170289263129234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0851446606684476e-05, + "grad_norm": 27.94462776184082, + "learning_rate": 1e-06, + "loss": 0.4086, + 
"mean_token_accuracy": 0.8715556859970093, + "num_tokens": 451395616.0, + "step": 11830 + }, + { + "epoch": 1.5050248060043252, + "ewc_loss": 0.041695233434438705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0847617633990012e-05, + "grad_norm": 27.944194793701172, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8663612604141235, + "num_tokens": 451427766.0, + "step": 11831 + }, + { + "epoch": 1.5051520162829157, + "ewc_loss": 0.041682492941617966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.084124571410939e-05, + "grad_norm": 28.0200138092041, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8746362328529358, + "num_tokens": 451463073.0, + "step": 11832 + }, + { + "epoch": 1.5052792265615063, + "ewc_loss": 0.04167376831173897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0836883777519688e-05, + "grad_norm": 27.823362350463867, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8838002681732178, + "num_tokens": 451503039.0, + "step": 11833 + }, + { + "epoch": 1.5054064368400968, + "ewc_loss": 0.041767675429582596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0883837350993417e-05, + "grad_norm": 28.056238174438477, + "learning_rate": 1e-06, + "loss": 0.5077, + "mean_token_accuracy": 0.8503276109695435, + "num_tokens": 451541599.0, + "step": 11834 + }, + { + "epoch": 1.5055336471186873, + "ewc_loss": 0.0417301282286644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.086506356135942e-05, + "grad_norm": 27.947031021118164, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8518401980400085, + "num_tokens": 451579278.0, + "step": 11835 + }, + { + "epoch": 1.5056608573972778, + "ewc_loss": 0.04174766317009926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087383109028451e-05, + "grad_norm": 27.924938201904297, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8694822788238525, + "num_tokens": 451618591.0, + "step": 11836 + }, + { + "epoch": 
1.5057880676758684, + "ewc_loss": 0.041681330651044846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.084066545648966e-05, + "grad_norm": 28.019710540771484, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8665326237678528, + "num_tokens": 451654453.0, + "step": 11837 + }, + { + "epoch": 1.5059152779544587, + "ewc_loss": 0.04171318933367729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.085659434669651e-05, + "grad_norm": 27.874055862426758, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8649100661277771, + "num_tokens": 451688302.0, + "step": 11838 + }, + { + "epoch": 1.5060424882330492, + "ewc_loss": 0.04165827855467796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.082913852063939e-05, + "grad_norm": 27.906349182128906, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8706910610198975, + "num_tokens": 451727301.0, + "step": 11839 + }, + { + "epoch": 1.5061696985116397, + "ewc_loss": 0.04184986278414726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0924931959598325e-05, + "grad_norm": 28.043169021606445, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8512156009674072, + "num_tokens": 451770471.0, + "step": 11840 + }, + { + "epoch": 1.5062969087902303, + "ewc_loss": 0.041649363934993744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0824681996600702e-05, + "grad_norm": 27.86115074157715, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8550341129302979, + "num_tokens": 451809413.0, + "step": 11841 + }, + { + "epoch": 1.5064241190688208, + "ewc_loss": 0.04175839200615883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0879195290035568e-05, + "grad_norm": 27.996816635131836, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8627622127532959, + "num_tokens": 451850177.0, + "step": 11842 + }, + { + "epoch": 1.5065513293474113, + "ewc_loss": 0.04172426089644432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.08621295314515e-05, + "grad_norm": 27.858808517456055, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8536269664764404, + "num_tokens": 451887977.0, + "step": 11843 + }, + { + "epoch": 1.5066785396260016, + "ewc_loss": 0.04174178093671799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0870889784418978e-05, + "grad_norm": 27.897245407104492, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8763432502746582, + "num_tokens": 451927403.0, + "step": 11844 + }, + { + "epoch": 1.5068057499045922, + "ewc_loss": 0.04184938222169876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0924691852997057e-05, + "grad_norm": 28.023605346679688, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8654216527938843, + "num_tokens": 451967350.0, + "step": 11845 + }, + { + "epoch": 1.5069329601831827, + "ewc_loss": 0.041794486343860626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.089724330289755e-05, + "grad_norm": 28.08907127380371, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8658905029296875, + "num_tokens": 452008664.0, + "step": 11846 + }, + { + "epoch": 1.5070601704617732, + "ewc_loss": 0.04172684997320175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0863424651906826e-05, + "grad_norm": 28.053022384643555, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8714609146118164, + "num_tokens": 452047303.0, + "step": 11847 + }, + { + "epoch": 1.5071873807403637, + "ewc_loss": 0.04167811945080757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.083905928884633e-05, + "grad_norm": 27.96215057373047, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.867286741733551, + "num_tokens": 452087359.0, + "step": 11848 + }, + { + "epoch": 1.5073145910189543, + "ewc_loss": 0.041728176176548004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.086408858303912e-05, + "grad_norm": 28.02700424194336, + "learning_rate": 1e-06, + "loss": 0.482, + 
"mean_token_accuracy": 0.8515844941139221, + "num_tokens": 452129426.0, + "step": 11849 + }, + { + "epoch": 1.5074418012975448, + "ewc_loss": 0.04172682389616966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0863411918981e-05, + "grad_norm": 27.899608612060547, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8708112239837646, + "num_tokens": 452169134.0, + "step": 11850 + }, + { + "epoch": 1.5075690115761353, + "ewc_loss": 0.04173639416694641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.086819768010173e-05, + "grad_norm": 28.088289260864258, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8565493822097778, + "num_tokens": 452209962.0, + "step": 11851 + }, + { + "epoch": 1.5076962218547258, + "ewc_loss": 0.041712142527103424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.085607047774829e-05, + "grad_norm": 27.914541244506836, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.869359016418457, + "num_tokens": 452245978.0, + "step": 11852 + }, + { + "epoch": 1.5078234321333164, + "ewc_loss": 0.04162045940756798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.081023012578953e-05, + "grad_norm": 27.935901641845703, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8753455877304077, + "num_tokens": 452281013.0, + "step": 11853 + }, + { + "epoch": 1.507950642411907, + "ewc_loss": 0.041785236448049545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0892617612844333e-05, + "grad_norm": 27.976966857910156, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8802163004875183, + "num_tokens": 452317174.0, + "step": 11854 + }, + { + "epoch": 1.5080778526904974, + "ewc_loss": 0.04169362038373947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0846810002694838e-05, + "grad_norm": 28.034914016723633, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.862200140953064, + "num_tokens": 452362974.0, + "step": 11855 + }, + { + "epoch": 
1.508205062969088, + "ewc_loss": 0.04173916578292847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0869583750027232e-05, + "grad_norm": 27.938129425048828, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8762832283973694, + "num_tokens": 452400701.0, + "step": 11856 + }, + { + "epoch": 1.5083322732476785, + "ewc_loss": 0.041680872440338135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0840436263824813e-05, + "grad_norm": 28.0953311920166, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8748186826705933, + "num_tokens": 452438541.0, + "step": 11857 + }, + { + "epoch": 1.508459483526269, + "ewc_loss": 0.04166511818766594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0832558220718056e-05, + "grad_norm": 27.896251678466797, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8600435256958008, + "num_tokens": 452476918.0, + "step": 11858 + }, + { + "epoch": 1.5085866938048595, + "ewc_loss": 0.041673026978969574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0836512703681365e-05, + "grad_norm": 28.17838478088379, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8704273700714111, + "num_tokens": 452509231.0, + "step": 11859 + }, + { + "epoch": 1.50871390408345, + "ewc_loss": 0.04172978550195694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0864892576355487e-05, + "grad_norm": 27.969898223876953, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8606566190719604, + "num_tokens": 452547079.0, + "step": 11860 + }, + { + "epoch": 1.5088411143620406, + "ewc_loss": 0.04161153361201286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.080576632579323e-05, + "grad_norm": 28.07573699951172, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8774670362472534, + "num_tokens": 452580215.0, + "step": 11861 + }, + { + "epoch": 1.508968324640631, + "ewc_loss": 0.04168621450662613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.084310654026922e-05, + "grad_norm": 27.9870548248291, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.872482419013977, + "num_tokens": 452615778.0, + "step": 11862 + }, + { + "epoch": 1.5090955349192214, + "ewc_loss": 0.04167041555047035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0835208488279022e-05, + "grad_norm": 27.845487594604492, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8679624795913696, + "num_tokens": 452652184.0, + "step": 11863 + }, + { + "epoch": 1.509222745197812, + "ewc_loss": 0.041689012199640274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0844505343120545e-05, + "grad_norm": 27.995311737060547, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8512621521949768, + "num_tokens": 452691345.0, + "step": 11864 + }, + { + "epoch": 1.5093499554764025, + "ewc_loss": 0.041789714246988297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0894856788800098e-05, + "grad_norm": 27.996379852294922, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8671519160270691, + "num_tokens": 452729982.0, + "step": 11865 + }, + { + "epoch": 1.509477165754993, + "ewc_loss": 0.0416792593896389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0839630451519042e-05, + "grad_norm": 27.889307022094727, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8644911050796509, + "num_tokens": 452771272.0, + "step": 11866 + }, + { + "epoch": 1.5096043760335836, + "ewc_loss": 0.04177007079124451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0885036065010354e-05, + "grad_norm": 27.881465911865234, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8622705936431885, + "num_tokens": 452812231.0, + "step": 11867 + }, + { + "epoch": 1.509731586312174, + "ewc_loss": 0.04178854450583458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.089427289320156e-05, + "grad_norm": 27.93801498413086, + "learning_rate": 1e-06, + "loss": 0.4926, + 
"mean_token_accuracy": 0.8543561697006226, + "num_tokens": 452858406.0, + "step": 11868 + }, + { + "epoch": 1.5098587965907644, + "ewc_loss": 0.0418156236410141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090781163133215e-05, + "grad_norm": 27.97422981262207, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.857039213180542, + "num_tokens": 452898793.0, + "step": 11869 + }, + { + "epoch": 1.509986006869355, + "ewc_loss": 0.041749365627765656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087468237732537e-05, + "grad_norm": 27.899633407592773, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8721557259559631, + "num_tokens": 452937929.0, + "step": 11870 + }, + { + "epoch": 1.5101132171479454, + "ewc_loss": 0.041777368634939194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0888684957753867e-05, + "grad_norm": 27.96108627319336, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8688104152679443, + "num_tokens": 452975436.0, + "step": 11871 + }, + { + "epoch": 1.510240427426536, + "ewc_loss": 0.04187721386551857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0938607121934183e-05, + "grad_norm": 27.960418701171875, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8664537668228149, + "num_tokens": 453016334.0, + "step": 11872 + }, + { + "epoch": 1.5103676377051265, + "ewc_loss": 0.04172626882791519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0863133613602258e-05, + "grad_norm": 27.987865447998047, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8692871332168579, + "num_tokens": 453053630.0, + "step": 11873 + }, + { + "epoch": 1.510494847983717, + "ewc_loss": 0.04191506281495094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0957531887688674e-05, + "grad_norm": 28.016250610351562, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.847449004650116, + "num_tokens": 453092341.0, + "step": 11874 + }, + { + "epoch": 
1.5106220582623076, + "ewc_loss": 0.041725628077983856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0862813471467234e-05, + "grad_norm": 27.980350494384766, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8742818236351013, + "num_tokens": 453126859.0, + "step": 11875 + }, + { + "epoch": 1.510749268540898, + "ewc_loss": 0.04184997081756592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0924984710291028e-05, + "grad_norm": 28.00533103942871, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8627564311027527, + "num_tokens": 453167327.0, + "step": 11876 + }, + { + "epoch": 1.5108764788194886, + "ewc_loss": 0.04175421595573425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08771089091897e-05, + "grad_norm": 27.972431182861328, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8705403804779053, + "num_tokens": 453205901.0, + "step": 11877 + }, + { + "epoch": 1.5110036890980791, + "ewc_loss": 0.041700150817632675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0850075088674203e-05, + "grad_norm": 27.92923355102539, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8639118075370789, + "num_tokens": 453241856.0, + "step": 11878 + }, + { + "epoch": 1.5111308993766697, + "ewc_loss": 0.04169086739420891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0845433027716354e-05, + "grad_norm": 27.934274673461914, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8730708360671997, + "num_tokens": 453282528.0, + "step": 11879 + }, + { + "epoch": 1.5112581096552602, + "ewc_loss": 0.04179361090064049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0896804926451296e-05, + "grad_norm": 28.019132614135742, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8563507795333862, + "num_tokens": 453318762.0, + "step": 11880 + }, + { + "epoch": 1.5113853199338507, + "ewc_loss": 0.041656672954559326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0828336346312426e-05, + "grad_norm": 27.993335723876953, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8813847899436951, + "num_tokens": 453352482.0, + "step": 11881 + }, + { + "epoch": 1.5115125302124413, + "ewc_loss": 0.04181744158267975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0908721126033925e-05, + "grad_norm": 28.146514892578125, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8648152351379395, + "num_tokens": 453393692.0, + "step": 11882 + }, + { + "epoch": 1.5116397404910318, + "ewc_loss": 0.04174162819981575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087081338686403e-05, + "grad_norm": 28.007949829101562, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8692618608474731, + "num_tokens": 453429217.0, + "step": 11883 + }, + { + "epoch": 1.5117669507696223, + "ewc_loss": 0.04170567914843559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0852839952567592e-05, + "grad_norm": 28.21670913696289, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8505813479423523, + "num_tokens": 453468190.0, + "step": 11884 + }, + { + "epoch": 1.5118941610482128, + "ewc_loss": 0.041756268590688705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08781348192133e-05, + "grad_norm": 27.89295196533203, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8591827154159546, + "num_tokens": 453509893.0, + "step": 11885 + }, + { + "epoch": 1.5120213713268034, + "ewc_loss": 0.041706234216690063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.085311643895693e-05, + "grad_norm": 28.192855834960938, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8690770864486694, + "num_tokens": 453546972.0, + "step": 11886 + }, + { + "epoch": 1.5121485816053937, + "ewc_loss": 0.041775915771722794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.088795736199245e-05, + "grad_norm": 28.004486083984375, + "learning_rate": 1e-06, + "loss": 0.3975, + 
"mean_token_accuracy": 0.8779381513595581, + "num_tokens": 453578480.0, + "step": 11887 + }, + { + "epoch": 1.5122757918839842, + "ewc_loss": 0.04161355271935463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0806775864912197e-05, + "grad_norm": 28.006235122680664, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8667213916778564, + "num_tokens": 453623248.0, + "step": 11888 + }, + { + "epoch": 1.5124030021625747, + "ewc_loss": 0.041760124266147614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.088006294798106e-05, + "grad_norm": 28.187299728393555, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8864029049873352, + "num_tokens": 453660088.0, + "step": 11889 + }, + { + "epoch": 1.5125302124411653, + "ewc_loss": 0.04173990339040756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0869951185886748e-05, + "grad_norm": 28.17409896850586, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8680572509765625, + "num_tokens": 453698931.0, + "step": 11890 + }, + { + "epoch": 1.5126574227197558, + "ewc_loss": 0.04169522598385811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08476121770218e-05, + "grad_norm": 27.960948944091797, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8678029775619507, + "num_tokens": 453734220.0, + "step": 11891 + }, + { + "epoch": 1.5127846329983463, + "ewc_loss": 0.04156855121254921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0784274965990335e-05, + "grad_norm": 28.088552474975586, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8647511601448059, + "num_tokens": 453774056.0, + "step": 11892 + }, + { + "epoch": 1.5129118432769366, + "ewc_loss": 0.041800159960985184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0900079107377678e-05, + "grad_norm": 28.016225814819336, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8723131418228149, + "num_tokens": 453806616.0, + "step": 11893 + }, + { + "epoch": 
1.5130390535555271, + "ewc_loss": 0.04166269674897194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0831348592764698e-05, + "grad_norm": 27.96153450012207, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8751651048660278, + "num_tokens": 453843838.0, + "step": 11894 + }, + { + "epoch": 1.5131662638341177, + "ewc_loss": 0.041760217398405075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0880108422716148e-05, + "grad_norm": 28.06927490234375, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8733181953430176, + "num_tokens": 453876888.0, + "step": 11895 + }, + { + "epoch": 1.5132934741127082, + "ewc_loss": 0.04176965728402138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.088482870021835e-05, + "grad_norm": 27.929128646850586, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8812456130981445, + "num_tokens": 453915334.0, + "step": 11896 + }, + { + "epoch": 1.5134206843912987, + "ewc_loss": 0.04182193800806999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0910969396936707e-05, + "grad_norm": 28.178321838378906, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8751652836799622, + "num_tokens": 453950590.0, + "step": 11897 + }, + { + "epoch": 1.5135478946698893, + "ewc_loss": 0.0418148934841156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090744601446204e-05, + "grad_norm": 28.027088165283203, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8696396350860596, + "num_tokens": 453992590.0, + "step": 11898 + }, + { + "epoch": 1.5136751049484798, + "ewc_loss": 0.04178335517644882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08916771953227e-05, + "grad_norm": 28.153783798217773, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8713141679763794, + "num_tokens": 454038619.0, + "step": 11899 + }, + { + "epoch": 1.5138023152270703, + "ewc_loss": 0.041744790971279144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0872395907645114e-05, + "grad_norm": 27.918413162231445, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8570947647094727, + "num_tokens": 454081564.0, + "step": 11900 + }, + { + "epoch": 1.5139295255056608, + "ewc_loss": 0.041709255427122116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0854628019151278e-05, + "grad_norm": 28.132814407348633, + "learning_rate": 1e-06, + "loss": 0.4854, + "mean_token_accuracy": 0.8546692132949829, + "num_tokens": 454125581.0, + "step": 11901 + }, + { + "epoch": 1.5140567357842514, + "ewc_loss": 0.04174463823437691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0872319510090165e-05, + "grad_norm": 27.886903762817383, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8757694959640503, + "num_tokens": 454163961.0, + "step": 11902 + }, + { + "epoch": 1.514183946062842, + "ewc_loss": 0.0417243167757988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0862158635281958e-05, + "grad_norm": 28.082317352294922, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8598103523254395, + "num_tokens": 454200410.0, + "step": 11903 + }, + { + "epoch": 1.5143111563414324, + "ewc_loss": 0.04175731539726257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087865686917212e-05, + "grad_norm": 27.922237396240234, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8524289727210999, + "num_tokens": 454237143.0, + "step": 11904 + }, + { + "epoch": 1.514438366620023, + "ewc_loss": 0.041721995919942856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0860998120042495e-05, + "grad_norm": 27.971994400024414, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8739715814590454, + "num_tokens": 454274022.0, + "step": 11905 + }, + { + "epoch": 1.5145655768986135, + "ewc_loss": 0.041708335280418396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.085416781483218e-05, + "grad_norm": 27.968732833862305, + "learning_rate": 1e-06, + "loss": 0.4461, + 
"mean_token_accuracy": 0.8609611988067627, + "num_tokens": 454307955.0, + "step": 11906 + }, + { + "epoch": 1.514692787177204, + "ewc_loss": 0.04174111783504486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0870558728347532e-05, + "grad_norm": 27.98142433166504, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8738978505134583, + "num_tokens": 454345449.0, + "step": 11907 + }, + { + "epoch": 1.5148199974557945, + "ewc_loss": 0.041749224066734314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0874611436738633e-05, + "grad_norm": 28.07381248474121, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8549575805664062, + "num_tokens": 454383096.0, + "step": 11908 + }, + { + "epoch": 1.514947207734385, + "ewc_loss": 0.0417439267039299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087196298816707e-05, + "grad_norm": 28.03834342956543, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8558290004730225, + "num_tokens": 454420142.0, + "step": 11909 + }, + { + "epoch": 1.5150744180129756, + "ewc_loss": 0.04174276068806648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0871380911557935e-05, + "grad_norm": 28.01618194580078, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8657552003860474, + "num_tokens": 454454991.0, + "step": 11910 + }, + { + "epoch": 1.515201628291566, + "ewc_loss": 0.04174848645925522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0874244000879116e-05, + "grad_norm": 28.049165725708008, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8621476292610168, + "num_tokens": 454490161.0, + "step": 11911 + }, + { + "epoch": 1.5153288385701564, + "ewc_loss": 0.041721802204847336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0860901713604107e-05, + "grad_norm": 28.05976676940918, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8576897382736206, + "num_tokens": 454534767.0, + "step": 11912 + }, + { + "epoch": 
1.515456048848747, + "ewc_loss": 0.04171481728553772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.08574092539493e-05, + "grad_norm": 27.917638778686523, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8759655952453613, + "num_tokens": 454573246.0, + "step": 11913 + }, + { + "epoch": 1.5155832591273375, + "ewc_loss": 0.041812874376773834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090643647534307e-05, + "grad_norm": 28.17294692993164, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8792946338653564, + "num_tokens": 454607626.0, + "step": 11914 + }, + { + "epoch": 1.515710469405928, + "ewc_loss": 0.041750505566596985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0875253539998084e-05, + "grad_norm": 27.938533782958984, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8592572212219238, + "num_tokens": 454639840.0, + "step": 11915 + }, + { + "epoch": 1.5158376796845185, + "ewc_loss": 0.04169424995779991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.084712468786165e-05, + "grad_norm": 28.059598922729492, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8575512170791626, + "num_tokens": 454681445.0, + "step": 11916 + }, + { + "epoch": 1.515964889963109, + "ewc_loss": 0.04184410348534584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0922052499372512e-05, + "grad_norm": 28.090530395507812, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8592352271080017, + "num_tokens": 454726191.0, + "step": 11917 + }, + { + "epoch": 1.5160921002416994, + "ewc_loss": 0.041713085025548935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0856541596003808e-05, + "grad_norm": 28.048097610473633, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8728678822517395, + "num_tokens": 454765112.0, + "step": 11918 + }, + { + "epoch": 1.51621931052029, + "ewc_loss": 0.041723109781742096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.086155473079998e-05, + "grad_norm": 28.07063102722168, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8635998964309692, + "num_tokens": 454805254.0, + "step": 11919 + }, + { + "epoch": 1.5163465207988804, + "ewc_loss": 0.04182601347565651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.091300666506868e-05, + "grad_norm": 28.16924285888672, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8596950769424438, + "num_tokens": 454841344.0, + "step": 11920 + }, + { + "epoch": 1.516473731077471, + "ewc_loss": 0.04166853427886963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0834266251767986e-05, + "grad_norm": 27.907258987426758, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8789935111999512, + "num_tokens": 454879986.0, + "step": 11921 + }, + { + "epoch": 1.5166009413560615, + "ewc_loss": 0.0418112650513649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0905632482026704e-05, + "grad_norm": 28.217695236206055, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8573494553565979, + "num_tokens": 454917731.0, + "step": 11922 + }, + { + "epoch": 1.516728151634652, + "ewc_loss": 0.04186258837580681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093129478453193e-05, + "grad_norm": 27.987953186035156, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8717025518417358, + "num_tokens": 454952837.0, + "step": 11923 + }, + { + "epoch": 1.5168553619132426, + "ewc_loss": 0.04173022881150246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.086511449306272e-05, + "grad_norm": 28.18418312072754, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8689647912979126, + "num_tokens": 454992751.0, + "step": 11924 + }, + { + "epoch": 1.516982572191833, + "ewc_loss": 0.04181176796555519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0905883502564393e-05, + "grad_norm": 28.167919158935547, + "learning_rate": 1e-06, + "loss": 0.4315, + 
"mean_token_accuracy": 0.8644552230834961, + "num_tokens": 455029829.0, + "step": 11925 + }, + { + "epoch": 1.5171097824704236, + "ewc_loss": 0.041667938232421875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0833969756495208e-05, + "grad_norm": 27.975688934326172, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8657522201538086, + "num_tokens": 455072167.0, + "step": 11926 + }, + { + "epoch": 1.5172369927490141, + "ewc_loss": 0.04174118861556053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0870595108135603e-05, + "grad_norm": 28.11493492126465, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8740168809890747, + "num_tokens": 455108433.0, + "step": 11927 + }, + { + "epoch": 1.5173642030276047, + "ewc_loss": 0.041808247566223145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0904124539811164e-05, + "grad_norm": 27.977527618408203, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8694459199905396, + "num_tokens": 455148943.0, + "step": 11928 + }, + { + "epoch": 1.5174914133061952, + "ewc_loss": 0.041621871292591095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0810935893678106e-05, + "grad_norm": 27.9978084564209, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8611888289451599, + "num_tokens": 455185099.0, + "step": 11929 + }, + { + "epoch": 1.5176186235847857, + "ewc_loss": 0.04186883941292763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093441980832722e-05, + "grad_norm": 28.081985473632812, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8806652426719666, + "num_tokens": 455225322.0, + "step": 11930 + }, + { + "epoch": 1.5177458338633762, + "ewc_loss": 0.04178127273917198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.089063673338387e-05, + "grad_norm": 28.041353225708008, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8633081912994385, + "num_tokens": 455262830.0, + "step": 11931 + }, + { + "epoch": 
1.5178730441419668, + "ewc_loss": 0.041759900748729706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087995017063804e-05, + "grad_norm": 28.015949249267578, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8729276657104492, + "num_tokens": 455300825.0, + "step": 11932 + }, + { + "epoch": 1.5180002544205573, + "ewc_loss": 0.04179045557975769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.089522786263842e-05, + "grad_norm": 28.04991340637207, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8630151748657227, + "num_tokens": 455336209.0, + "step": 11933 + }, + { + "epoch": 1.5181274646991478, + "ewc_loss": 0.04175350442528725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0876752387266606e-05, + "grad_norm": 27.927597045898438, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8649217486381531, + "num_tokens": 455374909.0, + "step": 11934 + }, + { + "epoch": 1.5182546749777384, + "ewc_loss": 0.04177086800336838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.088543442368973e-05, + "grad_norm": 28.024503707885742, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8786600828170776, + "num_tokens": 455417179.0, + "step": 11935 + }, + { + "epoch": 1.5183818852563287, + "ewc_loss": 0.04183057323098183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.091528585879132e-05, + "grad_norm": 27.983383178710938, + "learning_rate": 1e-06, + "loss": 0.492, + "mean_token_accuracy": 0.853010356426239, + "num_tokens": 455457923.0, + "step": 11936 + }, + { + "epoch": 1.5185090955349192, + "ewc_loss": 0.04176674783229828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0883373508695513e-05, + "grad_norm": 27.9672908782959, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8573223948478699, + "num_tokens": 455492932.0, + "step": 11937 + }, + { + "epoch": 1.5186363058135097, + "ewc_loss": 0.04185255989432335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0926279830746353e-05, + "grad_norm": 28.087173461914062, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8566226363182068, + "num_tokens": 455540392.0, + "step": 11938 + }, + { + "epoch": 1.5187635160921003, + "ewc_loss": 0.041753217577934265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0876608687103726e-05, + "grad_norm": 27.977176666259766, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8615930080413818, + "num_tokens": 455574461.0, + "step": 11939 + }, + { + "epoch": 1.5188907263706908, + "ewc_loss": 0.04181939363479614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0909696104354225e-05, + "grad_norm": 28.042993545532227, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8621407747268677, + "num_tokens": 455608315.0, + "step": 11940 + }, + { + "epoch": 1.5190179366492813, + "ewc_loss": 0.0418272465467453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.091362330247648e-05, + "grad_norm": 28.024131774902344, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8704769015312195, + "num_tokens": 455654927.0, + "step": 11941 + }, + { + "epoch": 1.5191451469278716, + "ewc_loss": 0.04178337752819061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.089168810925912e-05, + "grad_norm": 28.110027313232422, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8539741039276123, + "num_tokens": 455695745.0, + "step": 11942 + }, + { + "epoch": 1.5192723572064621, + "ewc_loss": 0.041815001517534256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0907500584144145e-05, + "grad_norm": 28.020057678222656, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8659095764160156, + "num_tokens": 455734652.0, + "step": 11943 + }, + { + "epoch": 1.5193995674850527, + "ewc_loss": 0.04184984415769577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0924922864651307e-05, + "grad_norm": 28.178043365478516, + "learning_rate": 1e-06, + "loss": 0.4957, + 
"mean_token_accuracy": 0.8441279530525208, + "num_tokens": 455776063.0, + "step": 11944 + }, + { + "epoch": 1.5195267777636432, + "ewc_loss": 0.04182896018028259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.091448004648555e-05, + "grad_norm": 27.97039031982422, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8613305687904358, + "num_tokens": 455820595.0, + "step": 11945 + }, + { + "epoch": 1.5196539880422337, + "ewc_loss": 0.04175426810979843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087713437504135e-05, + "grad_norm": 27.984766006469727, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8719822764396667, + "num_tokens": 455859553.0, + "step": 11946 + }, + { + "epoch": 1.5197811983208243, + "ewc_loss": 0.04185573384165764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0927867808495648e-05, + "grad_norm": 28.06051254272461, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8667301535606384, + "num_tokens": 455892676.0, + "step": 11947 + }, + { + "epoch": 1.5199084085994148, + "ewc_loss": 0.041768018156290054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.088400833599735e-05, + "grad_norm": 27.91676139831543, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8582446575164795, + "num_tokens": 455932106.0, + "step": 11948 + }, + { + "epoch": 1.5200356188780053, + "ewc_loss": 0.04180179908871651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0900899471598677e-05, + "grad_norm": 28.121662139892578, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8720995187759399, + "num_tokens": 455971555.0, + "step": 11949 + }, + { + "epoch": 1.5201628291565958, + "ewc_loss": 0.04184604063630104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.09230202017352e-05, + "grad_norm": 27.97519874572754, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8605421781539917, + "num_tokens": 456014177.0, + "step": 11950 + }, + { + "epoch": 
1.5202900394351864, + "ewc_loss": 0.041692472994327545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.084623702103272e-05, + "grad_norm": 27.982242584228516, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8710588216781616, + "num_tokens": 456048229.0, + "step": 11951 + }, + { + "epoch": 1.520417249713777, + "ewc_loss": 0.041805483400821686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090274210786447e-05, + "grad_norm": 27.94989585876465, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8844500780105591, + "num_tokens": 456085815.0, + "step": 11952 + }, + { + "epoch": 1.5205444599923674, + "ewc_loss": 0.04170863330364227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.085431697196327e-05, + "grad_norm": 27.95311164855957, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8676655292510986, + "num_tokens": 456128971.0, + "step": 11953 + }, + { + "epoch": 1.520671670270958, + "ewc_loss": 0.04188855364918709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0944276911905035e-05, + "grad_norm": 28.001516342163086, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8477732539176941, + "num_tokens": 456159705.0, + "step": 11954 + }, + { + "epoch": 1.5207988805495485, + "ewc_loss": 0.04181893542408943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090946691168938e-05, + "grad_norm": 28.022436141967773, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8573101758956909, + "num_tokens": 456198251.0, + "step": 11955 + }, + { + "epoch": 1.520926090828139, + "ewc_loss": 0.041851166635751724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0925583157804795e-05, + "grad_norm": 28.06683921813965, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.859154462814331, + "num_tokens": 456237612.0, + "step": 11956 + }, + { + "epoch": 1.5210533011067295, + "ewc_loss": 0.041870035231113434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0935018255840987e-05, + "grad_norm": 28.009136199951172, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8655067682266235, + "num_tokens": 456273665.0, + "step": 11957 + }, + { + "epoch": 1.52118051138532, + "ewc_loss": 0.041743382811546326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.087169195874594e-05, + "grad_norm": 27.999980926513672, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8675076961517334, + "num_tokens": 456305467.0, + "step": 11958 + }, + { + "epoch": 1.5213077216639106, + "ewc_loss": 0.04193497449159622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096748721669428e-05, + "grad_norm": 28.025959014892578, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8726195693016052, + "num_tokens": 456342951.0, + "step": 11959 + }, + { + "epoch": 1.521434931942501, + "ewc_loss": 0.04181928187608719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090964153467212e-05, + "grad_norm": 28.05268096923828, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8660160899162292, + "num_tokens": 456375846.0, + "step": 11960 + }, + { + "epoch": 1.5215621422210914, + "ewc_loss": 0.04191690683364868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0958454115316272e-05, + "grad_norm": 28.02121925354004, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8753247261047363, + "num_tokens": 456407993.0, + "step": 11961 + }, + { + "epoch": 1.521689352499682, + "ewc_loss": 0.04185338318347931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0926690922351554e-05, + "grad_norm": 28.024402618408203, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.8467311859130859, + "num_tokens": 456444278.0, + "step": 11962 + }, + { + "epoch": 1.5218165627782725, + "ewc_loss": 0.04180289804935455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0901448806398548e-05, + "grad_norm": 27.966720581054688, + "learning_rate": 1e-06, + "loss": 0.4067, + 
"mean_token_accuracy": 0.8743277788162231, + "num_tokens": 456483950.0, + "step": 11963 + }, + { + "epoch": 1.521943773056863, + "ewc_loss": 0.04191824793815613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0959123503416777e-05, + "grad_norm": 28.062944412231445, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8665905594825745, + "num_tokens": 456527438.0, + "step": 11964 + }, + { + "epoch": 1.5220709833354535, + "ewc_loss": 0.04191580414772034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0957901142537594e-05, + "grad_norm": 28.071266174316406, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8760749697685242, + "num_tokens": 456560572.0, + "step": 11965 + }, + { + "epoch": 1.5221981936140438, + "ewc_loss": 0.041791923344135284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.089596091536805e-05, + "grad_norm": 27.95867347717285, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8568028211593628, + "num_tokens": 456604781.0, + "step": 11966 + }, + { + "epoch": 1.5223254038926344, + "ewc_loss": 0.041893377900123596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0946688891854137e-05, + "grad_norm": 28.092376708984375, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8789833188056946, + "num_tokens": 456639415.0, + "step": 11967 + }, + { + "epoch": 1.522452614171225, + "ewc_loss": 0.04189295321702957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0946476070093922e-05, + "grad_norm": 28.02803611755371, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8781328797340393, + "num_tokens": 456671183.0, + "step": 11968 + }, + { + "epoch": 1.5225798244498154, + "ewc_loss": 0.04188083857297897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0940418835380115e-05, + "grad_norm": 28.061830520629883, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8540933728218079, + "num_tokens": 456712995.0, + "step": 11969 + }, + { + "epoch": 
1.522707034728406, + "ewc_loss": 0.04181867092847824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0909335944452323e-05, + "grad_norm": 27.920631408691406, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8602606058120728, + "num_tokens": 456750587.0, + "step": 11970 + }, + { + "epoch": 1.5228342450069965, + "ewc_loss": 0.041855525225400925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0927762307110243e-05, + "grad_norm": 28.095726013183594, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8735142946243286, + "num_tokens": 456787503.0, + "step": 11971 + }, + { + "epoch": 1.522961455285587, + "ewc_loss": 0.0419011116027832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0950556063326076e-05, + "grad_norm": 27.97612190246582, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8442122936248779, + "num_tokens": 456831645.0, + "step": 11972 + }, + { + "epoch": 1.5230886655641775, + "ewc_loss": 0.041875094175338745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0937546651111916e-05, + "grad_norm": 28.03731918334961, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8732963800430298, + "num_tokens": 456869143.0, + "step": 11973 + }, + { + "epoch": 1.523215875842768, + "ewc_loss": 0.041935231536626816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0967616364941932e-05, + "grad_norm": 28.174823760986328, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.864355206489563, + "num_tokens": 456906465.0, + "step": 11974 + }, + { + "epoch": 1.5233430861213586, + "ewc_loss": 0.04187142848968506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0935714928782545e-05, + "grad_norm": 28.02456283569336, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8653765916824341, + "num_tokens": 456942048.0, + "step": 11975 + }, + { + "epoch": 1.5234702963999491, + "ewc_loss": 0.04182367026805878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0911835235892795e-05, + "grad_norm": 28.186809539794922, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8829385042190552, + "num_tokens": 456975710.0, + "step": 11976 + }, + { + "epoch": 1.5235975066785397, + "ewc_loss": 0.041908327490091324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0954163119313307e-05, + "grad_norm": 28.021270751953125, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8542890548706055, + "num_tokens": 457022295.0, + "step": 11977 + }, + { + "epoch": 1.5237247169571302, + "ewc_loss": 0.04166419059038162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0832096197409555e-05, + "grad_norm": 28.12923812866211, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8585146069526672, + "num_tokens": 457060433.0, + "step": 11978 + }, + { + "epoch": 1.5238519272357207, + "ewc_loss": 0.041844841092824936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.092241993523203e-05, + "grad_norm": 27.981613159179688, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8719736933708191, + "num_tokens": 457101760.0, + "step": 11979 + }, + { + "epoch": 1.5239791375143112, + "ewc_loss": 0.04177086800336838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.088543442368973e-05, + "grad_norm": 28.079036712646484, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8810975551605225, + "num_tokens": 457142676.0, + "step": 11980 + }, + { + "epoch": 1.5241063477929018, + "ewc_loss": 0.041864946484565735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0932473489665426e-05, + "grad_norm": 28.033411026000977, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8735616207122803, + "num_tokens": 457179507.0, + "step": 11981 + }, + { + "epoch": 1.5242335580714923, + "ewc_loss": 0.041807323694229126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0903662516502663e-05, + "grad_norm": 27.950721740722656, + "learning_rate": 1e-06, + "loss": 0.4285, + 
"mean_token_accuracy": 0.8670681118965149, + "num_tokens": 457221563.0, + "step": 11982 + }, + { + "epoch": 1.5243607683500828, + "ewc_loss": 0.04182913899421692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0914569176966324e-05, + "grad_norm": 28.045143127441406, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8633587956428528, + "num_tokens": 457258634.0, + "step": 11983 + }, + { + "epoch": 1.5244879786286734, + "ewc_loss": 0.04185771569609642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0928857338731177e-05, + "grad_norm": 28.037641525268555, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8798117637634277, + "num_tokens": 457293097.0, + "step": 11984 + }, + { + "epoch": 1.5246151889072637, + "ewc_loss": 0.04182914271950722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0914570995955728e-05, + "grad_norm": 28.052473068237305, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.880238950252533, + "num_tokens": 457333525.0, + "step": 11985 + }, + { + "epoch": 1.5247423991858542, + "ewc_loss": 0.04188361018896103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0941804905305617e-05, + "grad_norm": 28.128952026367188, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8515288829803467, + "num_tokens": 457379772.0, + "step": 11986 + }, + { + "epoch": 1.5248696094644447, + "ewc_loss": 0.041889872401952744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.094493538606912e-05, + "grad_norm": 28.155899047851562, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8717250823974609, + "num_tokens": 457415852.0, + "step": 11987 + }, + { + "epoch": 1.5249968197430352, + "ewc_loss": 0.04173486679792404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.086743370455224e-05, + "grad_norm": 27.9957332611084, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.870836079120636, + "num_tokens": 457449771.0, + "step": 11988 + }, + { + "epoch": 
1.5251240300216258, + "ewc_loss": 0.041802339255809784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0901170501019806e-05, + "grad_norm": 28.096553802490234, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8618403673171997, + "num_tokens": 457489601.0, + "step": 11989 + }, + { + "epoch": 1.5252512403002163, + "ewc_loss": 0.041844356805086136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0922178009641357e-05, + "grad_norm": 28.062408447265625, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8807029724121094, + "num_tokens": 457530516.0, + "step": 11990 + }, + { + "epoch": 1.5253784505788066, + "ewc_loss": 0.041767608374357224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0883804609184153e-05, + "grad_norm": 28.090755462646484, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8630599975585938, + "num_tokens": 457568299.0, + "step": 11991 + }, + { + "epoch": 1.5255056608573971, + "ewc_loss": 0.041896380484104156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0948189558112063e-05, + "grad_norm": 28.07912826538086, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8649292588233948, + "num_tokens": 457600476.0, + "step": 11992 + }, + { + "epoch": 1.5256328711359877, + "ewc_loss": 0.04183925315737724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0919625967508182e-05, + "grad_norm": 28.004484176635742, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8864856362342834, + "num_tokens": 457632024.0, + "step": 11993 + }, + { + "epoch": 1.5257600814145782, + "ewc_loss": 0.04189573600888252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0947867596987635e-05, + "grad_norm": 28.01658821105957, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8724182844161987, + "num_tokens": 457670805.0, + "step": 11994 + }, + { + "epoch": 1.5258872916931687, + "ewc_loss": 0.04187828302383423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0939141904818825e-05, + "grad_norm": 28.038644790649414, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8683337569236755, + "num_tokens": 457702468.0, + "step": 11995 + }, + { + "epoch": 1.5260145019717593, + "ewc_loss": 0.041929323226213455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0964662326150574e-05, + "grad_norm": 27.987245559692383, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.87583327293396, + "num_tokens": 457740443.0, + "step": 11996 + }, + { + "epoch": 1.5261417122503498, + "ewc_loss": 0.04184524342417717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0922621843055822e-05, + "grad_norm": 28.06912612915039, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8676759004592896, + "num_tokens": 457775833.0, + "step": 11997 + }, + { + "epoch": 1.5262689225289403, + "ewc_loss": 0.041947998106479645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0973999198758975e-05, + "grad_norm": 27.971853256225586, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.87053382396698, + "num_tokens": 457815850.0, + "step": 11998 + }, + { + "epoch": 1.5263961328075308, + "ewc_loss": 0.04185834527015686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.092917202389799e-05, + "grad_norm": 28.04939842224121, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8743574619293213, + "num_tokens": 457854946.0, + "step": 11999 + }, + { + "epoch": 1.5265233430861214, + "ewc_loss": 0.0419442281126976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.09721147257369e-05, + "grad_norm": 28.036657333374023, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.871220052242279, + "num_tokens": 457890775.0, + "step": 12000 + }, + { + "epoch": 1.526650553364712, + "ewc_loss": 0.04188492149114609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0942461560480297e-05, + "grad_norm": 28.00684356689453, + "learning_rate": 1e-06, + "loss": 0.4095, + 
"mean_token_accuracy": 0.8709967136383057, + "num_tokens": 457928143.0, + "step": 12001 + }, + { + "epoch": 1.5267777636433024, + "ewc_loss": 0.04204291105270386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1021454813308083e-05, + "grad_norm": 28.127553939819336, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8730292320251465, + "num_tokens": 457965999.0, + "step": 12002 + }, + { + "epoch": 1.526904973921893, + "ewc_loss": 0.04189146310091019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0945732103427872e-05, + "grad_norm": 27.976455688476562, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8661468625068665, + "num_tokens": 458006638.0, + "step": 12003 + }, + { + "epoch": 1.5270321842004835, + "ewc_loss": 0.041875723749399185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093786133627873e-05, + "grad_norm": 27.961214065551758, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8673102259635925, + "num_tokens": 458047522.0, + "step": 12004 + }, + { + "epoch": 1.527159394479074, + "ewc_loss": 0.04197297990322113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0986490198993124e-05, + "grad_norm": 28.057992935180664, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8674076795578003, + "num_tokens": 458091219.0, + "step": 12005 + }, + { + "epoch": 1.5272866047576645, + "ewc_loss": 0.041922107338905334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096105345117394e-05, + "grad_norm": 27.9456787109375, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8552861213684082, + "num_tokens": 458129208.0, + "step": 12006 + }, + { + "epoch": 1.527413815036255, + "ewc_loss": 0.041955895721912384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0977948224754073e-05, + "grad_norm": 28.04791259765625, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8629347085952759, + "num_tokens": 458163632.0, + "step": 12007 + }, + { + "epoch": 
1.5275410253148456, + "ewc_loss": 0.041933804750442505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096690150210634e-05, + "grad_norm": 27.8933162689209, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.86122727394104, + "num_tokens": 458202377.0, + "step": 12008 + }, + { + "epoch": 1.527668235593436, + "ewc_loss": 0.04194227233529091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0971136109437793e-05, + "grad_norm": 28.087617874145508, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8669050931930542, + "num_tokens": 458239901.0, + "step": 12009 + }, + { + "epoch": 1.5277954458720264, + "ewc_loss": 0.04211224615573883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1056122932350263e-05, + "grad_norm": 28.119178771972656, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8615248203277588, + "num_tokens": 458280291.0, + "step": 12010 + }, + { + "epoch": 1.527922656150617, + "ewc_loss": 0.041927047073841095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0963523638783954e-05, + "grad_norm": 28.01923370361328, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.87274569272995, + "num_tokens": 458316931.0, + "step": 12011 + }, + { + "epoch": 1.5280498664292075, + "ewc_loss": 0.04196460172533989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0982301066396758e-05, + "grad_norm": 28.028207778930664, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8607208728790283, + "num_tokens": 458356477.0, + "step": 12012 + }, + { + "epoch": 1.528177076707798, + "ewc_loss": 0.04189673811197281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0948369638063014e-05, + "grad_norm": 28.0205135345459, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8525498509407043, + "num_tokens": 458401340.0, + "step": 12013 + }, + { + "epoch": 1.5283042869863885, + "ewc_loss": 0.04193233698606491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.096616844937671e-05, + "grad_norm": 28.01767921447754, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8730934858322144, + "num_tokens": 458436190.0, + "step": 12014 + }, + { + "epoch": 1.5284314972649788, + "ewc_loss": 0.0418802872300148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.094014416798018e-05, + "grad_norm": 27.974294662475586, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.864761471748352, + "num_tokens": 458465513.0, + "step": 12015 + }, + { + "epoch": 1.5285587075435694, + "ewc_loss": 0.04196317121386528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0981586203561164e-05, + "grad_norm": 28.087560653686523, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.863325834274292, + "num_tokens": 458505465.0, + "step": 12016 + }, + { + "epoch": 1.52868591782216, + "ewc_loss": 0.04192314296960831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096157186315395e-05, + "grad_norm": 27.97201156616211, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8644121289253235, + "num_tokens": 458545087.0, + "step": 12017 + }, + { + "epoch": 1.5288131281007504, + "ewc_loss": 0.0419243648648262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096218304359354e-05, + "grad_norm": 28.018280029296875, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8743005990982056, + "num_tokens": 458583274.0, + "step": 12018 + }, + { + "epoch": 1.528940338379341, + "ewc_loss": 0.04196174815297127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0980873159714974e-05, + "grad_norm": 28.111003875732422, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8611744046211243, + "num_tokens": 458626026.0, + "step": 12019 + }, + { + "epoch": 1.5290675486579315, + "ewc_loss": 0.04199768975377083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0998844775022008e-05, + "grad_norm": 28.052589416503906, + "learning_rate": 1e-06, + "loss": 0.3915, + 
"mean_token_accuracy": 0.8785502910614014, + "num_tokens": 458668774.0, + "step": 12020 + }, + { + "epoch": 1.529194758936522, + "ewc_loss": 0.04191229119896889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0956145817763172e-05, + "grad_norm": 28.13962745666504, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8700423240661621, + "num_tokens": 458702256.0, + "step": 12021 + }, + { + "epoch": 1.5293219692151125, + "ewc_loss": 0.04195629060268402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0978144675609656e-05, + "grad_norm": 28.007585525512695, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8398233652114868, + "num_tokens": 458741429.0, + "step": 12022 + }, + { + "epoch": 1.529449179493703, + "ewc_loss": 0.041831739246845245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.091586975438986e-05, + "grad_norm": 28.07033348083496, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.871099591255188, + "num_tokens": 458775608.0, + "step": 12023 + }, + { + "epoch": 1.5295763897722936, + "ewc_loss": 0.0419197641313076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0959882021998055e-05, + "grad_norm": 27.994129180908203, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8707389831542969, + "num_tokens": 458811451.0, + "step": 12024 + }, + { + "epoch": 1.5297036000508841, + "ewc_loss": 0.04198681563138962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.099340781569481e-05, + "grad_norm": 28.089492797851562, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8708957433700562, + "num_tokens": 458851761.0, + "step": 12025 + }, + { + "epoch": 1.5298308103294747, + "ewc_loss": 0.04196855053305626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0984274669899605e-05, + "grad_norm": 28.103750228881836, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8609331250190735, + "num_tokens": 458888948.0, + "step": 12026 + }, + { + "epoch": 
1.5299580206080652, + "ewc_loss": 0.04188181459903717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.094090814352967e-05, + "grad_norm": 28.010204315185547, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8652186393737793, + "num_tokens": 458922466.0, + "step": 12027 + }, + { + "epoch": 1.5300852308866557, + "ewc_loss": 0.04193020984530449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0965104340575635e-05, + "grad_norm": 28.113256454467773, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8621290922164917, + "num_tokens": 458956947.0, + "step": 12028 + }, + { + "epoch": 1.5302124411652462, + "ewc_loss": 0.04191858693957329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0959292669431306e-05, + "grad_norm": 27.98213005065918, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.884566068649292, + "num_tokens": 458993634.0, + "step": 12029 + }, + { + "epoch": 1.5303396514438368, + "ewc_loss": 0.041924115270376205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0962057533324696e-05, + "grad_norm": 28.138477325439453, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8765013217926025, + "num_tokens": 459034314.0, + "step": 12030 + }, + { + "epoch": 1.5304668617224273, + "ewc_loss": 0.04189969599246979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0949848476448096e-05, + "grad_norm": 27.933006286621094, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8719939589500427, + "num_tokens": 459072117.0, + "step": 12031 + }, + { + "epoch": 1.5305940720010178, + "ewc_loss": 0.04192667081952095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096333628287539e-05, + "grad_norm": 28.16332244873047, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8614940047264099, + "num_tokens": 459112382.0, + "step": 12032 + }, + { + "epoch": 1.5307212822796084, + "ewc_loss": 0.04193463176488876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0967316231690347e-05, + "grad_norm": 28.034591674804688, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.879749059677124, + "num_tokens": 459148973.0, + "step": 12033 + }, + { + "epoch": 1.5308484925581987, + "ewc_loss": 0.04185429587960243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0927147488691844e-05, + "grad_norm": 28.01493263244629, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8639782071113586, + "num_tokens": 459183242.0, + "step": 12034 + }, + { + "epoch": 1.5309757028367892, + "ewc_loss": 0.04206747189164162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1033736629760824e-05, + "grad_norm": 28.172103881835938, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8849838972091675, + "num_tokens": 459219606.0, + "step": 12035 + }, + { + "epoch": 1.5311029131153797, + "ewc_loss": 0.04199963063001633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.09998161153635e-05, + "grad_norm": 28.064485549926758, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8850895166397095, + "num_tokens": 459253763.0, + "step": 12036 + }, + { + "epoch": 1.5312301233939702, + "ewc_loss": 0.04196577146649361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0982884961995296e-05, + "grad_norm": 28.167558670043945, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8700548410415649, + "num_tokens": 459291187.0, + "step": 12037 + }, + { + "epoch": 1.5313573336725608, + "ewc_loss": 0.04191138595342636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.095569288940169e-05, + "grad_norm": 28.034109115600586, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8641332387924194, + "num_tokens": 459327423.0, + "step": 12038 + }, + { + "epoch": 1.5314845439511513, + "ewc_loss": 0.04184778779745102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.09238933166489e-05, + "grad_norm": 28.060792922973633, + "learning_rate": 1e-06, + "loss": 0.4344, + 
"mean_token_accuracy": 0.8629418611526489, + "num_tokens": 459366207.0, + "step": 12039 + }, + { + "epoch": 1.5316117542297416, + "ewc_loss": 0.04191775247454643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0958876120857894e-05, + "grad_norm": 28.171689987182617, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8600922226905823, + "num_tokens": 459408338.0, + "step": 12040 + }, + { + "epoch": 1.5317389645083321, + "ewc_loss": 0.0419146791100502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0957339074811898e-05, + "grad_norm": 28.102014541625977, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.877381443977356, + "num_tokens": 459443420.0, + "step": 12041 + }, + { + "epoch": 1.5318661747869227, + "ewc_loss": 0.041876111179590225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093805596814491e-05, + "grad_norm": 28.019445419311523, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8516719341278076, + "num_tokens": 459475835.0, + "step": 12042 + }, + { + "epoch": 1.5319933850655132, + "ewc_loss": 0.041986219584941864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0993109501432627e-05, + "grad_norm": 28.235044479370117, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8656928539276123, + "num_tokens": 459511993.0, + "step": 12043 + }, + { + "epoch": 1.5321205953441037, + "ewc_loss": 0.041871629655361176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093581497319974e-05, + "grad_norm": 28.07961082458496, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8750725388526917, + "num_tokens": 459546221.0, + "step": 12044 + }, + { + "epoch": 1.5322478056226942, + "ewc_loss": 0.041891612112522125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0945806681993417e-05, + "grad_norm": 28.147140502929688, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8709299564361572, + "num_tokens": 459587271.0, + "step": 12045 + }, + { + "epoch": 
1.5323750159012848, + "ewc_loss": 0.04191672429442406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.095836134685669e-05, + "grad_norm": 28.13288116455078, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8562146425247192, + "num_tokens": 459623388.0, + "step": 12046 + }, + { + "epoch": 1.5325022261798753, + "ewc_loss": 0.04189645126461983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0948225937900133e-05, + "grad_norm": 28.176305770874023, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8670154809951782, + "num_tokens": 459666440.0, + "step": 12047 + }, + { + "epoch": 1.5326294364584658, + "ewc_loss": 0.04194744676351547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0973722712369636e-05, + "grad_norm": 28.036460876464844, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8532860279083252, + "num_tokens": 459708373.0, + "step": 12048 + }, + { + "epoch": 1.5327566467370564, + "ewc_loss": 0.04194251820445061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0971259800717235e-05, + "grad_norm": 28.287246704101562, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8777540922164917, + "num_tokens": 459746707.0, + "step": 12049 + }, + { + "epoch": 1.532883857015647, + "ewc_loss": 0.04190593957901001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.095296986226458e-05, + "grad_norm": 28.054834365844727, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8724379539489746, + "num_tokens": 459792493.0, + "step": 12050 + }, + { + "epoch": 1.5330110672942374, + "ewc_loss": 0.041854556649923325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.09272784559289e-05, + "grad_norm": 28.044118881225586, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8755221962928772, + "num_tokens": 459828897.0, + "step": 12051 + }, + { + "epoch": 1.533138277572828, + "ewc_loss": 0.04197997599840164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0989988115616143e-05, + "grad_norm": 28.204776763916016, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8585229516029358, + "num_tokens": 459866274.0, + "step": 12052 + }, + { + "epoch": 1.5332654878514185, + "ewc_loss": 0.04186471179127693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0932355255354196e-05, + "grad_norm": 27.93262481689453, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.882971465587616, + "num_tokens": 459902442.0, + "step": 12053 + }, + { + "epoch": 1.533392698130009, + "ewc_loss": 0.041883792728185654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0941895854775794e-05, + "grad_norm": 28.112592697143555, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.87534499168396, + "num_tokens": 459943331.0, + "step": 12054 + }, + { + "epoch": 1.5335199084085995, + "ewc_loss": 0.04194217175245285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0971085177734494e-05, + "grad_norm": 27.989154815673828, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8728296160697937, + "num_tokens": 459975349.0, + "step": 12055 + }, + { + "epoch": 1.53364711868719, + "ewc_loss": 0.04184022918343544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0920115275657736e-05, + "grad_norm": 28.016742706298828, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8803671002388, + "num_tokens": 460008295.0, + "step": 12056 + }, + { + "epoch": 1.5337743289657806, + "ewc_loss": 0.04192971810698509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0964858777006157e-05, + "grad_norm": 27.95978546142578, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8715230226516724, + "num_tokens": 460045892.0, + "step": 12057 + }, + { + "epoch": 1.533901539244371, + "ewc_loss": 0.041923295706510544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0961648260708898e-05, + "grad_norm": 28.055034637451172, + "learning_rate": 1e-06, + "loss": 0.4174, + 
"mean_token_accuracy": 0.8671064376831055, + "num_tokens": 460084255.0, + "step": 12058 + }, + { + "epoch": 1.5340287495229614, + "ewc_loss": 0.04195178672671318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0975892766728066e-05, + "grad_norm": 27.91022491455078, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8535904884338379, + "num_tokens": 460123706.0, + "step": 12059 + }, + { + "epoch": 1.534155959801552, + "ewc_loss": 0.04190199822187424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.095099989674054e-05, + "grad_norm": 28.060697555541992, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8536897897720337, + "num_tokens": 460162351.0, + "step": 12060 + }, + { + "epoch": 1.5342831700801425, + "ewc_loss": 0.041976138949394226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0988069081795402e-05, + "grad_norm": 28.020904541015625, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8577782511711121, + "num_tokens": 460200135.0, + "step": 12061 + }, + { + "epoch": 1.534410380358733, + "ewc_loss": 0.04194394871592522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0971974663552828e-05, + "grad_norm": 28.06275177001953, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8621044158935547, + "num_tokens": 460243585.0, + "step": 12062 + }, + { + "epoch": 1.5345375906373235, + "ewc_loss": 0.04198223724961281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0991117708035745e-05, + "grad_norm": 27.984567642211914, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.860951840877533, + "num_tokens": 460283695.0, + "step": 12063 + }, + { + "epoch": 1.5346648009159138, + "ewc_loss": 0.041855793446302414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0927896912326105e-05, + "grad_norm": 28.010269165039062, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8748452663421631, + "num_tokens": 460317241.0, + "step": 12064 + }, + { + "epoch": 
1.5347920111945044, + "ewc_loss": 0.04195454716682434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0977273379685357e-05, + "grad_norm": 28.106727600097656, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8695477247238159, + "num_tokens": 460351900.0, + "step": 12065 + }, + { + "epoch": 1.534919221473095, + "ewc_loss": 0.041952334344387054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0976167434128e-05, + "grad_norm": 28.001861572265625, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8693556189537048, + "num_tokens": 460391672.0, + "step": 12066 + }, + { + "epoch": 1.5350464317516854, + "ewc_loss": 0.04191921651363373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0959609173587523e-05, + "grad_norm": 27.983827590942383, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8593238592147827, + "num_tokens": 460434120.0, + "step": 12067 + }, + { + "epoch": 1.535173642030276, + "ewc_loss": 0.042000170797109604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1000085325795226e-05, + "grad_norm": 28.083051681518555, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8688969016075134, + "num_tokens": 460474214.0, + "step": 12068 + }, + { + "epoch": 1.5353008523088665, + "ewc_loss": 0.041980478912591934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0990239136153832e-05, + "grad_norm": 28.106666564941406, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8727859258651733, + "num_tokens": 460507549.0, + "step": 12069 + }, + { + "epoch": 1.535428062587457, + "ewc_loss": 0.04194369912147522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0971849153283983e-05, + "grad_norm": 28.07300567626953, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8593223094940186, + "num_tokens": 460546391.0, + "step": 12070 + }, + { + "epoch": 1.5355552728660475, + "ewc_loss": 0.04191657528281212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0958286768291146e-05, + "grad_norm": 28.18828010559082, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8559948205947876, + "num_tokens": 460585232.0, + "step": 12071 + }, + { + "epoch": 1.535682483144638, + "ewc_loss": 0.04189624264836311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0948120436514728e-05, + "grad_norm": 28.057037353515625, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8757004737854004, + "num_tokens": 460624398.0, + "step": 12072 + }, + { + "epoch": 1.5358096934232286, + "ewc_loss": 0.04189933463931084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0949666577507742e-05, + "grad_norm": 28.086585998535156, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8599660992622375, + "num_tokens": 460667193.0, + "step": 12073 + }, + { + "epoch": 1.5359369037018191, + "ewc_loss": 0.04198125749826431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.099062839988619e-05, + "grad_norm": 28.240076065063477, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8438265323638916, + "num_tokens": 460710614.0, + "step": 12074 + }, + { + "epoch": 1.5360641139804097, + "ewc_loss": 0.04189001023769379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0945004507666454e-05, + "grad_norm": 28.07932472229004, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8628523349761963, + "num_tokens": 460751079.0, + "step": 12075 + }, + { + "epoch": 1.5361913242590002, + "ewc_loss": 0.041812121868133545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.090606176352594e-05, + "grad_norm": 28.040225982666016, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8677331209182739, + "num_tokens": 460786768.0, + "step": 12076 + }, + { + "epoch": 1.5363185345375907, + "ewc_loss": 0.041928745806217194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096437310683541e-05, + "grad_norm": 28.23501968383789, + "learning_rate": 1e-06, + "loss": 0.4552, + 
"mean_token_accuracy": 0.8572123050689697, + "num_tokens": 460824757.0, + "step": 12077 + }, + { + "epoch": 1.5364457448161812, + "ewc_loss": 0.04184766486287117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0923833289998583e-05, + "grad_norm": 28.00417137145996, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8599938154220581, + "num_tokens": 460865024.0, + "step": 12078 + }, + { + "epoch": 1.5365729550947718, + "ewc_loss": 0.04189428687095642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0947143639205024e-05, + "grad_norm": 28.12750816345215, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8584602475166321, + "num_tokens": 460900895.0, + "step": 12079 + }, + { + "epoch": 1.5367001653733623, + "ewc_loss": 0.041970256716012955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0985127775929868e-05, + "grad_norm": 28.079458236694336, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8730447292327881, + "num_tokens": 460936876.0, + "step": 12080 + }, + { + "epoch": 1.5368273756519528, + "ewc_loss": 0.04187561944127083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093781040457543e-05, + "grad_norm": 28.279754638671875, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8518534898757935, + "num_tokens": 460975297.0, + "step": 12081 + }, + { + "epoch": 1.5369545859305433, + "ewc_loss": 0.04194951057434082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.097475589835085e-05, + "grad_norm": 28.01109504699707, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8547108173370361, + "num_tokens": 461008603.0, + "step": 12082 + }, + { + "epoch": 1.5370817962091337, + "ewc_loss": 0.04187715798616409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093857983709313e-05, + "grad_norm": 28.007007598876953, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8582410216331482, + "num_tokens": 461047770.0, + "step": 12083 + }, + { + "epoch": 
1.5372090064877242, + "ewc_loss": 0.04202026501297951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1010133423260413e-05, + "grad_norm": 28.06034278869629, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.854155957698822, + "num_tokens": 461088554.0, + "step": 12084 + }, + { + "epoch": 1.5373362167663147, + "ewc_loss": 0.0419846773147583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0992338249925524e-05, + "grad_norm": 28.247377395629883, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.845483660697937, + "num_tokens": 461126070.0, + "step": 12085 + }, + { + "epoch": 1.5374634270449052, + "ewc_loss": 0.04200204461812973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1001022105338052e-05, + "grad_norm": 28.027738571166992, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8646218776702881, + "num_tokens": 461166394.0, + "step": 12086 + }, + { + "epoch": 1.5375906373234958, + "ewc_loss": 0.04188969358801842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0944846255588345e-05, + "grad_norm": 28.019977569580078, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8594152927398682, + "num_tokens": 461208263.0, + "step": 12087 + }, + { + "epoch": 1.5377178476020863, + "ewc_loss": 0.04197201505303383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0986008166801184e-05, + "grad_norm": 28.019346237182617, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8766754269599915, + "num_tokens": 461245054.0, + "step": 12088 + }, + { + "epoch": 1.5378450578806766, + "ewc_loss": 0.04193834960460663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096917523886077e-05, + "grad_norm": 28.051799774169922, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8563410043716431, + "num_tokens": 461281584.0, + "step": 12089 + }, + { + "epoch": 1.5379722681592671, + "ewc_loss": 0.04206780716776848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.103390397678595e-05, + "grad_norm": 28.237308502197266, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8751447200775146, + "num_tokens": 461322247.0, + "step": 12090 + }, + { + "epoch": 1.5380994784378577, + "ewc_loss": 0.041994623839855194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.099731136695482e-05, + "grad_norm": 27.94411277770996, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8544036149978638, + "num_tokens": 461358371.0, + "step": 12091 + }, + { + "epoch": 1.5382266887164482, + "ewc_loss": 0.04199503734707832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0997518731746823e-05, + "grad_norm": 28.071603775024414, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8729977011680603, + "num_tokens": 461394666.0, + "step": 12092 + }, + { + "epoch": 1.5383538989950387, + "ewc_loss": 0.04207171872258186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1035859390394762e-05, + "grad_norm": 28.10147476196289, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8786372542381287, + "num_tokens": 461430875.0, + "step": 12093 + }, + { + "epoch": 1.5384811092736292, + "ewc_loss": 0.04202656447887421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.101328209391795e-05, + "grad_norm": 28.242555618286133, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.863116979598999, + "num_tokens": 461467412.0, + "step": 12094 + }, + { + "epoch": 1.5386083195522198, + "ewc_loss": 0.0420030802488327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1001540517318062e-05, + "grad_norm": 27.964439392089844, + "learning_rate": 1e-06, + "loss": 0.4829, + "mean_token_accuracy": 0.8444880247116089, + "num_tokens": 461501970.0, + "step": 12095 + }, + { + "epoch": 1.5387355298308103, + "ewc_loss": 0.041945066303014755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0972533093299717e-05, + "grad_norm": 28.117399215698242, + "learning_rate": 1e-06, + "loss": 0.4794, + 
"mean_token_accuracy": 0.8528169393539429, + "num_tokens": 461542735.0, + "step": 12096 + }, + { + "epoch": 1.5388627401094008, + "ewc_loss": 0.042048629373311996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1024314264650457e-05, + "grad_norm": 28.082334518432617, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8679924607276917, + "num_tokens": 461583883.0, + "step": 12097 + }, + { + "epoch": 1.5389899503879914, + "ewc_loss": 0.04201135411858559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1005676899221726e-05, + "grad_norm": 28.38392448425293, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8576288223266602, + "num_tokens": 461625124.0, + "step": 12098 + }, + { + "epoch": 1.5391171606665819, + "ewc_loss": 0.04194061458110809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0970306650269777e-05, + "grad_norm": 27.92021942138672, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8561851978302002, + "num_tokens": 461660273.0, + "step": 12099 + }, + { + "epoch": 1.5392443709451724, + "ewc_loss": 0.041908156126737595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.095407762681134e-05, + "grad_norm": 28.207683563232422, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8599227666854858, + "num_tokens": 461698890.0, + "step": 12100 + }, + { + "epoch": 1.539371581223763, + "ewc_loss": 0.04204181209206581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102090547850821e-05, + "grad_norm": 28.180662155151367, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.866680920124054, + "num_tokens": 461736927.0, + "step": 12101 + }, + { + "epoch": 1.5394987915023535, + "ewc_loss": 0.04191603884100914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0958019376848824e-05, + "grad_norm": 27.942800521850586, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8631670475006104, + "num_tokens": 461777796.0, + "step": 12102 + }, + { + "epoch": 
1.539626001780944, + "ewc_loss": 0.04203279688954353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1016398022766225e-05, + "grad_norm": 28.182222366333008, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8757480382919312, + "num_tokens": 461814656.0, + "step": 12103 + }, + { + "epoch": 1.5397532120595345, + "ewc_loss": 0.04205961897969246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102980943163857e-05, + "grad_norm": 28.127201080322266, + "learning_rate": 1e-06, + "loss": 0.4987, + "mean_token_accuracy": 0.8483664989471436, + "num_tokens": 461852186.0, + "step": 12104 + }, + { + "epoch": 1.539880422338125, + "ewc_loss": 0.04192134737968445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0960673282388598e-05, + "grad_norm": 27.949565887451172, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8793811798095703, + "num_tokens": 461893724.0, + "step": 12105 + }, + { + "epoch": 1.5400076326167156, + "ewc_loss": 0.041919369250535965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.095968375215307e-05, + "grad_norm": 28.172271728515625, + "learning_rate": 1e-06, + "loss": 0.4831, + "mean_token_accuracy": 0.8539462089538574, + "num_tokens": 461935380.0, + "step": 12106 + }, + { + "epoch": 1.5401348428953059, + "ewc_loss": 0.0420258529484272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1012925571994856e-05, + "grad_norm": 27.951372146606445, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8724403977394104, + "num_tokens": 461971463.0, + "step": 12107 + }, + { + "epoch": 1.5402620531738964, + "ewc_loss": 0.041901275515556335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0950637917849235e-05, + "grad_norm": 28.163713455200195, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8730440139770508, + "num_tokens": 462007514.0, + "step": 12108 + }, + { + "epoch": 1.540389263452487, + "ewc_loss": 0.042057134211063385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1028567061875947e-05, + "grad_norm": 28.026952743530273, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8793207406997681, + "num_tokens": 462046022.0, + "step": 12109 + }, + { + "epoch": 1.5405164737310775, + "ewc_loss": 0.042029306292533875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1014653611928225e-05, + "grad_norm": 28.074203491210938, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8698976039886475, + "num_tokens": 462089367.0, + "step": 12110 + }, + { + "epoch": 1.540643684009668, + "ewc_loss": 0.04192301258444786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0961506379535422e-05, + "grad_norm": 28.016719818115234, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8713347911834717, + "num_tokens": 462128558.0, + "step": 12111 + }, + { + "epoch": 1.5407708942882585, + "ewc_loss": 0.04197002947330475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.098501499858685e-05, + "grad_norm": 27.994274139404297, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8741387724876404, + "num_tokens": 462162639.0, + "step": 12112 + }, + { + "epoch": 1.5408981045668488, + "ewc_loss": 0.04204209893941879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1021049178671092e-05, + "grad_norm": 28.09749984741211, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.866851270198822, + "num_tokens": 462199303.0, + "step": 12113 + }, + { + "epoch": 1.5410253148454394, + "ewc_loss": 0.04195179417729378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0975896404706873e-05, + "grad_norm": 27.933605194091797, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8639793395996094, + "num_tokens": 462246095.0, + "step": 12114 + }, + { + "epoch": 1.54115252512403, + "ewc_loss": 0.04199271649122238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.099635821650736e-05, + "grad_norm": 28.082111358642578, + "learning_rate": 1e-06, + "loss": 0.4422, + 
"mean_token_accuracy": 0.862049400806427, + "num_tokens": 462286653.0, + "step": 12115 + }, + { + "epoch": 1.5412797354026204, + "ewc_loss": 0.04196656122803688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.098328150168527e-05, + "grad_norm": 28.001232147216797, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8641592264175415, + "num_tokens": 462328295.0, + "step": 12116 + }, + { + "epoch": 1.541406945681211, + "ewc_loss": 0.04200168699026108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1000843844376504e-05, + "grad_norm": 28.02497100830078, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8690623641014099, + "num_tokens": 462365175.0, + "step": 12117 + }, + { + "epoch": 1.5415341559598015, + "ewc_loss": 0.04204290732741356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102145299431868e-05, + "grad_norm": 27.9704647064209, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8591725826263428, + "num_tokens": 462407044.0, + "step": 12118 + }, + { + "epoch": 1.541661366238392, + "ewc_loss": 0.04210206866264343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105103339999914e-05, + "grad_norm": 28.107057571411133, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8590198755264282, + "num_tokens": 462443819.0, + "step": 12119 + }, + { + "epoch": 1.5417885765169825, + "ewc_loss": 0.042029254138469696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.101462632708717e-05, + "grad_norm": 28.00763511657715, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8760268092155457, + "num_tokens": 462487319.0, + "step": 12120 + }, + { + "epoch": 1.541915786795573, + "ewc_loss": 0.04200538992881775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1002695575589314e-05, + "grad_norm": 28.053495407104492, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8675880432128906, + "num_tokens": 462527026.0, + "step": 12121 + }, + { + "epoch": 
1.5420429970741636, + "ewc_loss": 0.042105551809072495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105277599184774e-05, + "grad_norm": 27.992809295654297, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8586406111717224, + "num_tokens": 462557552.0, + "step": 12122 + }, + { + "epoch": 1.5421702073527541, + "ewc_loss": 0.042062945663928986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.103147198795341e-05, + "grad_norm": 28.207855224609375, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8678511381149292, + "num_tokens": 462595243.0, + "step": 12123 + }, + { + "epoch": 1.5422974176313446, + "ewc_loss": 0.04201110079884529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1005549569963478e-05, + "grad_norm": 28.049646377563477, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8687195777893066, + "num_tokens": 462636077.0, + "step": 12124 + }, + { + "epoch": 1.5424246279099352, + "ewc_loss": 0.04200087487697601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1000438209739514e-05, + "grad_norm": 28.085403442382812, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.876672089099884, + "num_tokens": 462668407.0, + "step": 12125 + }, + { + "epoch": 1.5425518381885257, + "ewc_loss": 0.04206845164299011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1034225937910378e-05, + "grad_norm": 28.046892166137695, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.865947425365448, + "num_tokens": 462707678.0, + "step": 12126 + }, + { + "epoch": 1.5426790484671162, + "ewc_loss": 0.041940122842788696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0970061086700298e-05, + "grad_norm": 28.149486541748047, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8584370613098145, + "num_tokens": 462739722.0, + "step": 12127 + }, + { + "epoch": 1.5428062587457068, + "ewc_loss": 0.041987594217061996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0993797079427168e-05, + "grad_norm": 28.067651748657227, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8728321194648743, + "num_tokens": 462770369.0, + "step": 12128 + }, + { + "epoch": 1.5429334690242973, + "ewc_loss": 0.041967976838350296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.098398908856325e-05, + "grad_norm": 28.135099411010742, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8727353811264038, + "num_tokens": 462806126.0, + "step": 12129 + }, + { + "epoch": 1.5430606793028878, + "ewc_loss": 0.04208175837993622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104087980114855e-05, + "grad_norm": 28.101850509643555, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8683819770812988, + "num_tokens": 462846585.0, + "step": 12130 + }, + { + "epoch": 1.5431878895814783, + "ewc_loss": 0.04197438806295395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0987194147892296e-05, + "grad_norm": 28.142040252685547, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8628394603729248, + "num_tokens": 462889892.0, + "step": 12131 + }, + { + "epoch": 1.5433150998600687, + "ewc_loss": 0.042071383446455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1035692043369636e-05, + "grad_norm": 28.190357208251953, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8658692240715027, + "num_tokens": 462927612.0, + "step": 12132 + }, + { + "epoch": 1.5434423101386592, + "ewc_loss": 0.0419330932199955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0966546799172647e-05, + "grad_norm": 28.086828231811523, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8692127466201782, + "num_tokens": 462971661.0, + "step": 12133 + }, + { + "epoch": 1.5435695204172497, + "ewc_loss": 0.042058058083057404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1029029085184447e-05, + "grad_norm": 28.165531158447266, + "learning_rate": 1e-06, + "loss": 0.4105, + 
"mean_token_accuracy": 0.8705799579620361, + "num_tokens": 463011260.0, + "step": 12134 + }, + { + "epoch": 1.5436967306958402, + "ewc_loss": 0.042034100741147995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1017051039962098e-05, + "grad_norm": 28.045852661132812, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8722530603408813, + "num_tokens": 463045395.0, + "step": 12135 + }, + { + "epoch": 1.5438239409744308, + "ewc_loss": 0.0420067273080349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1003363144700415e-05, + "grad_norm": 28.231420516967773, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8560932278633118, + "num_tokens": 463081015.0, + "step": 12136 + }, + { + "epoch": 1.5439511512530213, + "ewc_loss": 0.04199939966201782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0999699700041674e-05, + "grad_norm": 28.079307556152344, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8656238317489624, + "num_tokens": 463115658.0, + "step": 12137 + }, + { + "epoch": 1.5440783615316116, + "ewc_loss": 0.041914742439985275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0957371816621162e-05, + "grad_norm": 28.123905181884766, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8779404163360596, + "num_tokens": 463152084.0, + "step": 12138 + }, + { + "epoch": 1.5442055718102021, + "ewc_loss": 0.042048003524541855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1024001398473047e-05, + "grad_norm": 28.180450439453125, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8560903072357178, + "num_tokens": 463190137.0, + "step": 12139 + }, + { + "epoch": 1.5443327820887927, + "ewc_loss": 0.041943274438381195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.097163633152377e-05, + "grad_norm": 28.208158493041992, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8663013577461243, + "num_tokens": 463227626.0, + "step": 12140 + }, + { + "epoch": 
1.5444599923673832, + "ewc_loss": 0.04192705079913139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0963525457773358e-05, + "grad_norm": 28.11937141418457, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8670530915260315, + "num_tokens": 463266451.0, + "step": 12141 + }, + { + "epoch": 1.5445872026459737, + "ewc_loss": 0.041943490505218506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.097174547088798e-05, + "grad_norm": 28.181854248046875, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.854038417339325, + "num_tokens": 463305327.0, + "step": 12142 + }, + { + "epoch": 1.5447144129245642, + "ewc_loss": 0.04195423051714897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.097711512760725e-05, + "grad_norm": 28.114675521850586, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8632487654685974, + "num_tokens": 463345890.0, + "step": 12143 + }, + { + "epoch": 1.5448416232031548, + "ewc_loss": 0.041930824518203735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0965411749784835e-05, + "grad_norm": 28.057697296142578, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8706411123275757, + "num_tokens": 463381583.0, + "step": 12144 + }, + { + "epoch": 1.5449688334817453, + "ewc_loss": 0.041888415813446045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.09442077903077e-05, + "grad_norm": 28.085023880004883, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.874739944934845, + "num_tokens": 463414442.0, + "step": 12145 + }, + { + "epoch": 1.5450960437603358, + "ewc_loss": 0.04194866120815277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.097433025483042e-05, + "grad_norm": 28.026674270629883, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8723913431167603, + "num_tokens": 463449517.0, + "step": 12146 + }, + { + "epoch": 1.5452232540389264, + "ewc_loss": 0.041957613080739975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.0978806787752546e-05, + "grad_norm": 28.147533416748047, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.873227596282959, + "num_tokens": 463487952.0, + "step": 12147 + }, + { + "epoch": 1.5453504643175169, + "ewc_loss": 0.0420280322432518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1014016965636984e-05, + "grad_norm": 28.102378845214844, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.867450475692749, + "num_tokens": 463520724.0, + "step": 12148 + }, + { + "epoch": 1.5454776745961074, + "ewc_loss": 0.04187231883406639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.093615876219701e-05, + "grad_norm": 28.13504981994629, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8635745048522949, + "num_tokens": 463564691.0, + "step": 12149 + }, + { + "epoch": 1.545604884874698, + "ewc_loss": 0.04200999438762665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.10049965971848e-05, + "grad_norm": 28.10799789428711, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.862666666507721, + "num_tokens": 463599037.0, + "step": 12150 + }, + { + "epoch": 1.5457320951532885, + "ewc_loss": 0.04184160754084587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0920802853652276e-05, + "grad_norm": 28.066753387451172, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8535749316215515, + "num_tokens": 463630744.0, + "step": 12151 + }, + { + "epoch": 1.545859305431879, + "ewc_loss": 0.042014725506305695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1007363102398813e-05, + "grad_norm": 28.205286026000977, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8629827499389648, + "num_tokens": 463663551.0, + "step": 12152 + }, + { + "epoch": 1.5459865157104695, + "ewc_loss": 0.04195191338658333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.097595643135719e-05, + "grad_norm": 28.023401260375977, + "learning_rate": 1e-06, + "loss": 0.3942, + 
"mean_token_accuracy": 0.8779758810997009, + "num_tokens": 463702446.0, + "step": 12153 + }, + { + "epoch": 1.54611372598906, + "ewc_loss": 0.0419788584113121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0989429685869254e-05, + "grad_norm": 28.017791748046875, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8579434156417847, + "num_tokens": 463739324.0, + "step": 12154 + }, + { + "epoch": 1.5462409362676506, + "ewc_loss": 0.041984304785728455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0992152713006362e-05, + "grad_norm": 27.976274490356445, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.846634030342102, + "num_tokens": 463780765.0, + "step": 12155 + }, + { + "epoch": 1.5463681465462409, + "ewc_loss": 0.04213535413146019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1067677153041586e-05, + "grad_norm": 28.14502716064453, + "learning_rate": 1e-06, + "loss": 0.5157, + "mean_token_accuracy": 0.8452579975128174, + "num_tokens": 463817727.0, + "step": 12156 + }, + { + "epoch": 1.5464953568248314, + "ewc_loss": 0.04210284352302551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1051420844742097e-05, + "grad_norm": 28.006650924682617, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8813856840133667, + "num_tokens": 463865793.0, + "step": 12157 + }, + { + "epoch": 1.546622567103422, + "ewc_loss": 0.04209940508008003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1049701899755746e-05, + "grad_norm": 28.001909255981445, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8673412799835205, + "num_tokens": 463908299.0, + "step": 12158 + }, + { + "epoch": 1.5467497773820125, + "ewc_loss": 0.04213850945234299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.106925421685446e-05, + "grad_norm": 28.26120948791504, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8630059361457825, + "num_tokens": 463944513.0, + "step": 12159 + }, + { + "epoch": 
1.546876987660603, + "ewc_loss": 0.04208391532301903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104195846186485e-05, + "grad_norm": 28.027433395385742, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8594245910644531, + "num_tokens": 463982862.0, + "step": 12160 + }, + { + "epoch": 1.5470041979391935, + "ewc_loss": 0.04202837869524956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.101418976963032e-05, + "grad_norm": 28.212848663330078, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8714737296104431, + "num_tokens": 464023357.0, + "step": 12161 + }, + { + "epoch": 1.5471314082177838, + "ewc_loss": 0.042138610035181046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.106930514855776e-05, + "grad_norm": 28.010536193847656, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8659390211105347, + "num_tokens": 464071889.0, + "step": 12162 + }, + { + "epoch": 1.5472586184963744, + "ewc_loss": 0.04199166223406792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0995830709580332e-05, + "grad_norm": 28.120525360107422, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8532920479774475, + "num_tokens": 464112334.0, + "step": 12163 + }, + { + "epoch": 1.5473858287749649, + "ewc_loss": 0.04211601987481117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105800922436174e-05, + "grad_norm": 28.161542892456055, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8615705966949463, + "num_tokens": 464156833.0, + "step": 12164 + }, + { + "epoch": 1.5475130390535554, + "ewc_loss": 0.041982389986515045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0991195924580097e-05, + "grad_norm": 28.07395362854004, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8687428832054138, + "num_tokens": 464192989.0, + "step": 12165 + }, + { + "epoch": 1.547640249332146, + "ewc_loss": 0.042040593922138214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1020296117058024e-05, + "grad_norm": 28.177356719970703, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8719677925109863, + "num_tokens": 464232855.0, + "step": 12166 + }, + { + "epoch": 1.5477674596107365, + "ewc_loss": 0.04201820492744446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1009102056268603e-05, + "grad_norm": 27.992229461669922, + "learning_rate": 1e-06, + "loss": 0.485, + "mean_token_accuracy": 0.8496547937393188, + "num_tokens": 464271860.0, + "step": 12167 + }, + { + "epoch": 1.547894669889327, + "ewc_loss": 0.04203098639845848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1015493985032663e-05, + "grad_norm": 28.100168228149414, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.878715455532074, + "num_tokens": 464312925.0, + "step": 12168 + }, + { + "epoch": 1.5480218801679175, + "ewc_loss": 0.042067751288414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1033874872955494e-05, + "grad_norm": 28.052703857421875, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8610982298851013, + "num_tokens": 464349190.0, + "step": 12169 + }, + { + "epoch": 1.548149090446508, + "ewc_loss": 0.041961632668972015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0980816771043465e-05, + "grad_norm": 28.108381271362305, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8614815473556519, + "num_tokens": 464392031.0, + "step": 12170 + }, + { + "epoch": 1.5482763007250986, + "ewc_loss": 0.04214122146368027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1070611182949506e-05, + "grad_norm": 28.078845977783203, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.861454963684082, + "num_tokens": 464424884.0, + "step": 12171 + }, + { + "epoch": 1.5484035110036891, + "ewc_loss": 0.04199754446744919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0998772015445866e-05, + "grad_norm": 28.087844848632812, + "learning_rate": 1e-06, + "loss": 0.4486, + 
"mean_token_accuracy": 0.8611413240432739, + "num_tokens": 464466047.0, + "step": 12172 + }, + { + "epoch": 1.5485307212822796, + "ewc_loss": 0.042117100208997726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1058549464214593e-05, + "grad_norm": 28.122209548950195, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8665143251419067, + "num_tokens": 464497293.0, + "step": 12173 + }, + { + "epoch": 1.5486579315608702, + "ewc_loss": 0.04200117290019989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1000587366870604e-05, + "grad_norm": 28.108850479125977, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8523880839347839, + "num_tokens": 464541873.0, + "step": 12174 + }, + { + "epoch": 1.5487851418394607, + "ewc_loss": 0.04210004210472107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1050020222901367e-05, + "grad_norm": 28.14690589904785, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8663098216056824, + "num_tokens": 464579712.0, + "step": 12175 + }, + { + "epoch": 1.5489123521180512, + "ewc_loss": 0.042017336934804916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.100866913679056e-05, + "grad_norm": 28.083454132080078, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8648202419281006, + "num_tokens": 464612277.0, + "step": 12176 + }, + { + "epoch": 1.5490395623966418, + "ewc_loss": 0.042042527347803116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102126381942071e-05, + "grad_norm": 28.228717803955078, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.862462043762207, + "num_tokens": 464647062.0, + "step": 12177 + }, + { + "epoch": 1.5491667726752323, + "ewc_loss": 0.04203590750694275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1017953258706257e-05, + "grad_norm": 28.052343368530273, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8680460453033447, + "num_tokens": 464677998.0, + "step": 12178 + }, + { + "epoch": 
1.5492939829538228, + "ewc_loss": 0.04200515151023865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.100257552228868e-05, + "grad_norm": 28.05205726623535, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8595155477523804, + "num_tokens": 464719205.0, + "step": 12179 + }, + { + "epoch": 1.5494211932324133, + "ewc_loss": 0.04209670424461365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1048352209618315e-05, + "grad_norm": 28.133337020874023, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8695803284645081, + "num_tokens": 464760758.0, + "step": 12180 + }, + { + "epoch": 1.5495484035110036, + "ewc_loss": 0.04206300526857376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.103150291077327e-05, + "grad_norm": 28.06991958618164, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8688433170318604, + "num_tokens": 464797422.0, + "step": 12181 + }, + { + "epoch": 1.5496756137895942, + "ewc_loss": 0.0420961007475853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1048050257377326e-05, + "grad_norm": 28.06198501586914, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8672220706939697, + "num_tokens": 464832889.0, + "step": 12182 + }, + { + "epoch": 1.5498028240681847, + "ewc_loss": 0.042096398770809174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1048199414508417e-05, + "grad_norm": 28.104042053222656, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8593065738677979, + "num_tokens": 464867743.0, + "step": 12183 + }, + { + "epoch": 1.5499300343467752, + "ewc_loss": 0.04209331423044205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104665691149421e-05, + "grad_norm": 28.067888259887695, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8531044125556946, + "num_tokens": 464904666.0, + "step": 12184 + }, + { + "epoch": 1.5500572446253658, + "ewc_loss": 0.04212438687682152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1062192899989896e-05, + "grad_norm": 28.096818923950195, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.869641125202179, + "num_tokens": 464942886.0, + "step": 12185 + }, + { + "epoch": 1.5501844549039563, + "ewc_loss": 0.04204782098531723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102391044900287e-05, + "grad_norm": 28.103609085083008, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.864837646484375, + "num_tokens": 464977463.0, + "step": 12186 + }, + { + "epoch": 1.5503116651825466, + "ewc_loss": 0.04210177809000015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105088969983626e-05, + "grad_norm": 28.047216415405273, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8748747706413269, + "num_tokens": 465015732.0, + "step": 12187 + }, + { + "epoch": 1.5504388754611371, + "ewc_loss": 0.042112454771995544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1056226614746265e-05, + "grad_norm": 28.090137481689453, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8745541572570801, + "num_tokens": 465060789.0, + "step": 12188 + }, + { + "epoch": 1.5505660857397277, + "ewc_loss": 0.0421702042222023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1085101252538152e-05, + "grad_norm": 28.056949615478516, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8633818030357361, + "num_tokens": 465097907.0, + "step": 12189 + }, + { + "epoch": 1.5506932960183182, + "ewc_loss": 0.0421777218580246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.108886110363528e-05, + "grad_norm": 28.18477439880371, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8732136487960815, + "num_tokens": 465139401.0, + "step": 12190 + }, + { + "epoch": 1.5508205062969087, + "ewc_loss": 0.04209592938423157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104796476487536e-05, + "grad_norm": 28.087003707885742, + "learning_rate": 1e-06, + "loss": 0.5149, + 
"mean_token_accuracy": 0.8449880480766296, + "num_tokens": 465176531.0, + "step": 12191 + }, + { + "epoch": 1.5509477165754992, + "ewc_loss": 0.04211985692381859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105992825818248e-05, + "grad_norm": 28.187976837158203, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8754189014434814, + "num_tokens": 465216074.0, + "step": 12192 + }, + { + "epoch": 1.5510749268540898, + "ewc_loss": 0.04212537780404091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1062689484097064e-05, + "grad_norm": 28.105567932128906, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8791770339012146, + "num_tokens": 465257986.0, + "step": 12193 + }, + { + "epoch": 1.5512021371326803, + "ewc_loss": 0.04203653335571289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1018266124883667e-05, + "grad_norm": 28.262250900268555, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8694794178009033, + "num_tokens": 465296038.0, + "step": 12194 + }, + { + "epoch": 1.5513293474112708, + "ewc_loss": 0.042160939425230026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1080470105516724e-05, + "grad_norm": 28.220989227294922, + "learning_rate": 1e-06, + "loss": 0.5006, + "mean_token_accuracy": 0.8428863286972046, + "num_tokens": 465334215.0, + "step": 12195 + }, + { + "epoch": 1.5514565576898613, + "ewc_loss": 0.041948094964027405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.097404831147287e-05, + "grad_norm": 28.081668853759766, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8660376667976379, + "num_tokens": 465373362.0, + "step": 12196 + }, + { + "epoch": 1.5515837679684519, + "ewc_loss": 0.042060256004333496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.103012775478419e-05, + "grad_norm": 28.159217834472656, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8603875637054443, + "num_tokens": 465410181.0, + "step": 12197 + }, + { + "epoch": 
1.5517109782470424, + "ewc_loss": 0.0420447438955307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102237158396747e-05, + "grad_norm": 28.12538719177246, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8733148574829102, + "num_tokens": 465443056.0, + "step": 12198 + }, + { + "epoch": 1.551838188525633, + "ewc_loss": 0.04208284616470337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104142367898021e-05, + "grad_norm": 28.134313583374023, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8686245679855347, + "num_tokens": 465484509.0, + "step": 12199 + }, + { + "epoch": 1.5519653988042235, + "ewc_loss": 0.042028505355119705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1014253434259444e-05, + "grad_norm": 28.02210807800293, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8645321130752563, + "num_tokens": 465520017.0, + "step": 12200 + }, + { + "epoch": 1.552092609082814, + "ewc_loss": 0.04198271036148071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.099135599564761e-05, + "grad_norm": 28.092449188232422, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8829813003540039, + "num_tokens": 465557221.0, + "step": 12201 + }, + { + "epoch": 1.5522198193614045, + "ewc_loss": 0.04210382327437401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1051911971881054e-05, + "grad_norm": 28.09276008605957, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8757911324501038, + "num_tokens": 465595455.0, + "step": 12202 + }, + { + "epoch": 1.552347029639995, + "ewc_loss": 0.04202733933925629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1013669538660906e-05, + "grad_norm": 28.161006927490234, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8668262958526611, + "num_tokens": 465636436.0, + "step": 12203 + }, + { + "epoch": 1.5524742399185856, + "ewc_loss": 0.04203350096940994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1016750906710513e-05, + "grad_norm": 28.040283203125, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8635656237602234, + "num_tokens": 465668956.0, + "step": 12204 + }, + { + "epoch": 1.5526014501971759, + "ewc_loss": 0.04209129139780998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104564555338584e-05, + "grad_norm": 28.281768798828125, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8479099273681641, + "num_tokens": 465708812.0, + "step": 12205 + }, + { + "epoch": 1.5527286604757664, + "ewc_loss": 0.042202770709991455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1101384845678695e-05, + "grad_norm": 28.12664222717285, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8533025979995728, + "num_tokens": 465742610.0, + "step": 12206 + }, + { + "epoch": 1.552855870754357, + "ewc_loss": 0.04203350469470024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1016752725699916e-05, + "grad_norm": 28.1640567779541, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8652505874633789, + "num_tokens": 465776468.0, + "step": 12207 + }, + { + "epoch": 1.5529830810329475, + "ewc_loss": 0.04216155782341957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1080779333715327e-05, + "grad_norm": 28.211027145385742, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8829506635665894, + "num_tokens": 465809300.0, + "step": 12208 + }, + { + "epoch": 1.553110291311538, + "ewc_loss": 0.04203737899661064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1018689949414693e-05, + "grad_norm": 28.15586280822754, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8680874109268188, + "num_tokens": 465849847.0, + "step": 12209 + }, + { + "epoch": 1.5532375015901285, + "ewc_loss": 0.04207410663366318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1037052647443488e-05, + "grad_norm": 28.173547744750977, + "learning_rate": 1e-06, + "loss": 0.3781, + 
"mean_token_accuracy": 0.8783796429634094, + "num_tokens": 465884986.0, + "step": 12210 + }, + { + "epoch": 1.5533647118687188, + "ewc_loss": 0.04206203296780586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1031017240602523e-05, + "grad_norm": 28.137466430664062, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8734287023544312, + "num_tokens": 465919710.0, + "step": 12211 + }, + { + "epoch": 1.5534919221473094, + "ewc_loss": 0.04203980416059494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1019901396357454e-05, + "grad_norm": 28.209693908691406, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8584494590759277, + "num_tokens": 465956041.0, + "step": 12212 + }, + { + "epoch": 1.5536191324258999, + "ewc_loss": 0.04214038327336311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1070190996397287e-05, + "grad_norm": 28.154457092285156, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8771950006484985, + "num_tokens": 465994352.0, + "step": 12213 + }, + { + "epoch": 1.5537463427044904, + "ewc_loss": 0.04199600592255592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0998002582928166e-05, + "grad_norm": 28.104948043823242, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8667472004890442, + "num_tokens": 466032493.0, + "step": 12214 + }, + { + "epoch": 1.553873552983081, + "ewc_loss": 0.042083192616701126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1041596482973546e-05, + "grad_norm": 28.16623878479004, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8695659637451172, + "num_tokens": 466067048.0, + "step": 12215 + }, + { + "epoch": 1.5540007632616715, + "ewc_loss": 0.04206598550081253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1032992663094774e-05, + "grad_norm": 28.14031219482422, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8441740274429321, + "num_tokens": 466106895.0, + "step": 12216 + }, + { + "epoch": 
1.554127973540262, + "ewc_loss": 0.042081110179424286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1040556021034718e-05, + "grad_norm": 28.28813934326172, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8520206809043884, + "num_tokens": 466146740.0, + "step": 12217 + }, + { + "epoch": 1.5542551838188525, + "ewc_loss": 0.042126964777708054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.106348256347701e-05, + "grad_norm": 28.06727409362793, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8724757432937622, + "num_tokens": 466187097.0, + "step": 12218 + }, + { + "epoch": 1.554382394097443, + "ewc_loss": 0.042041659355163574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1020829080953263e-05, + "grad_norm": 28.201745986938477, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8566878437995911, + "num_tokens": 466223969.0, + "step": 12219 + }, + { + "epoch": 1.5545096043760336, + "ewc_loss": 0.042126938700675964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1063469830551185e-05, + "grad_norm": 28.102693557739258, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8769150972366333, + "num_tokens": 466260127.0, + "step": 12220 + }, + { + "epoch": 1.554636814654624, + "ewc_loss": 0.04205649718642235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1028248738730326e-05, + "grad_norm": 28.084732055664062, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8721802830696106, + "num_tokens": 466297219.0, + "step": 12221 + }, + { + "epoch": 1.5547640249332146, + "ewc_loss": 0.04214072972536087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1070365619380027e-05, + "grad_norm": 28.14642906188965, + "learning_rate": 1e-06, + "loss": 0.5005, + "mean_token_accuracy": 0.8466622829437256, + "num_tokens": 466329474.0, + "step": 12222 + }, + { + "epoch": 1.5548912352118052, + "ewc_loss": 0.04209936782717705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.104968370986171e-05, + "grad_norm": 28.2200927734375, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8653289079666138, + "num_tokens": 466372801.0, + "step": 12223 + }, + { + "epoch": 1.5550184454903957, + "ewc_loss": 0.04205404967069626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102702455886174e-05, + "grad_norm": 28.173860549926758, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8737421631813049, + "num_tokens": 466412950.0, + "step": 12224 + }, + { + "epoch": 1.5551456557689862, + "ewc_loss": 0.04209392890334129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104696432070341e-05, + "grad_norm": 28.117229461669922, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8726283311843872, + "num_tokens": 466448830.0, + "step": 12225 + }, + { + "epoch": 1.5552728660475768, + "ewc_loss": 0.042119916528463364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105995918100234e-05, + "grad_norm": 28.21512222290039, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8554450273513794, + "num_tokens": 466489749.0, + "step": 12226 + }, + { + "epoch": 1.5554000763261673, + "ewc_loss": 0.04215948283672333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1079740690765902e-05, + "grad_norm": 28.1824893951416, + "learning_rate": 1e-06, + "loss": 0.4927, + "mean_token_accuracy": 0.8485896587371826, + "num_tokens": 466531320.0, + "step": 12227 + }, + { + "epoch": 1.5555272866047578, + "ewc_loss": 0.04206683114171028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.10334164876258e-05, + "grad_norm": 28.10408592224121, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8578965663909912, + "num_tokens": 466570001.0, + "step": 12228 + }, + { + "epoch": 1.5556544968833483, + "ewc_loss": 0.04207174479961395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1035872123320587e-05, + "grad_norm": 28.189674377441406, + "learning_rate": 1e-06, + "loss": 0.4195, + 
"mean_token_accuracy": 0.8698129653930664, + "num_tokens": 466607186.0, + "step": 12229 + }, + { + "epoch": 1.5557817071619386, + "ewc_loss": 0.04217566177248955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1087831555632874e-05, + "grad_norm": 28.12948226928711, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8576564788818359, + "num_tokens": 466649480.0, + "step": 12230 + }, + { + "epoch": 1.5559089174405292, + "ewc_loss": 0.04210074245929718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105037128785625e-05, + "grad_norm": 28.3354549407959, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8723911046981812, + "num_tokens": 466688861.0, + "step": 12231 + }, + { + "epoch": 1.5560361277191197, + "ewc_loss": 0.04221287742257118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1106437998241745e-05, + "grad_norm": 28.200429916381836, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8474677801132202, + "num_tokens": 466723200.0, + "step": 12232 + }, + { + "epoch": 1.5561633379977102, + "ewc_loss": 0.04193790629506111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.096895332215354e-05, + "grad_norm": 28.215417861938477, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8517992496490479, + "num_tokens": 466759639.0, + "step": 12233 + }, + { + "epoch": 1.5562905482763008, + "ewc_loss": 0.04198344424366951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0991721612517722e-05, + "grad_norm": 28.082216262817383, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8599812984466553, + "num_tokens": 466795748.0, + "step": 12234 + }, + { + "epoch": 1.5564177585548913, + "ewc_loss": 0.04202577471733093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.101288737321738e-05, + "grad_norm": 28.184606552124023, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8758177161216736, + "num_tokens": 466832938.0, + "step": 12235 + }, + { + "epoch": 
1.5565449688334816, + "ewc_loss": 0.04207261651754379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1036308680777438e-05, + "grad_norm": 28.1724853515625, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8585712909698486, + "num_tokens": 466868710.0, + "step": 12236 + }, + { + "epoch": 1.5566721791120721, + "ewc_loss": 0.04197585582733154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0987927200621925e-05, + "grad_norm": 28.090551376342773, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8628139495849609, + "num_tokens": 466903558.0, + "step": 12237 + }, + { + "epoch": 1.5567993893906626, + "ewc_loss": 0.04208190366625786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104095256072469e-05, + "grad_norm": 28.192630767822266, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8559553623199463, + "num_tokens": 466946785.0, + "step": 12238 + }, + { + "epoch": 1.5569265996692532, + "ewc_loss": 0.04203299432992935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.101649806718342e-05, + "grad_norm": 28.12452507019043, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8588131666183472, + "num_tokens": 466988563.0, + "step": 12239 + }, + { + "epoch": 1.5570538099478437, + "ewc_loss": 0.042050670832395554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1025334717705846e-05, + "grad_norm": 28.187328338623047, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8586013913154602, + "num_tokens": 467025447.0, + "step": 12240 + }, + { + "epoch": 1.5571810202264342, + "ewc_loss": 0.04206795245409012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1033976736362092e-05, + "grad_norm": 28.137065887451172, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8736395835876465, + "num_tokens": 467060751.0, + "step": 12241 + }, + { + "epoch": 1.5573082305050248, + "ewc_loss": 0.042051829397678375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1025914975325577e-05, + "grad_norm": 28.16022491455078, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8643810749053955, + "num_tokens": 467101889.0, + "step": 12242 + }, + { + "epoch": 1.5574354407836153, + "ewc_loss": 0.042038511484861374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1019255655119196e-05, + "grad_norm": 28.159069061279297, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8683722615242004, + "num_tokens": 467136782.0, + "step": 12243 + }, + { + "epoch": 1.5575626510622058, + "ewc_loss": 0.04208963364362717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1044816094217822e-05, + "grad_norm": 28.11591339111328, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.874893307685852, + "num_tokens": 467174137.0, + "step": 12244 + }, + { + "epoch": 1.5576898613407963, + "ewc_loss": 0.04209589958190918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.104795021296013e-05, + "grad_norm": 28.217391967773438, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8608795404434204, + "num_tokens": 467213919.0, + "step": 12245 + }, + { + "epoch": 1.5578170716193869, + "ewc_loss": 0.042134031653404236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1067015040898696e-05, + "grad_norm": 28.073810577392578, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8723246455192566, + "num_tokens": 467254860.0, + "step": 12246 + }, + { + "epoch": 1.5579442818979774, + "ewc_loss": 0.04204915463924408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102457801811397e-05, + "grad_norm": 28.11517333984375, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8773351907730103, + "num_tokens": 467296806.0, + "step": 12247 + }, + { + "epoch": 1.558071492176568, + "ewc_loss": 0.04208581894636154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1042909793322906e-05, + "grad_norm": 28.130102157592773, + "learning_rate": 1e-06, + "loss": 0.3974, + 
"mean_token_accuracy": 0.8724348545074463, + "num_tokens": 467332882.0, + "step": 12248 + }, + { + "epoch": 1.5581987024551585, + "ewc_loss": 0.04211008548736572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1055042452644557e-05, + "grad_norm": 28.201416015625, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8806690573692322, + "num_tokens": 467375088.0, + "step": 12249 + }, + { + "epoch": 1.558325912733749, + "ewc_loss": 0.042146191000938416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.107309592247475e-05, + "grad_norm": 28.13559341430664, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8652117252349854, + "num_tokens": 467413000.0, + "step": 12250 + }, + { + "epoch": 1.5584531230123395, + "ewc_loss": 0.042045850306749344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1022924556746148e-05, + "grad_norm": 28.191688537597656, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8801653385162354, + "num_tokens": 467450947.0, + "step": 12251 + }, + { + "epoch": 1.55858033329093, + "ewc_loss": 0.04219717159867287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1098585420986637e-05, + "grad_norm": 28.33646583557129, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8562501668930054, + "num_tokens": 467493770.0, + "step": 12252 + }, + { + "epoch": 1.5587075435695206, + "ewc_loss": 0.04216849431395531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1084246327518485e-05, + "grad_norm": 28.19542694091797, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8629571199417114, + "num_tokens": 467531882.0, + "step": 12253 + }, + { + "epoch": 1.5588347538481109, + "ewc_loss": 0.042056191712617874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102809594362043e-05, + "grad_norm": 28.26388931274414, + "learning_rate": 1e-06, + "loss": 0.3414, + "mean_token_accuracy": 0.8910514116287231, + "num_tokens": 467566059.0, + "step": 12254 + }, + { + "epoch": 
1.5589619641267014, + "ewc_loss": 0.042083777487277985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1041889340267517e-05, + "grad_norm": 28.223115921020508, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8811485767364502, + "num_tokens": 467603734.0, + "step": 12255 + }, + { + "epoch": 1.559089174405292, + "ewc_loss": 0.04198557138442993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0992785721318796e-05, + "grad_norm": 28.142986297607422, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8740810751914978, + "num_tokens": 467641224.0, + "step": 12256 + }, + { + "epoch": 1.5592163846838825, + "ewc_loss": 0.042076531797647476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1038265913375653e-05, + "grad_norm": 28.235807418823242, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8647603392601013, + "num_tokens": 467677576.0, + "step": 12257 + }, + { + "epoch": 1.559343594962473, + "ewc_loss": 0.04203706234693527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1018531697336584e-05, + "grad_norm": 28.18608856201172, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8731772899627686, + "num_tokens": 467717526.0, + "step": 12258 + }, + { + "epoch": 1.5594708052410635, + "ewc_loss": 0.04202656075358391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1013280274928547e-05, + "grad_norm": 28.186365127563477, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.861794114112854, + "num_tokens": 467757591.0, + "step": 12259 + }, + { + "epoch": 1.5595980155196538, + "ewc_loss": 0.04207821190357208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.103910628648009e-05, + "grad_norm": 28.24553680419922, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8525512218475342, + "num_tokens": 467792794.0, + "step": 12260 + }, + { + "epoch": 1.5597252257982444, + "ewc_loss": 0.04204414784908295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1022073269705288e-05, + "grad_norm": 28.191041946411133, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8611350655555725, + "num_tokens": 467829554.0, + "step": 12261 + }, + { + "epoch": 1.5598524360768349, + "ewc_loss": 0.04199424013495445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.0997120373067446e-05, + "grad_norm": 28.172136306762695, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.850208580493927, + "num_tokens": 467871403.0, + "step": 12262 + }, + { + "epoch": 1.5599796463554254, + "ewc_loss": 0.042135462164878845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1067731722723693e-05, + "grad_norm": 28.254993438720703, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8460633754730225, + "num_tokens": 467906498.0, + "step": 12263 + }, + { + "epoch": 1.560106856634016, + "ewc_loss": 0.04204235225915909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.102117650792934e-05, + "grad_norm": 28.128311157226562, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8642642498016357, + "num_tokens": 467946426.0, + "step": 12264 + }, + { + "epoch": 1.5602340669126065, + "ewc_loss": 0.04198513552546501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.099256744259037e-05, + "grad_norm": 28.026691436767578, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8686423301696777, + "num_tokens": 467979930.0, + "step": 12265 + }, + { + "epoch": 1.560361277191197, + "ewc_loss": 0.04213755950331688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1068779460620135e-05, + "grad_norm": 28.259117126464844, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8594764471054077, + "num_tokens": 468016347.0, + "step": 12266 + }, + { + "epoch": 1.5604884874697875, + "ewc_loss": 0.042078420519828796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1039209968876094e-05, + "grad_norm": 28.05303382873535, + "learning_rate": 1e-06, + "loss": 0.4036, + 
"mean_token_accuracy": 0.8725864291191101, + "num_tokens": 468051446.0, + "step": 12267 + }, + { + "epoch": 1.560615697748378, + "ewc_loss": 0.04211094230413437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1055471734143794e-05, + "grad_norm": 28.132352828979492, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8825138807296753, + "num_tokens": 468085713.0, + "step": 12268 + }, + { + "epoch": 1.5607429080269686, + "ewc_loss": 0.042149484157562256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.107474210788496e-05, + "grad_norm": 28.154582977294922, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8777255415916443, + "num_tokens": 468117863.0, + "step": 12269 + }, + { + "epoch": 1.560870118305559, + "ewc_loss": 0.04222474247217178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.111237154167611e-05, + "grad_norm": 28.212276458740234, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8626322746276855, + "num_tokens": 468153041.0, + "step": 12270 + }, + { + "epoch": 1.5609973285841496, + "ewc_loss": 0.04217083007097244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1085415937704965e-05, + "grad_norm": 28.049781799316406, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8636422753334045, + "num_tokens": 468193624.0, + "step": 12271 + }, + { + "epoch": 1.5611245388627402, + "ewc_loss": 0.042242761701345444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1121380996191874e-05, + "grad_norm": 28.331310272216797, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8804605007171631, + "num_tokens": 468227091.0, + "step": 12272 + }, + { + "epoch": 1.5612517491413307, + "ewc_loss": 0.0422014556825161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1100728190504014e-05, + "grad_norm": 28.092113494873047, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.861025333404541, + "num_tokens": 468265079.0, + "step": 12273 + }, + { + "epoch": 
1.5613789594199212, + "ewc_loss": 0.042118437588214874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1059218852315098e-05, + "grad_norm": 28.18810272216797, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8636223077774048, + "num_tokens": 468307055.0, + "step": 12274 + }, + { + "epoch": 1.5615061696985117, + "ewc_loss": 0.042216554284095764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.110827699652873e-05, + "grad_norm": 28.18017578125, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8601607084274292, + "num_tokens": 468344429.0, + "step": 12275 + }, + { + "epoch": 1.5616333799771023, + "ewc_loss": 0.04217164218425751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1085821572341956e-05, + "grad_norm": 28.25648307800293, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8644922971725464, + "num_tokens": 468379448.0, + "step": 12276 + }, + { + "epoch": 1.5617605902556928, + "ewc_loss": 0.04215943440794945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1079717043903656e-05, + "grad_norm": 28.149486541748047, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8693854212760925, + "num_tokens": 468423344.0, + "step": 12277 + }, + { + "epoch": 1.5618878005342833, + "ewc_loss": 0.04216531664133072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.108265834976919e-05, + "grad_norm": 28.32792854309082, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.872136652469635, + "num_tokens": 468461657.0, + "step": 12278 + }, + { + "epoch": 1.5620150108128736, + "ewc_loss": 0.04219650477170944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109825254592579e-05, + "grad_norm": 28.24469566345215, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8802657723426819, + "num_tokens": 468500428.0, + "step": 12279 + }, + { + "epoch": 1.5621422210914642, + "ewc_loss": 0.04213089868426323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1065448891022243e-05, + "grad_norm": 28.344287872314453, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8545966148376465, + "num_tokens": 468536310.0, + "step": 12280 + }, + { + "epoch": 1.5622694313700547, + "ewc_loss": 0.042163360863924026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1081679733470082e-05, + "grad_norm": 28.174945831298828, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8641107082366943, + "num_tokens": 468577954.0, + "step": 12281 + }, + { + "epoch": 1.5623966416486452, + "ewc_loss": 0.04209287837147713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1046438632765785e-05, + "grad_norm": 28.205476760864258, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8568678498268127, + "num_tokens": 468612131.0, + "step": 12282 + }, + { + "epoch": 1.5625238519272358, + "ewc_loss": 0.04210353270173073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.105176645272877e-05, + "grad_norm": 28.13515853881836, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8610467314720154, + "num_tokens": 468654022.0, + "step": 12283 + }, + { + "epoch": 1.5626510622058263, + "ewc_loss": 0.04210986942052841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1054935132269748e-05, + "grad_norm": 28.16564178466797, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8565642237663269, + "num_tokens": 468693120.0, + "step": 12284 + }, + { + "epoch": 1.5627782724844166, + "ewc_loss": 0.04212205484509468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1061026927782223e-05, + "grad_norm": 28.21651840209961, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8653374910354614, + "num_tokens": 468727695.0, + "step": 12285 + }, + { + "epoch": 1.5629054827630071, + "ewc_loss": 0.042150598019361496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1075298718642443e-05, + "grad_norm": 28.130638122558594, + "learning_rate": 1e-06, + "loss": 0.4367, + 
"mean_token_accuracy": 0.8623912930488586, + "num_tokens": 468766705.0, + "step": 12286 + }, + { + "epoch": 1.5630326930415976, + "ewc_loss": 0.042181920260190964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1090960217406973e-05, + "grad_norm": 28.229961395263672, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8567173480987549, + "num_tokens": 468804555.0, + "step": 12287 + }, + { + "epoch": 1.5631599033201882, + "ewc_loss": 0.04211016371846199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1055082470411435e-05, + "grad_norm": 28.174022674560547, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8622715473175049, + "num_tokens": 468842495.0, + "step": 12288 + }, + { + "epoch": 1.5632871135987787, + "ewc_loss": 0.04219680279493332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109840170305688e-05, + "grad_norm": 28.202550888061523, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8649744987487793, + "num_tokens": 468877574.0, + "step": 12289 + }, + { + "epoch": 1.5634143238773692, + "ewc_loss": 0.04213126376271248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1065632608952e-05, + "grad_norm": 28.192968368530273, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8766754269599915, + "num_tokens": 468914573.0, + "step": 12290 + }, + { + "epoch": 1.5635415341559598, + "ewc_loss": 0.04217471182346344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1087354980409145e-05, + "grad_norm": 28.227649688720703, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8765974044799805, + "num_tokens": 468950943.0, + "step": 12291 + }, + { + "epoch": 1.5636687444345503, + "ewc_loss": 0.042126789689064026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.106339525198564e-05, + "grad_norm": 28.200952529907227, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8806089758872986, + "num_tokens": 468990007.0, + "step": 12292 + }, + { + "epoch": 
1.5637959547131408, + "ewc_loss": 0.042137566953897476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1068783098598942e-05, + "grad_norm": 28.13351821899414, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8687659502029419, + "num_tokens": 469022461.0, + "step": 12293 + }, + { + "epoch": 1.5639231649917313, + "ewc_loss": 0.042157091200351715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1078545614727773e-05, + "grad_norm": 28.185344696044922, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8716300129890442, + "num_tokens": 469056396.0, + "step": 12294 + }, + { + "epoch": 1.5640503752703219, + "ewc_loss": 0.04219452291727066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1097261196700856e-05, + "grad_norm": 28.12343406677246, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8752982020378113, + "num_tokens": 469099464.0, + "step": 12295 + }, + { + "epoch": 1.5641775855489124, + "ewc_loss": 0.042222313582897186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1111156456754543e-05, + "grad_norm": 28.295490264892578, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8727197051048279, + "num_tokens": 469139830.0, + "step": 12296 + }, + { + "epoch": 1.564304795827503, + "ewc_loss": 0.04216112941503525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.108056469296571e-05, + "grad_norm": 28.088119506835938, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.864851713180542, + "num_tokens": 469177838.0, + "step": 12297 + }, + { + "epoch": 1.5644320061060935, + "ewc_loss": 0.04212241247296333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1061207007733174e-05, + "grad_norm": 28.161020278930664, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8901400566101074, + "num_tokens": 469218727.0, + "step": 12298 + }, + { + "epoch": 1.564559216384684, + "ewc_loss": 0.04222136363387108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1110681700520217e-05, + "grad_norm": 28.198862075805664, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8433087468147278, + "num_tokens": 469261186.0, + "step": 12299 + }, + { + "epoch": 1.5646864266632745, + "ewc_loss": 0.04216207191348076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1081035811221227e-05, + "grad_norm": 28.15888786315918, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8735515475273132, + "num_tokens": 469301191.0, + "step": 12300 + }, + { + "epoch": 1.564813636941865, + "ewc_loss": 0.04214749485254288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.107374712068122e-05, + "grad_norm": 28.1485538482666, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8711985945701599, + "num_tokens": 469340132.0, + "step": 12301 + }, + { + "epoch": 1.5649408472204556, + "ewc_loss": 0.04217531904578209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1087658751639538e-05, + "grad_norm": 28.231138229370117, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8851454257965088, + "num_tokens": 469376211.0, + "step": 12302 + }, + { + "epoch": 1.5650680574990459, + "ewc_loss": 0.04216174781322479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.108087392116431e-05, + "grad_norm": 28.21393394470215, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8566651344299316, + "num_tokens": 469413961.0, + "step": 12303 + }, + { + "epoch": 1.5651952677776364, + "ewc_loss": 0.0421622134745121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1081106751807965e-05, + "grad_norm": 28.21367645263672, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8658298850059509, + "num_tokens": 469449507.0, + "step": 12304 + }, + { + "epoch": 1.565322478056227, + "ewc_loss": 0.0421227402985096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1061370716779493e-05, + "grad_norm": 28.148605346679688, + "learning_rate": 1e-06, + "loss": 0.476, + 
"mean_token_accuracy": 0.852544903755188, + "num_tokens": 469486324.0, + "step": 12305 + }, + { + "epoch": 1.5654496883348175, + "ewc_loss": 0.04214370250701904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1071851733722724e-05, + "grad_norm": 28.286584854125977, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8618108034133911, + "num_tokens": 469523799.0, + "step": 12306 + }, + { + "epoch": 1.565576898613408, + "ewc_loss": 0.042180225253105164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109011256834492e-05, + "grad_norm": 28.223859786987305, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8528046011924744, + "num_tokens": 469558661.0, + "step": 12307 + }, + { + "epoch": 1.5657041088919985, + "ewc_loss": 0.042085740715265274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1042869775556028e-05, + "grad_norm": 28.1643123626709, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8645365834236145, + "num_tokens": 469603087.0, + "step": 12308 + }, + { + "epoch": 1.5658313191705888, + "ewc_loss": 0.04225228726863861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1126143110450357e-05, + "grad_norm": 28.212661743164062, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8658479452133179, + "num_tokens": 469637344.0, + "step": 12309 + }, + { + "epoch": 1.5659585294491793, + "ewc_loss": 0.042075082659721375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1037541955593042e-05, + "grad_norm": 28.138565063476562, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8753157258033752, + "num_tokens": 469681127.0, + "step": 12310 + }, + { + "epoch": 1.5660857397277699, + "ewc_loss": 0.0422099232673645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1104960978846066e-05, + "grad_norm": 28.17125701904297, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8650408983230591, + "num_tokens": 469724238.0, + "step": 12311 + }, + { + "epoch": 
1.5662129500063604, + "ewc_loss": 0.04222128912806511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1110645320732147e-05, + "grad_norm": 28.20844841003418, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8595685958862305, + "num_tokens": 469765967.0, + "step": 12312 + }, + { + "epoch": 1.566340160284951, + "ewc_loss": 0.04215165600180626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1075828044558875e-05, + "grad_norm": 28.140356063842773, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8671888113021851, + "num_tokens": 469801387.0, + "step": 12313 + }, + { + "epoch": 1.5664673705635415, + "ewc_loss": 0.04217549413442612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.108774788212031e-05, + "grad_norm": 28.12790870666504, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8663972616195679, + "num_tokens": 469835448.0, + "step": 12314 + }, + { + "epoch": 1.566594580842132, + "ewc_loss": 0.042207859456539154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1103929611854255e-05, + "grad_norm": 28.173744201660156, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8571997880935669, + "num_tokens": 469876647.0, + "step": 12315 + }, + { + "epoch": 1.5667217911207225, + "ewc_loss": 0.04224996268749237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112498077622149e-05, + "grad_norm": 28.102983474731445, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8796090483665466, + "num_tokens": 469918158.0, + "step": 12316 + }, + { + "epoch": 1.566849001399313, + "ewc_loss": 0.042192187160253525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109609340550378e-05, + "grad_norm": 28.128599166870117, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8527452945709229, + "num_tokens": 469953691.0, + "step": 12317 + }, + { + "epoch": 1.5669762116779036, + "ewc_loss": 0.04226750135421753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1133750124135986e-05, + "grad_norm": 28.190824508666992, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8691270351409912, + "num_tokens": 469996174.0, + "step": 12318 + }, + { + "epoch": 1.567103421956494, + "ewc_loss": 0.04220941290259361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.110470632032957e-05, + "grad_norm": 28.13527488708496, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8526986837387085, + "num_tokens": 470031371.0, + "step": 12319 + }, + { + "epoch": 1.5672306322350846, + "ewc_loss": 0.04231619834899902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115809911629185e-05, + "grad_norm": 28.26280975341797, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8696327209472656, + "num_tokens": 470068783.0, + "step": 12320 + }, + { + "epoch": 1.5673578425136752, + "ewc_loss": 0.04222073033452034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1110365196364e-05, + "grad_norm": 28.121315002441406, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8483080267906189, + "num_tokens": 470103490.0, + "step": 12321 + }, + { + "epoch": 1.5674850527922657, + "ewc_loss": 0.04220075532793999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.110037712554913e-05, + "grad_norm": 28.075664520263672, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8623136281967163, + "num_tokens": 470137213.0, + "step": 12322 + }, + { + "epoch": 1.5676122630708562, + "ewc_loss": 0.04225645586848259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112822767230682e-05, + "grad_norm": 28.162172317504883, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8707796335220337, + "num_tokens": 470171816.0, + "step": 12323 + }, + { + "epoch": 1.5677394733494467, + "ewc_loss": 0.04227176308631897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1135881979716942e-05, + "grad_norm": 28.161102294921875, + "learning_rate": 1e-06, + "loss": 0.5067, + 
"mean_token_accuracy": 0.842894971370697, + "num_tokens": 470210517.0, + "step": 12324 + }, + { + "epoch": 1.5678666836280373, + "ewc_loss": 0.042257312685251236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1128656953806058e-05, + "grad_norm": 28.158931732177734, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.853718638420105, + "num_tokens": 470247085.0, + "step": 12325 + }, + { + "epoch": 1.5679938939066278, + "ewc_loss": 0.04226626083254814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1133129848749377e-05, + "grad_norm": 28.189767837524414, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8665814399719238, + "num_tokens": 470288483.0, + "step": 12326 + }, + { + "epoch": 1.5681211041852183, + "ewc_loss": 0.04230617359280586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115308598149568e-05, + "grad_norm": 28.366235733032227, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.866497278213501, + "num_tokens": 470322173.0, + "step": 12327 + }, + { + "epoch": 1.5682483144638086, + "ewc_loss": 0.0422276109457016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1113804905326106e-05, + "grad_norm": 28.122852325439453, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8675258159637451, + "num_tokens": 470356617.0, + "step": 12328 + }, + { + "epoch": 1.5683755247423992, + "ewc_loss": 0.04218427836894989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109213892254047e-05, + "grad_norm": 28.2059268951416, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8756162524223328, + "num_tokens": 470393095.0, + "step": 12329 + }, + { + "epoch": 1.5685027350209897, + "ewc_loss": 0.04234744980931282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117372423526831e-05, + "grad_norm": 28.227439880371094, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8777033686637878, + "num_tokens": 470427785.0, + "step": 12330 + }, + { + "epoch": 
1.5686299452995802, + "ewc_loss": 0.042306993156671524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115349707310088e-05, + "grad_norm": 28.27531623840332, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8602089881896973, + "num_tokens": 470469971.0, + "step": 12331 + }, + { + "epoch": 1.5687571555781707, + "ewc_loss": 0.04224073514342308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11203678190941e-05, + "grad_norm": 28.189903259277344, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8580055236816406, + "num_tokens": 470503935.0, + "step": 12332 + }, + { + "epoch": 1.5688843658567613, + "ewc_loss": 0.04228061065077782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1140305761946365e-05, + "grad_norm": 28.192367553710938, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8686333298683167, + "num_tokens": 470536014.0, + "step": 12333 + }, + { + "epoch": 1.5690115761353516, + "ewc_loss": 0.04234684258699417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1173420464037918e-05, + "grad_norm": 28.171306610107422, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.879103422164917, + "num_tokens": 470575632.0, + "step": 12334 + }, + { + "epoch": 1.569138786413942, + "ewc_loss": 0.042348138988018036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1174069843254983e-05, + "grad_norm": 28.24147605895996, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8759149312973022, + "num_tokens": 470611295.0, + "step": 12335 + }, + { + "epoch": 1.5692659966925326, + "ewc_loss": 0.042320769280195236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1160383766982704e-05, + "grad_norm": 28.12690544128418, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.864086389541626, + "num_tokens": 470646993.0, + "step": 12336 + }, + { + "epoch": 1.5693932069711232, + "ewc_loss": 0.042310722172260284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1155361537239514e-05, + "grad_norm": 28.166057586669922, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8696285486221313, + "num_tokens": 470680253.0, + "step": 12337 + }, + { + "epoch": 1.5695204172497137, + "ewc_loss": 0.042352356016635895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1176178051973693e-05, + "grad_norm": 28.18933868408203, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.864235520362854, + "num_tokens": 470716169.0, + "step": 12338 + }, + { + "epoch": 1.5696476275283042, + "ewc_loss": 0.042347099632024765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117354961228557e-05, + "grad_norm": 28.134363174438477, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8785156011581421, + "num_tokens": 470754208.0, + "step": 12339 + }, + { + "epoch": 1.5697748378068948, + "ewc_loss": 0.04231094568967819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115547249559313e-05, + "grad_norm": 28.206270217895508, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8667800426483154, + "num_tokens": 470801329.0, + "step": 12340 + }, + { + "epoch": 1.5699020480854853, + "ewc_loss": 0.04243728891015053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1218644178588875e-05, + "grad_norm": 28.360437393188477, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8618431091308594, + "num_tokens": 470838408.0, + "step": 12341 + }, + { + "epoch": 1.5700292583640758, + "ewc_loss": 0.042294908314943314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11474543903023e-05, + "grad_norm": 28.223526000976562, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8655903935432434, + "num_tokens": 470874001.0, + "step": 12342 + }, + { + "epoch": 1.5701564686426663, + "ewc_loss": 0.04225510358810425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11275510082487e-05, + "grad_norm": 28.214841842651367, + "learning_rate": 1e-06, + "loss": 0.4399, + 
"mean_token_accuracy": 0.8609097003936768, + "num_tokens": 470916870.0, + "step": 12343 + }, + { + "epoch": 1.5702836789212569, + "ewc_loss": 0.04229645058512688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1148225641809404e-05, + "grad_norm": 28.257726669311523, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8742048144340515, + "num_tokens": 470953862.0, + "step": 12344 + }, + { + "epoch": 1.5704108891998474, + "ewc_loss": 0.04222540184855461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1112700778758153e-05, + "grad_norm": 28.07135009765625, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8766589760780334, + "num_tokens": 470991507.0, + "step": 12345 + }, + { + "epoch": 1.570538099478438, + "ewc_loss": 0.04236840456724167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1184201614232734e-05, + "grad_norm": 28.303516387939453, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8693537712097168, + "num_tokens": 471030109.0, + "step": 12346 + }, + { + "epoch": 1.5706653097570284, + "ewc_loss": 0.04230034723877907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11501737794606e-05, + "grad_norm": 28.14680290222168, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8800474405288696, + "num_tokens": 471063829.0, + "step": 12347 + }, + { + "epoch": 1.570792520035619, + "ewc_loss": 0.04218742623925209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1093712348374538e-05, + "grad_norm": 28.295368194580078, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8560906648635864, + "num_tokens": 471107259.0, + "step": 12348 + }, + { + "epoch": 1.5709197303142095, + "ewc_loss": 0.04223417490720749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1117088181199506e-05, + "grad_norm": 28.191587448120117, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8689770698547363, + "num_tokens": 471141516.0, + "step": 12349 + }, + { + "epoch": 
1.5710469405928, + "ewc_loss": 0.042234353721141815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.111717731168028e-05, + "grad_norm": 28.375513076782227, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8666362166404724, + "num_tokens": 471178655.0, + "step": 12350 + }, + { + "epoch": 1.5711741508713906, + "ewc_loss": 0.0422242097556591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.111210415023379e-05, + "grad_norm": 28.068510055541992, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8775197267532349, + "num_tokens": 471209437.0, + "step": 12351 + }, + { + "epoch": 1.5713013611499809, + "ewc_loss": 0.04220441356301308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1102207028889097e-05, + "grad_norm": 28.251476287841797, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8816882371902466, + "num_tokens": 471249559.0, + "step": 12352 + }, + { + "epoch": 1.5714285714285714, + "ewc_loss": 0.04225831851363182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1129158994881436e-05, + "grad_norm": 28.123754501342773, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8714898824691772, + "num_tokens": 471287179.0, + "step": 12353 + }, + { + "epoch": 1.571555781707162, + "ewc_loss": 0.042201362550258636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1100680896779522e-05, + "grad_norm": 28.083627700805664, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8697192668914795, + "num_tokens": 471328052.0, + "step": 12354 + }, + { + "epoch": 1.5716829919857525, + "ewc_loss": 0.042336005717515945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1168003513594158e-05, + "grad_norm": 28.22737693786621, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.865754246711731, + "num_tokens": 471362414.0, + "step": 12355 + }, + { + "epoch": 1.571810202264343, + "ewc_loss": 0.042342621833086014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1171310436329804e-05, + "grad_norm": 28.22511100769043, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.862206757068634, + "num_tokens": 471397910.0, + "step": 12356 + }, + { + "epoch": 1.5719374125429335, + "ewc_loss": 0.04232390969991684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1161955373827368e-05, + "grad_norm": 28.213573455810547, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8808664083480835, + "num_tokens": 471436951.0, + "step": 12357 + }, + { + "epoch": 1.5720646228215238, + "ewc_loss": 0.04232931137084961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1164656573091634e-05, + "grad_norm": 28.216114044189453, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8687276840209961, + "num_tokens": 471470139.0, + "step": 12358 + }, + { + "epoch": 1.5721918331001143, + "ewc_loss": 0.042281728237867355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1140864191693254e-05, + "grad_norm": 28.159151077270508, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8744624853134155, + "num_tokens": 471508591.0, + "step": 12359 + }, + { + "epoch": 1.5723190433787049, + "ewc_loss": 0.0423244908452034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1162244593142532e-05, + "grad_norm": 28.199352264404297, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8678851127624512, + "num_tokens": 471549165.0, + "step": 12360 + }, + { + "epoch": 1.5724462536572954, + "ewc_loss": 0.042351651936769485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1175825168029405e-05, + "grad_norm": 28.22405433654785, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8609746694564819, + "num_tokens": 471589021.0, + "step": 12361 + }, + { + "epoch": 1.572573463935886, + "ewc_loss": 0.042298682034015656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1149340682313778e-05, + "grad_norm": 28.166818618774414, + "learning_rate": 1e-06, + "loss": 0.4417, + 
"mean_token_accuracy": 0.8600883483886719, + "num_tokens": 471624263.0, + "step": 12362 + }, + { + "epoch": 1.5727006742144765, + "ewc_loss": 0.0422661229968071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1133060727152042e-05, + "grad_norm": 28.173511505126953, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8613376617431641, + "num_tokens": 471663678.0, + "step": 12363 + }, + { + "epoch": 1.572827884493067, + "ewc_loss": 0.04232403263449669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1162015400477685e-05, + "grad_norm": 28.30608558654785, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.864631712436676, + "num_tokens": 471705887.0, + "step": 12364 + }, + { + "epoch": 1.5729550947716575, + "ewc_loss": 0.04233333095908165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1166664737393148e-05, + "grad_norm": 28.178035736083984, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8740577697753906, + "num_tokens": 471739796.0, + "step": 12365 + }, + { + "epoch": 1.573082305050248, + "ewc_loss": 0.042282965034246445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.114148264809046e-05, + "grad_norm": 28.32148551940918, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8634297251701355, + "num_tokens": 471774723.0, + "step": 12366 + }, + { + "epoch": 1.5732095153288386, + "ewc_loss": 0.04226965829730034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.113482878485229e-05, + "grad_norm": 28.150083541870117, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8600601553916931, + "num_tokens": 471814169.0, + "step": 12367 + }, + { + "epoch": 1.573336725607429, + "ewc_loss": 0.04227238520979881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1136193026904948e-05, + "grad_norm": 28.248416900634766, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8678097128868103, + "num_tokens": 471852367.0, + "step": 12368 + }, + { + "epoch": 
1.5734639358860196, + "ewc_loss": 0.042281538248062134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.114076960424427e-05, + "grad_norm": 28.174560546875, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.872575044631958, + "num_tokens": 471883438.0, + "step": 12369 + }, + { + "epoch": 1.5735911461646102, + "ewc_loss": 0.04224347323179245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112173751811497e-05, + "grad_norm": 28.178466796875, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8622850179672241, + "num_tokens": 471921051.0, + "step": 12370 + }, + { + "epoch": 1.5737183564432007, + "ewc_loss": 0.042265161871910095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1132580513949506e-05, + "grad_norm": 28.201738357543945, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.870032787322998, + "num_tokens": 471959685.0, + "step": 12371 + }, + { + "epoch": 1.5738455667217912, + "ewc_loss": 0.042217593640089035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1108797227498144e-05, + "grad_norm": 28.172956466674805, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8680107593536377, + "num_tokens": 471997345.0, + "step": 12372 + }, + { + "epoch": 1.5739727770003817, + "ewc_loss": 0.04227834567427635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1139172531547956e-05, + "grad_norm": 28.255029678344727, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8730986714363098, + "num_tokens": 472034455.0, + "step": 12373 + }, + { + "epoch": 1.5740999872789723, + "ewc_loss": 0.04221370071172714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.110685090883635e-05, + "grad_norm": 28.126392364501953, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8660159111022949, + "num_tokens": 472073630.0, + "step": 12374 + }, + { + "epoch": 1.5742271975575628, + "ewc_loss": 0.0422951839864254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.114759263349697e-05, + "grad_norm": 28.256755828857422, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8654431104660034, + "num_tokens": 472111556.0, + "step": 12375 + }, + { + "epoch": 1.5743544078361533, + "ewc_loss": 0.04223252087831497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1116260541020893e-05, + "grad_norm": 28.055343627929688, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8739212155342102, + "num_tokens": 472148276.0, + "step": 12376 + }, + { + "epoch": 1.5744816181147436, + "ewc_loss": 0.042276885360479355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1138443116797134e-05, + "grad_norm": 28.263731002807617, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8666958808898926, + "num_tokens": 472181665.0, + "step": 12377 + }, + { + "epoch": 1.5746088283933342, + "ewc_loss": 0.04236219450831413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118109659932088e-05, + "grad_norm": 28.221752166748047, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8605314493179321, + "num_tokens": 472222759.0, + "step": 12378 + }, + { + "epoch": 1.5747360386719247, + "ewc_loss": 0.04225989431142807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112994661729317e-05, + "grad_norm": 28.208938598632812, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.860619306564331, + "num_tokens": 472260867.0, + "step": 12379 + }, + { + "epoch": 1.5748632489505152, + "ewc_loss": 0.04221455007791519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1107274733367376e-05, + "grad_norm": 28.231046676635742, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8552864789962769, + "num_tokens": 472300068.0, + "step": 12380 + }, + { + "epoch": 1.5749904592291057, + "ewc_loss": 0.04226791858673096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1133959307917394e-05, + "grad_norm": 28.202251434326172, + "learning_rate": 1e-06, + "loss": 0.3922, + 
"mean_token_accuracy": 0.8780261874198914, + "num_tokens": 472336459.0, + "step": 12381 + }, + { + "epoch": 1.5751176695076963, + "ewc_loss": 0.04213733971118927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1068670321255922e-05, + "grad_norm": 28.195104598999023, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8712503910064697, + "num_tokens": 472376696.0, + "step": 12382 + }, + { + "epoch": 1.5752448797862866, + "ewc_loss": 0.0423184372484684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115921779477503e-05, + "grad_norm": 28.16824722290039, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8797494173049927, + "num_tokens": 472414491.0, + "step": 12383 + }, + { + "epoch": 1.575372090064877, + "ewc_loss": 0.042219847440719604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.110992318193894e-05, + "grad_norm": 28.148897171020508, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8668030500411987, + "num_tokens": 472452188.0, + "step": 12384 + }, + { + "epoch": 1.5754993003434676, + "ewc_loss": 0.042308688163757324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1154344722162932e-05, + "grad_norm": 28.29046058654785, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8657664060592651, + "num_tokens": 472489690.0, + "step": 12385 + }, + { + "epoch": 1.5756265106220582, + "ewc_loss": 0.04224785417318344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112392758135684e-05, + "grad_norm": 28.22051239013672, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8651286363601685, + "num_tokens": 472529912.0, + "step": 12386 + }, + { + "epoch": 1.5757537209006487, + "ewc_loss": 0.042211368680000305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1105684936628677e-05, + "grad_norm": 28.20568084716797, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8655077219009399, + "num_tokens": 472562534.0, + "step": 12387 + }, + { + "epoch": 
1.5758809311792392, + "ewc_loss": 0.04219001159071922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109500564984046e-05, + "grad_norm": 28.182937622070312, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8648818135261536, + "num_tokens": 472598955.0, + "step": 12388 + }, + { + "epoch": 1.5760081414578297, + "ewc_loss": 0.04221845418214798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.110922650899738e-05, + "grad_norm": 28.08979034423828, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8705798387527466, + "num_tokens": 472638064.0, + "step": 12389 + }, + { + "epoch": 1.5761353517364203, + "ewc_loss": 0.0422431081533432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112155380018521e-05, + "grad_norm": 28.24216079711914, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8705577850341797, + "num_tokens": 472676823.0, + "step": 12390 + }, + { + "epoch": 1.5762625620150108, + "ewc_loss": 0.04233931750059128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1169658793951385e-05, + "grad_norm": 28.264314651489258, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8664961457252502, + "num_tokens": 472718528.0, + "step": 12391 + }, + { + "epoch": 1.5763897722936013, + "ewc_loss": 0.042222797870635986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1111398382345214e-05, + "grad_norm": 28.10675048828125, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8732026219367981, + "num_tokens": 472755634.0, + "step": 12392 + }, + { + "epoch": 1.5765169825721919, + "ewc_loss": 0.04227963835000992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.113982009177562e-05, + "grad_norm": 28.3068904876709, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8788633346557617, + "num_tokens": 472798060.0, + "step": 12393 + }, + { + "epoch": 1.5766441928507824, + "ewc_loss": 0.04231080412864685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1155401555006392e-05, + "grad_norm": 28.239477157592773, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.87205970287323, + "num_tokens": 472837628.0, + "step": 12394 + }, + { + "epoch": 1.576771403129373, + "ewc_loss": 0.04219762980937958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1098814613651484e-05, + "grad_norm": 28.21064567565918, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8688743710517883, + "num_tokens": 472877446.0, + "step": 12395 + }, + { + "epoch": 1.5768986134079634, + "ewc_loss": 0.042186908423900604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1093454051879235e-05, + "grad_norm": 28.197044372558594, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8611335754394531, + "num_tokens": 472918437.0, + "step": 12396 + }, + { + "epoch": 1.577025823686554, + "ewc_loss": 0.04226187989115715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1130939785507508e-05, + "grad_norm": 28.26424789428711, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8764060139656067, + "num_tokens": 472956242.0, + "step": 12397 + }, + { + "epoch": 1.5771530339651445, + "ewc_loss": 0.042196352034807205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109817614837084e-05, + "grad_norm": 28.100210189819336, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8765785694122314, + "num_tokens": 472985620.0, + "step": 12398 + }, + { + "epoch": 1.577280244243735, + "ewc_loss": 0.04217389598488808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.108694752678275e-05, + "grad_norm": 28.182559967041016, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.87168288230896, + "num_tokens": 473019517.0, + "step": 12399 + }, + { + "epoch": 1.5774074545223256, + "ewc_loss": 0.04225439578294754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1127198124304414e-05, + "grad_norm": 28.09324836730957, + "learning_rate": 1e-06, + "loss": 0.3625, + 
"mean_token_accuracy": 0.886759340763092, + "num_tokens": 473054624.0, + "step": 12400 + }, + { + "epoch": 1.5775346648009159, + "ewc_loss": 0.04217404127120972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1087020286358893e-05, + "grad_norm": 28.222299575805664, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.857435941696167, + "num_tokens": 473094140.0, + "step": 12401 + }, + { + "epoch": 1.5776618750795064, + "ewc_loss": 0.04229646176099777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1148231098777615e-05, + "grad_norm": 28.26114845275879, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8486250042915344, + "num_tokens": 473136886.0, + "step": 12402 + }, + { + "epoch": 1.577789085358097, + "ewc_loss": 0.042218782007694244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11093902180437e-05, + "grad_norm": 28.16229248046875, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8548808097839355, + "num_tokens": 473173267.0, + "step": 12403 + }, + { + "epoch": 1.5779162956366874, + "ewc_loss": 0.04218196123838425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1090980226290412e-05, + "grad_norm": 28.091352462768555, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8471170663833618, + "num_tokens": 473216477.0, + "step": 12404 + }, + { + "epoch": 1.578043505915278, + "ewc_loss": 0.04229291155934334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1146455765119754e-05, + "grad_norm": 28.42181396484375, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8591798543930054, + "num_tokens": 473256986.0, + "step": 12405 + }, + { + "epoch": 1.5781707161938685, + "ewc_loss": 0.04233035072684288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1165174985071644e-05, + "grad_norm": 28.232315063476562, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8749456405639648, + "num_tokens": 473297967.0, + "step": 12406 + }, + { + "epoch": 
1.5782979264724588, + "ewc_loss": 0.042233649641275406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1116824427735992e-05, + "grad_norm": 28.314245223999023, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8588558435440063, + "num_tokens": 473333580.0, + "step": 12407 + }, + { + "epoch": 1.5784251367510493, + "ewc_loss": 0.04228180646896362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1140902390470728e-05, + "grad_norm": 28.191434860229492, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8704667091369629, + "num_tokens": 473370553.0, + "step": 12408 + }, + { + "epoch": 1.5785523470296399, + "ewc_loss": 0.04219489172101021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1097446733620018e-05, + "grad_norm": 28.27639389038086, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8486200571060181, + "num_tokens": 473410705.0, + "step": 12409 + }, + { + "epoch": 1.5786795573082304, + "ewc_loss": 0.04228886589407921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.114443304890301e-05, + "grad_norm": 28.210468292236328, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8713850975036621, + "num_tokens": 473443315.0, + "step": 12410 + }, + { + "epoch": 1.578806767586821, + "ewc_loss": 0.042219530791044235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.110976492986083e-05, + "grad_norm": 28.332035064697266, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.858786404132843, + "num_tokens": 473488767.0, + "step": 12411 + }, + { + "epoch": 1.5789339778654115, + "ewc_loss": 0.042283542454242706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1141771867405623e-05, + "grad_norm": 28.24342918395996, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8820985555648804, + "num_tokens": 473525360.0, + "step": 12412 + }, + { + "epoch": 1.579061188144002, + "ewc_loss": 0.042175911366939545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1087955246912315e-05, + "grad_norm": 28.347881317138672, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8589149713516235, + "num_tokens": 473560625.0, + "step": 12413 + }, + { + "epoch": 1.5791883984225925, + "ewc_loss": 0.04225664213299751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11283204407664e-05, + "grad_norm": 28.171857833862305, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8853892683982849, + "num_tokens": 473595484.0, + "step": 12414 + }, + { + "epoch": 1.579315608701183, + "ewc_loss": 0.04220394045114517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1101970560266636e-05, + "grad_norm": 28.447465896606445, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8649761080741882, + "num_tokens": 473627269.0, + "step": 12415 + }, + { + "epoch": 1.5794428189797736, + "ewc_loss": 0.042331431061029434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1165715224924497e-05, + "grad_norm": 28.193805694580078, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8517489433288574, + "num_tokens": 473669159.0, + "step": 12416 + }, + { + "epoch": 1.579570029258364, + "ewc_loss": 0.042152903974056244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.107645195792429e-05, + "grad_norm": 28.248369216918945, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8769729137420654, + "num_tokens": 473707739.0, + "step": 12417 + }, + { + "epoch": 1.5796972395369546, + "ewc_loss": 0.042314812541007996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11574060813291e-05, + "grad_norm": 28.280550003051758, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8687319159507751, + "num_tokens": 473746908.0, + "step": 12418 + }, + { + "epoch": 1.5798244498155452, + "ewc_loss": 0.042197562754154205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.109878187184222e-05, + "grad_norm": 28.22623062133789, + "learning_rate": 1e-06, + "loss": 0.4321, + 
"mean_token_accuracy": 0.8648122549057007, + "num_tokens": 473781480.0, + "step": 12419 + }, + { + "epoch": 1.5799516600941357, + "ewc_loss": 0.04225604236125946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1128020307514817e-05, + "grad_norm": 28.348875045776367, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8662716746330261, + "num_tokens": 473814676.0, + "step": 12420 + }, + { + "epoch": 1.5800788703727262, + "ewc_loss": 0.04228803887963295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1144020138308406e-05, + "grad_norm": 28.268144607543945, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8629563450813293, + "num_tokens": 473855487.0, + "step": 12421 + }, + { + "epoch": 1.5802060806513167, + "ewc_loss": 0.0422545000910759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1127250874997117e-05, + "grad_norm": 28.25661849975586, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8797421455383301, + "num_tokens": 473885648.0, + "step": 12422 + }, + { + "epoch": 1.5803332909299073, + "ewc_loss": 0.04230201244354248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1151006876607426e-05, + "grad_norm": 28.380491256713867, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8627537488937378, + "num_tokens": 473923862.0, + "step": 12423 + }, + { + "epoch": 1.5804605012084978, + "ewc_loss": 0.04227195680141449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.113597838615533e-05, + "grad_norm": 28.19937515258789, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8711460828781128, + "num_tokens": 473957725.0, + "step": 12424 + }, + { + "epoch": 1.5805877114870883, + "ewc_loss": 0.04224232956767082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1121164536452852e-05, + "grad_norm": 28.2946720123291, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8681248426437378, + "num_tokens": 473994939.0, + "step": 12425 + }, + { + "epoch": 
1.5807149217656786, + "ewc_loss": 0.04235855117440224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117927579092793e-05, + "grad_norm": 28.261184692382812, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8625571727752686, + "num_tokens": 474035996.0, + "step": 12426 + }, + { + "epoch": 1.5808421320442692, + "ewc_loss": 0.042263660579919815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1131831090315245e-05, + "grad_norm": 28.22312355041504, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8754117488861084, + "num_tokens": 474071693.0, + "step": 12427 + }, + { + "epoch": 1.5809693423228597, + "ewc_loss": 0.04232412576675415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1162062694202177e-05, + "grad_norm": 28.25527572631836, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8737189173698425, + "num_tokens": 474114415.0, + "step": 12428 + }, + { + "epoch": 1.5810965526014502, + "ewc_loss": 0.04232775792479515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.116387986461632e-05, + "grad_norm": 28.357242584228516, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8609275817871094, + "num_tokens": 474152060.0, + "step": 12429 + }, + { + "epoch": 1.5812237628800407, + "ewc_loss": 0.04236600920557976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.11830047192052e-05, + "grad_norm": 28.2591552734375, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8707095980644226, + "num_tokens": 474190605.0, + "step": 12430 + }, + { + "epoch": 1.5813509731586313, + "ewc_loss": 0.042309775948524475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1154888599994592e-05, + "grad_norm": 28.389570236206055, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8622847199440002, + "num_tokens": 474229447.0, + "step": 12431 + }, + { + "epoch": 1.5814781834372216, + "ewc_loss": 0.04233654588460922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1168272724025883e-05, + "grad_norm": 28.329648971557617, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8747962713241577, + "num_tokens": 474268088.0, + "step": 12432 + }, + { + "epoch": 1.581605393715812, + "ewc_loss": 0.04228093847632408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1140469470992684e-05, + "grad_norm": 28.448890686035156, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.862454354763031, + "num_tokens": 474304856.0, + "step": 12433 + }, + { + "epoch": 1.5817326039944026, + "ewc_loss": 0.04231990873813629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1159954485483468e-05, + "grad_norm": 28.29149055480957, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8699419498443604, + "num_tokens": 474349248.0, + "step": 12434 + }, + { + "epoch": 1.5818598142729932, + "ewc_loss": 0.042139239609241486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1069619833724573e-05, + "grad_norm": 28.436189651489258, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8765376806259155, + "num_tokens": 474384432.0, + "step": 12435 + }, + { + "epoch": 1.5819870245515837, + "ewc_loss": 0.04230273887515068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115136885549873e-05, + "grad_norm": 28.319246292114258, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.865117609500885, + "num_tokens": 474415881.0, + "step": 12436 + }, + { + "epoch": 1.5821142348301742, + "ewc_loss": 0.04212986305356026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1064932298031636e-05, + "grad_norm": 28.26072120666504, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8653878569602966, + "num_tokens": 474452755.0, + "step": 12437 + }, + { + "epoch": 1.5822414451087647, + "ewc_loss": 0.04229458421468735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1147292500245385e-05, + "grad_norm": 28.528640747070312, + "learning_rate": 1e-06, + "loss": 0.4401, + 
"mean_token_accuracy": 0.8618583679199219, + "num_tokens": 474493937.0, + "step": 12438 + }, + { + "epoch": 1.5823686553873553, + "ewc_loss": 0.04221788048744202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1108940927661024e-05, + "grad_norm": 28.189495086669922, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8730597496032715, + "num_tokens": 474534072.0, + "step": 12439 + }, + { + "epoch": 1.5824958656659458, + "ewc_loss": 0.042160749435424805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.108037551806774e-05, + "grad_norm": 28.25670051574707, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8591050505638123, + "num_tokens": 474569343.0, + "step": 12440 + }, + { + "epoch": 1.5826230759445363, + "ewc_loss": 0.04223908856511116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1119543816894293e-05, + "grad_norm": 28.325244903564453, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.857198178768158, + "num_tokens": 474606610.0, + "step": 12441 + }, + { + "epoch": 1.5827502862231269, + "ewc_loss": 0.042251456528902054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112572838086635e-05, + "grad_norm": 28.318832397460938, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8575190901756287, + "num_tokens": 474645071.0, + "step": 12442 + }, + { + "epoch": 1.5828774965017174, + "ewc_loss": 0.04223262518644333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1116313291713595e-05, + "grad_norm": 28.326677322387695, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8538966178894043, + "num_tokens": 474683985.0, + "step": 12443 + }, + { + "epoch": 1.583004706780308, + "ewc_loss": 0.04222290962934494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1111454771016724e-05, + "grad_norm": 28.262584686279297, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8729184865951538, + "num_tokens": 474726102.0, + "step": 12444 + }, + { + "epoch": 
1.5831319170588984, + "ewc_loss": 0.04226566106081009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1132829715497792e-05, + "grad_norm": 28.25602912902832, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8694815635681152, + "num_tokens": 474766430.0, + "step": 12445 + }, + { + "epoch": 1.583259127337489, + "ewc_loss": 0.04225379228591919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1126896172063425e-05, + "grad_norm": 28.242919921875, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8649202585220337, + "num_tokens": 474800648.0, + "step": 12446 + }, + { + "epoch": 1.5833863376160795, + "ewc_loss": 0.04225026071071625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1125129933352582e-05, + "grad_norm": 28.111326217651367, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8701882362365723, + "num_tokens": 474845540.0, + "step": 12447 + }, + { + "epoch": 1.58351354789467, + "ewc_loss": 0.042246636003255844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112331821990665e-05, + "grad_norm": 28.195575714111328, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8640470504760742, + "num_tokens": 474884296.0, + "step": 12448 + }, + { + "epoch": 1.5836407581732606, + "ewc_loss": 0.0422598272562027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1129913875483908e-05, + "grad_norm": 28.045936584472656, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8711588978767395, + "num_tokens": 474929344.0, + "step": 12449 + }, + { + "epoch": 1.5837679684518509, + "ewc_loss": 0.04234984144568443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1174921130295843e-05, + "grad_norm": 28.253002166748047, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8716206550598145, + "num_tokens": 474968035.0, + "step": 12450 + }, + { + "epoch": 1.5838951787304414, + "ewc_loss": 0.0423508919775486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1175446818233468e-05, + "grad_norm": 28.235074996948242, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8615995645523071, + "num_tokens": 475006189.0, + "step": 12451 + }, + { + "epoch": 1.584022389009032, + "ewc_loss": 0.04230276122689247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1151379769435152e-05, + "grad_norm": 28.196861267089844, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8623181581497192, + "num_tokens": 475039768.0, + "step": 12452 + }, + { + "epoch": 1.5841495992876224, + "ewc_loss": 0.04233739525079727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1168698367546313e-05, + "grad_norm": 28.267751693725586, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8720977902412415, + "num_tokens": 475079938.0, + "step": 12453 + }, + { + "epoch": 1.584276809566213, + "ewc_loss": 0.042371898889541626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118594966304954e-05, + "grad_norm": 28.245935440063477, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.854507327079773, + "num_tokens": 475122405.0, + "step": 12454 + }, + { + "epoch": 1.5844040198448035, + "ewc_loss": 0.042314667254686356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1157333321752958e-05, + "grad_norm": 28.318517684936523, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8593572974205017, + "num_tokens": 475158237.0, + "step": 12455 + }, + { + "epoch": 1.5845312301233938, + "ewc_loss": 0.0423186793923378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115933966706507e-05, + "grad_norm": 28.324810028076172, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8681069612503052, + "num_tokens": 475194966.0, + "step": 12456 + }, + { + "epoch": 1.5846584404019843, + "ewc_loss": 0.04226215183734894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1131076209712774e-05, + "grad_norm": 28.29340171813965, + "learning_rate": 1e-06, + "loss": 0.4368, + 
"mean_token_accuracy": 0.8657381534576416, + "num_tokens": 475234483.0, + "step": 12457 + }, + { + "epoch": 1.5847856506805749, + "ewc_loss": 0.042290348559617996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1145173377590254e-05, + "grad_norm": 28.28759002685547, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8618399500846863, + "num_tokens": 475275172.0, + "step": 12458 + }, + { + "epoch": 1.5849128609591654, + "ewc_loss": 0.04227212071418762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.113606024067849e-05, + "grad_norm": 28.281028747558594, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8723940849304199, + "num_tokens": 475313550.0, + "step": 12459 + }, + { + "epoch": 1.585040071237756, + "ewc_loss": 0.04229249432682991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1146246581338346e-05, + "grad_norm": 28.267471313476562, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8748805522918701, + "num_tokens": 475357019.0, + "step": 12460 + }, + { + "epoch": 1.5851672815163464, + "ewc_loss": 0.04227569326758385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.113784648827277e-05, + "grad_norm": 28.247455596923828, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8714429140090942, + "num_tokens": 475394788.0, + "step": 12461 + }, + { + "epoch": 1.585294491794937, + "ewc_loss": 0.042333707213401794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1166853912291117e-05, + "grad_norm": 28.3546142578125, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8648741245269775, + "num_tokens": 475432354.0, + "step": 12462 + }, + { + "epoch": 1.5854217020735275, + "ewc_loss": 0.042280908674001694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1140454919077456e-05, + "grad_norm": 28.249269485473633, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8700022101402283, + "num_tokens": 475467284.0, + "step": 12463 + }, + { + "epoch": 
1.585548912352118, + "ewc_loss": 0.04222661256790161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1113306502229534e-05, + "grad_norm": 28.315126419067383, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8590598702430725, + "num_tokens": 475505946.0, + "step": 12464 + }, + { + "epoch": 1.5856761226307086, + "ewc_loss": 0.04221556335687637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1107782231410965e-05, + "grad_norm": 28.197450637817383, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8527924418449402, + "num_tokens": 475542799.0, + "step": 12465 + }, + { + "epoch": 1.585803332909299, + "ewc_loss": 0.04221895709633827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1109479348524474e-05, + "grad_norm": 28.21745491027832, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.867171585559845, + "num_tokens": 475582064.0, + "step": 12466 + }, + { + "epoch": 1.5859305431878896, + "ewc_loss": 0.04236271604895592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118135853379499e-05, + "grad_norm": 28.33082389831543, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8623893857002258, + "num_tokens": 475615521.0, + "step": 12467 + }, + { + "epoch": 1.5860577534664801, + "ewc_loss": 0.042314451187849045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115722600137815e-05, + "grad_norm": 28.356122970581055, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.874359130859375, + "num_tokens": 475650500.0, + "step": 12468 + }, + { + "epoch": 1.5861849637450707, + "ewc_loss": 0.042288266122341156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1144132915651426e-05, + "grad_norm": 28.359071731567383, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8675366044044495, + "num_tokens": 475695824.0, + "step": 12469 + }, + { + "epoch": 1.5863121740236612, + "ewc_loss": 0.04230033978819847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1150170141481794e-05, + "grad_norm": 28.361242294311523, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8442795872688293, + "num_tokens": 475734015.0, + "step": 12470 + }, + { + "epoch": 1.5864393843022517, + "ewc_loss": 0.04226259887218475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.113129994540941e-05, + "grad_norm": 28.357336044311523, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8643264770507812, + "num_tokens": 475768961.0, + "step": 12471 + }, + { + "epoch": 1.5865665945808423, + "ewc_loss": 0.042233191430568695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1116595235071145e-05, + "grad_norm": 28.229101181030273, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8810745477676392, + "num_tokens": 475806551.0, + "step": 12472 + }, + { + "epoch": 1.5866938048594328, + "ewc_loss": 0.04226638376712799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1133191694389097e-05, + "grad_norm": 28.246797561645508, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8603435754776001, + "num_tokens": 475844066.0, + "step": 12473 + }, + { + "epoch": 1.5868210151380233, + "ewc_loss": 0.042387716472148895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.119385862897616e-05, + "grad_norm": 28.41150665283203, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8708022236824036, + "num_tokens": 475874554.0, + "step": 12474 + }, + { + "epoch": 1.5869482254166136, + "ewc_loss": 0.042302146553993225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1151072360225953e-05, + "grad_norm": 28.201818466186523, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8732692003250122, + "num_tokens": 475915840.0, + "step": 12475 + }, + { + "epoch": 1.5870754356952042, + "ewc_loss": 0.04227559268474579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1137795556569472e-05, + "grad_norm": 28.329803466796875, + "learning_rate": 1e-06, + "loss": 0.4422, + 
"mean_token_accuracy": 0.8647533655166626, + "num_tokens": 475951516.0, + "step": 12476 + }, + { + "epoch": 1.5872026459737947, + "ewc_loss": 0.04234845191240311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117422627634369e-05, + "grad_norm": 28.314043045043945, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8690242767333984, + "num_tokens": 475984224.0, + "step": 12477 + }, + { + "epoch": 1.5873298562523852, + "ewc_loss": 0.04234187677502632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1170937543502077e-05, + "grad_norm": 28.361820220947266, + "learning_rate": 1e-06, + "loss": 0.4959, + "mean_token_accuracy": 0.8449145555496216, + "num_tokens": 476015758.0, + "step": 12478 + }, + { + "epoch": 1.5874570665309757, + "ewc_loss": 0.042337566614151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.116878386004828e-05, + "grad_norm": 28.403621673583984, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8633843064308167, + "num_tokens": 476051835.0, + "step": 12479 + }, + { + "epoch": 1.5875842768095663, + "ewc_loss": 0.04237689450383186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118844713550061e-05, + "grad_norm": 28.40740394592285, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8634267449378967, + "num_tokens": 476090667.0, + "step": 12480 + }, + { + "epoch": 1.5877114870881566, + "ewc_loss": 0.04237700253725052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1188501705182716e-05, + "grad_norm": 28.428211212158203, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8643347024917603, + "num_tokens": 476130448.0, + "step": 12481 + }, + { + "epoch": 1.587838697366747, + "ewc_loss": 0.04226009547710419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.113004848069977e-05, + "grad_norm": 28.275285720825195, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8575648069381714, + "num_tokens": 476174495.0, + "step": 12482 + }, + { + "epoch": 
1.5879659076453376, + "ewc_loss": 0.042331527918577194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1165764337638393e-05, + "grad_norm": 28.36668586730957, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8744833469390869, + "num_tokens": 476213412.0, + "step": 12483 + }, + { + "epoch": 1.5880931179239282, + "ewc_loss": 0.04238196834921837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1190984625718556e-05, + "grad_norm": 28.533449172973633, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8801962733268738, + "num_tokens": 476250441.0, + "step": 12484 + }, + { + "epoch": 1.5882203282025187, + "ewc_loss": 0.04229351505637169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1146757717360742e-05, + "grad_norm": 28.396947860717773, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8802695870399475, + "num_tokens": 476284466.0, + "step": 12485 + }, + { + "epoch": 1.5883475384811092, + "ewc_loss": 0.042217738926410675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1108869987074286e-05, + "grad_norm": 28.358173370361328, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8652239441871643, + "num_tokens": 476327760.0, + "step": 12486 + }, + { + "epoch": 1.5884747487596997, + "ewc_loss": 0.0423649325966835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118246629834175e-05, + "grad_norm": 28.45444107055664, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8629216551780701, + "num_tokens": 476369259.0, + "step": 12487 + }, + { + "epoch": 1.5886019590382903, + "ewc_loss": 0.04226452112197876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1132260371814482e-05, + "grad_norm": 28.37240982055664, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8678470849990845, + "num_tokens": 476403915.0, + "step": 12488 + }, + { + "epoch": 1.5887291693168808, + "ewc_loss": 0.042295314371585846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1147656298126094e-05, + "grad_norm": 28.24750518798828, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8650193214416504, + "num_tokens": 476440837.0, + "step": 12489 + }, + { + "epoch": 1.5888563795954713, + "ewc_loss": 0.04224994406104088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1124971681274474e-05, + "grad_norm": 28.354402542114258, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8647708296775818, + "num_tokens": 476478478.0, + "step": 12490 + }, + { + "epoch": 1.5889835898740619, + "ewc_loss": 0.042359888553619385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1179945179028437e-05, + "grad_norm": 28.354127883911133, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8746905326843262, + "num_tokens": 476514675.0, + "step": 12491 + }, + { + "epoch": 1.5891108001526524, + "ewc_loss": 0.04224974662065506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1124873455846682e-05, + "grad_norm": 28.17510223388672, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8640432357788086, + "num_tokens": 476559015.0, + "step": 12492 + }, + { + "epoch": 1.589238010431243, + "ewc_loss": 0.04227769747376442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1138848751434125e-05, + "grad_norm": 28.432456970214844, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8513732552528381, + "num_tokens": 476601848.0, + "step": 12493 + }, + { + "epoch": 1.5893652207098334, + "ewc_loss": 0.04238766059279442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1193829525145702e-05, + "grad_norm": 28.24146842956543, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8748153448104858, + "num_tokens": 476636794.0, + "step": 12494 + }, + { + "epoch": 1.589492430988424, + "ewc_loss": 0.04224295914173126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1121479221619666e-05, + "grad_norm": 28.316238403320312, + "learning_rate": 1e-06, + "loss": 0.4412, + 
"mean_token_accuracy": 0.8610355854034424, + "num_tokens": 476671593.0, + "step": 12495 + }, + { + "epoch": 1.5896196412670145, + "ewc_loss": 0.04230368137359619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115183997375425e-05, + "grad_norm": 28.360584259033203, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8764530420303345, + "num_tokens": 476706776.0, + "step": 12496 + }, + { + "epoch": 1.589746851545605, + "ewc_loss": 0.04223959147930145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1119796656421386e-05, + "grad_norm": 28.315946578979492, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8654874563217163, + "num_tokens": 476746700.0, + "step": 12497 + }, + { + "epoch": 1.5898740618241956, + "ewc_loss": 0.04224034771323204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.112017318722792e-05, + "grad_norm": 28.154117584228516, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8719829320907593, + "num_tokens": 476785837.0, + "step": 12498 + }, + { + "epoch": 1.5900012721027859, + "ewc_loss": 0.04228872060775757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.114436028932687e-05, + "grad_norm": 28.283628463745117, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8724298477172852, + "num_tokens": 476818894.0, + "step": 12499 + }, + { + "epoch": 1.5901284823813764, + "ewc_loss": 0.04232821241021156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.116410541930236e-05, + "grad_norm": 28.185888290405273, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8493162393569946, + "num_tokens": 476861773.0, + "step": 12500 + }, + { + "epoch": 1.590255692659967, + "ewc_loss": 0.04234692081809044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1173460481804796e-05, + "grad_norm": 28.290666580200195, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8713068962097168, + "num_tokens": 476899522.0, + "step": 12501 + }, + { + "epoch": 
1.5903829029385574, + "ewc_loss": 0.04236079379916191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1180396288400516e-05, + "grad_norm": 28.243127822875977, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8592843413352966, + "num_tokens": 476939923.0, + "step": 12502 + }, + { + "epoch": 1.590510113217148, + "ewc_loss": 0.0423477441072464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1173871573409997e-05, + "grad_norm": 28.278610229492188, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8661182522773743, + "num_tokens": 476979867.0, + "step": 12503 + }, + { + "epoch": 1.5906373234957385, + "ewc_loss": 0.04239537566900253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1197687601670623e-05, + "grad_norm": 28.340850830078125, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8747639060020447, + "num_tokens": 477016068.0, + "step": 12504 + }, + { + "epoch": 1.5907645337743288, + "ewc_loss": 0.04233409836888313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1167048544157296e-05, + "grad_norm": 28.251422882080078, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.863623321056366, + "num_tokens": 477054526.0, + "step": 12505 + }, + { + "epoch": 1.5908917440529193, + "ewc_loss": 0.04232245311141014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1161225959076546e-05, + "grad_norm": 28.295751571655273, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8620951175689697, + "num_tokens": 477098029.0, + "step": 12506 + }, + { + "epoch": 1.5910189543315099, + "ewc_loss": 0.042436495423316956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12182476388989e-05, + "grad_norm": 28.23074722290039, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.873355507850647, + "num_tokens": 477137070.0, + "step": 12507 + }, + { + "epoch": 1.5911461646101004, + "ewc_loss": 0.04234824702143669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1174122593947686e-05, + "grad_norm": 28.270889282226562, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8657089471817017, + "num_tokens": 477175640.0, + "step": 12508 + }, + { + "epoch": 1.591273374888691, + "ewc_loss": 0.04240776598453522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12038830795791e-05, + "grad_norm": 28.213191986083984, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8690441846847534, + "num_tokens": 477210935.0, + "step": 12509 + }, + { + "epoch": 1.5914005851672814, + "ewc_loss": 0.042334411293268204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1167204977246e-05, + "grad_norm": 28.285343170166016, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.873283863067627, + "num_tokens": 477251567.0, + "step": 12510 + }, + { + "epoch": 1.591527795445872, + "ewc_loss": 0.042414527386426926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1207264580880292e-05, + "grad_norm": 28.2485294342041, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8684024810791016, + "num_tokens": 477287964.0, + "step": 12511 + }, + { + "epoch": 1.5916550057244625, + "ewc_loss": 0.04242943227291107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.121471698046662e-05, + "grad_norm": 28.383737564086914, + "learning_rate": 1e-06, + "loss": 0.5057, + "mean_token_accuracy": 0.8412569165229797, + "num_tokens": 477328660.0, + "step": 12512 + }, + { + "epoch": 1.591782216003053, + "ewc_loss": 0.04245929792523384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1229649064480327e-05, + "grad_norm": 28.27518081665039, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8713694214820862, + "num_tokens": 477364111.0, + "step": 12513 + }, + { + "epoch": 1.5919094262816436, + "ewc_loss": 0.04235430061817169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1177149392315187e-05, + "grad_norm": 28.294862747192383, + "learning_rate": 1e-06, + "loss": 0.4047, + 
"mean_token_accuracy": 0.8725782632827759, + "num_tokens": 477402691.0, + "step": 12514 + }, + { + "epoch": 1.592036636560234, + "ewc_loss": 0.04238835722208023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1194178771111183e-05, + "grad_norm": 28.31492805480957, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8793126344680786, + "num_tokens": 477437526.0, + "step": 12515 + }, + { + "epoch": 1.5921638468388246, + "ewc_loss": 0.04243120551109314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1215602828306146e-05, + "grad_norm": 28.284496307373047, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8649542331695557, + "num_tokens": 477479175.0, + "step": 12516 + }, + { + "epoch": 1.5922910571174151, + "ewc_loss": 0.04234708473086357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1173542336327955e-05, + "grad_norm": 28.238327026367188, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8692245483398438, + "num_tokens": 477516896.0, + "step": 12517 + }, + { + "epoch": 1.5924182673960057, + "ewc_loss": 0.042323753237724304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1161877157283016e-05, + "grad_norm": 28.319900512695312, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8568657636642456, + "num_tokens": 477555543.0, + "step": 12518 + }, + { + "epoch": 1.5925454776745962, + "ewc_loss": 0.04241917282342911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1209585611359216e-05, + "grad_norm": 28.331886291503906, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8786087036132812, + "num_tokens": 477592251.0, + "step": 12519 + }, + { + "epoch": 1.5926726879531867, + "ewc_loss": 0.04237355291843414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1186777303228155e-05, + "grad_norm": 28.306798934936523, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8546510934829712, + "num_tokens": 477630039.0, + "step": 12520 + }, + { + "epoch": 
1.5927998982317773, + "ewc_loss": 0.042349472641944885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117473559337668e-05, + "grad_norm": 28.313383102416992, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.864360511302948, + "num_tokens": 477661028.0, + "step": 12521 + }, + { + "epoch": 1.5929271085103678, + "ewc_loss": 0.0423588752746582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1179437680984847e-05, + "grad_norm": 28.33542823791504, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8800428509712219, + "num_tokens": 477699612.0, + "step": 12522 + }, + { + "epoch": 1.5930543187889583, + "ewc_loss": 0.042375318706035614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1187659513088875e-05, + "grad_norm": 28.310592651367188, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8619344830513, + "num_tokens": 477737300.0, + "step": 12523 + }, + { + "epoch": 1.5931815290675486, + "ewc_loss": 0.042313434183597565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1156716684345156e-05, + "grad_norm": 28.325090408325195, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8547062873840332, + "num_tokens": 477776031.0, + "step": 12524 + }, + { + "epoch": 1.5933087393461391, + "ewc_loss": 0.042382121086120605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1191061023273505e-05, + "grad_norm": 28.312746047973633, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8675369024276733, + "num_tokens": 477813640.0, + "step": 12525 + }, + { + "epoch": 1.5934359496247297, + "ewc_loss": 0.04239927977323532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1199639377300628e-05, + "grad_norm": 28.305360794067383, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.862270176410675, + "num_tokens": 477848652.0, + "step": 12526 + }, + { + "epoch": 1.5935631599033202, + "ewc_loss": 0.042302872985601425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1151436158106662e-05, + "grad_norm": 28.266830444335938, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8617348670959473, + "num_tokens": 477883710.0, + "step": 12527 + }, + { + "epoch": 1.5936903701819107, + "ewc_loss": 0.04239896312355995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.119948112522252e-05, + "grad_norm": 28.274005889892578, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8601778745651245, + "num_tokens": 477920316.0, + "step": 12528 + }, + { + "epoch": 1.5938175804605013, + "ewc_loss": 0.04234076663851738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1170382751733996e-05, + "grad_norm": 28.20769691467285, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8753311634063721, + "num_tokens": 477955523.0, + "step": 12529 + }, + { + "epoch": 1.5939447907390916, + "ewc_loss": 0.042373038828372955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118651900673285e-05, + "grad_norm": 28.254487991333008, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8597780466079712, + "num_tokens": 477995768.0, + "step": 12530 + }, + { + "epoch": 1.594072001017682, + "ewc_loss": 0.04246048629283905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1230243874015287e-05, + "grad_norm": 28.275659561157227, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8662009239196777, + "num_tokens": 478037718.0, + "step": 12531 + }, + { + "epoch": 1.5941992112962726, + "ewc_loss": 0.042345549911260605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117277472279966e-05, + "grad_norm": 28.044233322143555, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8542612791061401, + "num_tokens": 478077388.0, + "step": 12532 + }, + { + "epoch": 1.5943264215748632, + "ewc_loss": 0.042452387511730194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.122619298461359e-05, + "grad_norm": 28.380739212036133, + "learning_rate": 1e-06, + "loss": 0.3827, + 
"mean_token_accuracy": 0.881045937538147, + "num_tokens": 478109494.0, + "step": 12533 + }, + { + "epoch": 1.5944536318534537, + "ewc_loss": 0.04253167659044266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1265837858663872e-05, + "grad_norm": 28.206995010375977, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8617720007896423, + "num_tokens": 478142420.0, + "step": 12534 + }, + { + "epoch": 1.5945808421320442, + "ewc_loss": 0.04241251200437546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1206256860750727e-05, + "grad_norm": 28.27321434020996, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8711575269699097, + "num_tokens": 478183848.0, + "step": 12535 + }, + { + "epoch": 1.5947080524106347, + "ewc_loss": 0.0425666905939579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1283345631673e-05, + "grad_norm": 28.267980575561523, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8754193782806396, + "num_tokens": 478225242.0, + "step": 12536 + }, + { + "epoch": 1.5948352626892253, + "ewc_loss": 0.042509522289037704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.125476203218568e-05, + "grad_norm": 28.33452796936035, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8729507327079773, + "num_tokens": 478263167.0, + "step": 12537 + }, + { + "epoch": 1.5949624729678158, + "ewc_loss": 0.04248158633708954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1240794012555853e-05, + "grad_norm": 28.342435836791992, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8675634860992432, + "num_tokens": 478297752.0, + "step": 12538 + }, + { + "epoch": 1.5950896832464063, + "ewc_loss": 0.04246048629283905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1230243874015287e-05, + "grad_norm": 28.246658325195312, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8665047287940979, + "num_tokens": 478332955.0, + "step": 12539 + }, + { + "epoch": 
1.5952168935249968, + "ewc_loss": 0.042496807873249054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.124840466422029e-05, + "grad_norm": 28.358909606933594, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.8468857407569885, + "num_tokens": 478367298.0, + "step": 12540 + }, + { + "epoch": 1.5953441038035874, + "ewc_loss": 0.042563460767269135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1281730369082652e-05, + "grad_norm": 28.35395622253418, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8703784942626953, + "num_tokens": 478411183.0, + "step": 12541 + }, + { + "epoch": 1.595471314082178, + "ewc_loss": 0.04240630567073822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1203153664828278e-05, + "grad_norm": 28.203453063964844, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8638805150985718, + "num_tokens": 478451184.0, + "step": 12542 + }, + { + "epoch": 1.5955985243607684, + "ewc_loss": 0.04245074465870857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.122537262039259e-05, + "grad_norm": 28.426904678344727, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.862433910369873, + "num_tokens": 478490913.0, + "step": 12543 + }, + { + "epoch": 1.595725734639359, + "ewc_loss": 0.04248972237110138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.124486127286218e-05, + "grad_norm": 28.23291778564453, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8679664134979248, + "num_tokens": 478529769.0, + "step": 12544 + }, + { + "epoch": 1.5958529449179495, + "ewc_loss": 0.0423712283372879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118561496899929e-05, + "grad_norm": 28.422805786132812, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8627142906188965, + "num_tokens": 478566665.0, + "step": 12545 + }, + { + "epoch": 1.59598015519654, + "ewc_loss": 0.04246436804533005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.123218473570887e-05, + "grad_norm": 28.267135620117188, + "learning_rate": 1e-06, + "loss": 0.3478, + "mean_token_accuracy": 0.8926569223403931, + "num_tokens": 478600814.0, + "step": 12546 + }, + { + "epoch": 1.5961073654751305, + "ewc_loss": 0.04240177571773529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.120088720403146e-05, + "grad_norm": 28.383638381958008, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8703106641769409, + "num_tokens": 478641610.0, + "step": 12547 + }, + { + "epoch": 1.5962345757537209, + "ewc_loss": 0.04242173209786415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1210866179899313e-05, + "grad_norm": 28.278440475463867, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8747562766075134, + "num_tokens": 478678183.0, + "step": 12548 + }, + { + "epoch": 1.5963617860323114, + "ewc_loss": 0.04245293140411377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1226465833024122e-05, + "grad_norm": 28.517196655273438, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8680812120437622, + "num_tokens": 478710881.0, + "step": 12549 + }, + { + "epoch": 1.596488996310902, + "ewc_loss": 0.042464062571525574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1232031940598972e-05, + "grad_norm": 28.378475189208984, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8804557919502258, + "num_tokens": 478748550.0, + "step": 12550 + }, + { + "epoch": 1.5966162065894924, + "ewc_loss": 0.04234306514263153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1171532353037037e-05, + "grad_norm": 28.301544189453125, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.854889988899231, + "num_tokens": 478790417.0, + "step": 12551 + }, + { + "epoch": 1.596743416868083, + "ewc_loss": 0.042447760701179504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1223881049081683e-05, + "grad_norm": 28.396120071411133, + "learning_rate": 1e-06, + "loss": 0.3972, + 
"mean_token_accuracy": 0.8730414509773254, + "num_tokens": 478828856.0, + "step": 12552 + }, + { + "epoch": 1.5968706271466735, + "ewc_loss": 0.04235166311264038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117583244398702e-05, + "grad_norm": 28.297374725341797, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8643916249275208, + "num_tokens": 478870240.0, + "step": 12553 + }, + { + "epoch": 1.5969978374252638, + "ewc_loss": 0.04234080761671066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117040457960684e-05, + "grad_norm": 28.423303604125977, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.858669638633728, + "num_tokens": 478911920.0, + "step": 12554 + }, + { + "epoch": 1.5971250477038543, + "ewc_loss": 0.04235014691948891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117507392540574e-05, + "grad_norm": 28.329618453979492, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8600174188613892, + "num_tokens": 478949083.0, + "step": 12555 + }, + { + "epoch": 1.5972522579824449, + "ewc_loss": 0.04233111813664436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1165558791835792e-05, + "grad_norm": 28.416419982910156, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.865530252456665, + "num_tokens": 478984087.0, + "step": 12556 + }, + { + "epoch": 1.5973794682610354, + "ewc_loss": 0.04235532879829407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117766416631639e-05, + "grad_norm": 28.202857971191406, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8482550978660583, + "num_tokens": 479019364.0, + "step": 12557 + }, + { + "epoch": 1.597506678539626, + "ewc_loss": 0.04230428859591484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.115214374498464e-05, + "grad_norm": 28.350685119628906, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8694041967391968, + "num_tokens": 479059002.0, + "step": 12558 + }, + { + "epoch": 
1.5976338888182164, + "ewc_loss": 0.04240557551383972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1202788047958165e-05, + "grad_norm": 28.349613189697266, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8653662204742432, + "num_tokens": 479092676.0, + "step": 12559 + }, + { + "epoch": 1.597761099096807, + "ewc_loss": 0.042349763214588165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1174881112528965e-05, + "grad_norm": 28.2957763671875, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8694479465484619, + "num_tokens": 479125951.0, + "step": 12560 + }, + { + "epoch": 1.5978883093753975, + "ewc_loss": 0.04242333769798279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1211668354226276e-05, + "grad_norm": 28.232280731201172, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8832563161849976, + "num_tokens": 479166230.0, + "step": 12561 + }, + { + "epoch": 1.598015519653988, + "ewc_loss": 0.042448755353689194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.122437763318885e-05, + "grad_norm": 28.295398712158203, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.863152801990509, + "num_tokens": 479205813.0, + "step": 12562 + }, + { + "epoch": 1.5981427299325786, + "ewc_loss": 0.0423867292702198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1193363863858394e-05, + "grad_norm": 28.175132751464844, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.862472414970398, + "num_tokens": 479244960.0, + "step": 12563 + }, + { + "epoch": 1.598269940211169, + "ewc_loss": 0.042368534952402115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118426709785126e-05, + "grad_norm": 28.304100036621094, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.873792290687561, + "num_tokens": 479289617.0, + "step": 12564 + }, + { + "epoch": 1.5983971504897596, + "ewc_loss": 0.04242202267050743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1211011699051596e-05, + "grad_norm": 28.259923934936523, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8676863312721252, + "num_tokens": 479326377.0, + "step": 12565 + }, + { + "epoch": 1.5985243607683501, + "ewc_loss": 0.04235973209142685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.117986514349468e-05, + "grad_norm": 28.218828201293945, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8660328388214111, + "num_tokens": 479360563.0, + "step": 12566 + }, + { + "epoch": 1.5986515710469407, + "ewc_loss": 0.042475949972867966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1237974578980356e-05, + "grad_norm": 28.283510208129883, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8786740899085999, + "num_tokens": 479403042.0, + "step": 12567 + }, + { + "epoch": 1.5987787813255312, + "ewc_loss": 0.042528558522462845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1264278984745033e-05, + "grad_norm": 28.388870239257812, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8722639083862305, + "num_tokens": 479437099.0, + "step": 12568 + }, + { + "epoch": 1.5989059916041217, + "ewc_loss": 0.0424644872546196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1232242943369783e-05, + "grad_norm": 28.286334991455078, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8658528327941895, + "num_tokens": 479479326.0, + "step": 12569 + }, + { + "epoch": 1.5990332018827123, + "ewc_loss": 0.04236435890197754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118217889801599e-05, + "grad_norm": 28.350465774536133, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8489578366279602, + "num_tokens": 479515320.0, + "step": 12570 + }, + { + "epoch": 1.5991604121613028, + "ewc_loss": 0.04246344789862633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1231724531389773e-05, + "grad_norm": 28.26902198791504, + "learning_rate": 1e-06, + "loss": 0.4641, + 
"mean_token_accuracy": 0.8565973043441772, + "num_tokens": 479559960.0, + "step": 12571 + }, + { + "epoch": 1.5992876224398933, + "ewc_loss": 0.04245796799659729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.122898331435863e-05, + "grad_norm": 28.34950065612793, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8671190142631531, + "num_tokens": 479599115.0, + "step": 12572 + }, + { + "epoch": 1.5994148327184836, + "ewc_loss": 0.04237272962927818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.118636439263355e-05, + "grad_norm": 28.282472610473633, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.875167191028595, + "num_tokens": 479636116.0, + "step": 12573 + }, + { + "epoch": 1.5995420429970741, + "ewc_loss": 0.042437344789505005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1218671463429928e-05, + "grad_norm": 28.28900718688965, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8581035137176514, + "num_tokens": 479676880.0, + "step": 12574 + }, + { + "epoch": 1.5996692532756647, + "ewc_loss": 0.04241502657532692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1207513782428578e-05, + "grad_norm": 28.295146942138672, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8694028258323669, + "num_tokens": 479717424.0, + "step": 12575 + }, + { + "epoch": 1.5997964635542552, + "ewc_loss": 0.04242316260933876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1211581042734906e-05, + "grad_norm": 28.34869384765625, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8657081723213196, + "num_tokens": 479756240.0, + "step": 12576 + }, + { + "epoch": 1.5999236738328457, + "ewc_loss": 0.04243839159607887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.121919533237815e-05, + "grad_norm": 28.39215850830078, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8532142043113708, + "num_tokens": 479791863.0, + "step": 12577 + }, + { + "epoch": 
1.6000508841114363, + "ewc_loss": 0.042356595396995544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1178297174628824e-05, + "grad_norm": 28.26669692993164, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8769516348838806, + "num_tokens": 479827252.0, + "step": 12578 + }, + { + "epoch": 1.6001780943900266, + "ewc_loss": 0.0424303375184536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12151680898387e-05, + "grad_norm": 28.239662170410156, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8628374338150024, + "num_tokens": 479863100.0, + "step": 12579 + }, + { + "epoch": 1.600305304668617, + "ewc_loss": 0.04241245239973068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1206225937930867e-05, + "grad_norm": 28.262454986572266, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.855886697769165, + "num_tokens": 479899684.0, + "step": 12580 + }, + { + "epoch": 1.6004325149472076, + "ewc_loss": 0.042527131736278534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1263565940898843e-05, + "grad_norm": 28.379505157470703, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8672705888748169, + "num_tokens": 479937656.0, + "step": 12581 + }, + { + "epoch": 1.6005597252257981, + "ewc_loss": 0.042496949434280396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1248475604807027e-05, + "grad_norm": 28.320405960083008, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8620474934577942, + "num_tokens": 479973357.0, + "step": 12582 + }, + { + "epoch": 1.6006869355043887, + "ewc_loss": 0.04244531691074371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12226586882025e-05, + "grad_norm": 28.242128372192383, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8674500584602356, + "num_tokens": 480017862.0, + "step": 12583 + }, + { + "epoch": 1.6008141457829792, + "ewc_loss": 0.042468879371881485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1234440282569267e-05, + "grad_norm": 28.363128662109375, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8728466033935547, + "num_tokens": 480057039.0, + "step": 12584 + }, + { + "epoch": 1.6009413560615697, + "ewc_loss": 0.042452938854694366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.122646947100293e-05, + "grad_norm": 28.224098205566406, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8693473935127258, + "num_tokens": 480093396.0, + "step": 12585 + }, + { + "epoch": 1.6010685663401603, + "ewc_loss": 0.04247094690799713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.123547346855048e-05, + "grad_norm": 28.348325729370117, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8626589179039001, + "num_tokens": 480125200.0, + "step": 12586 + }, + { + "epoch": 1.6011957766187508, + "ewc_loss": 0.04260888695716858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.130444408976473e-05, + "grad_norm": 28.42023277282715, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8635213971138, + "num_tokens": 480158085.0, + "step": 12587 + }, + { + "epoch": 1.6013229868973413, + "ewc_loss": 0.04244156926870346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1220785129116848e-05, + "grad_norm": 28.065786361694336, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8817894458770752, + "num_tokens": 480192979.0, + "step": 12588 + }, + { + "epoch": 1.6014501971759318, + "ewc_loss": 0.04256045073270798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.128022606484592e-05, + "grad_norm": 28.32412338256836, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8763273358345032, + "num_tokens": 480227984.0, + "step": 12589 + }, + { + "epoch": 1.6015774074545224, + "ewc_loss": 0.04261576756834984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1307883798726834e-05, + "grad_norm": 28.212081909179688, + "learning_rate": 1e-06, + "loss": 0.3743, + 
"mean_token_accuracy": 0.8837334513664246, + "num_tokens": 480264280.0, + "step": 12590 + }, + { + "epoch": 1.601704617733113, + "ewc_loss": 0.04254447668790817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.127223888237495e-05, + "grad_norm": 28.359420776367188, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8525139093399048, + "num_tokens": 480300670.0, + "step": 12591 + }, + { + "epoch": 1.6018318280117034, + "ewc_loss": 0.042675722390413284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.133786074409727e-05, + "grad_norm": 28.364839553833008, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8654506206512451, + "num_tokens": 480334460.0, + "step": 12592 + }, + { + "epoch": 1.601959038290294, + "ewc_loss": 0.04248283803462982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1241419744910672e-05, + "grad_norm": 28.245370864868164, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8768951892852783, + "num_tokens": 480376066.0, + "step": 12593 + }, + { + "epoch": 1.6020862485688845, + "ewc_loss": 0.04254627227783203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1273135644150898e-05, + "grad_norm": 28.296154022216797, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.873127281665802, + "num_tokens": 480417575.0, + "step": 12594 + }, + { + "epoch": 1.602213458847475, + "ewc_loss": 0.04263569414615631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1317846403690055e-05, + "grad_norm": 28.50871467590332, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.856654942035675, + "num_tokens": 480454106.0, + "step": 12595 + }, + { + "epoch": 1.6023406691260655, + "ewc_loss": 0.042490649968385696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1245325115160085e-05, + "grad_norm": 28.264537811279297, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8544328212738037, + "num_tokens": 480493031.0, + "step": 12596 + }, + { + "epoch": 
1.6024678794046558, + "ewc_loss": 0.04249901324510574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1249506971798837e-05, + "grad_norm": 28.35816764831543, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8676420450210571, + "num_tokens": 480528326.0, + "step": 12597 + }, + { + "epoch": 1.6025950896832464, + "ewc_loss": 0.04254818335175514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1274092432577163e-05, + "grad_norm": 28.261966705322266, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8744240999221802, + "num_tokens": 480565399.0, + "step": 12598 + }, + { + "epoch": 1.602722299961837, + "ewc_loss": 0.042492013424634933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1246007236186415e-05, + "grad_norm": 28.37322235107422, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8831002116203308, + "num_tokens": 480600028.0, + "step": 12599 + }, + { + "epoch": 1.6028495102404274, + "ewc_loss": 0.0425228588283062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1261428628349677e-05, + "grad_norm": 28.27513313293457, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8670270442962646, + "num_tokens": 480635759.0, + "step": 12600 + }, + { + "epoch": 1.602976720519018, + "ewc_loss": 0.0424480177462101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.122400837833993e-05, + "grad_norm": 28.26743507385254, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8729109168052673, + "num_tokens": 480669528.0, + "step": 12601 + }, + { + "epoch": 1.6031039307976085, + "ewc_loss": 0.04252808913588524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1264044335111976e-05, + "grad_norm": 28.337135314941406, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8643534183502197, + "num_tokens": 480709425.0, + "step": 12602 + }, + { + "epoch": 1.6032311410761988, + "ewc_loss": 0.04251107573509216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1255538740660995e-05, + "grad_norm": 28.31646728515625, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.852385938167572, + "num_tokens": 480745010.0, + "step": 12603 + }, + { + "epoch": 1.6033583513547893, + "ewc_loss": 0.042565055191516876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1282527086441405e-05, + "grad_norm": 28.25506019592285, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8734046816825867, + "num_tokens": 480780320.0, + "step": 12604 + }, + { + "epoch": 1.6034855616333799, + "ewc_loss": 0.04250066727399826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1250332792988047e-05, + "grad_norm": 28.356678009033203, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8741717338562012, + "num_tokens": 480819335.0, + "step": 12605 + }, + { + "epoch": 1.6036127719119704, + "ewc_loss": 0.042596131563186646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1298066712915897e-05, + "grad_norm": 28.30927848815918, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.879500687122345, + "num_tokens": 480852702.0, + "step": 12606 + }, + { + "epoch": 1.603739982190561, + "ewc_loss": 0.04255802184343338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.127901097992435e-05, + "grad_norm": 28.41706657409668, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8608485460281372, + "num_tokens": 480890258.0, + "step": 12607 + }, + { + "epoch": 1.6038671924691514, + "ewc_loss": 0.04262971505522728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.131485780410003e-05, + "grad_norm": 28.19818687438965, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8668996095657349, + "num_tokens": 480929555.0, + "step": 12608 + }, + { + "epoch": 1.603994402747742, + "ewc_loss": 0.04252235218882561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1261175788822584e-05, + "grad_norm": 28.366554260253906, + "learning_rate": 1e-06, + "loss": 0.3627, + 
"mean_token_accuracy": 0.8894978761672974, + "num_tokens": 480970295.0, + "step": 12609 + }, + { + "epoch": 1.6041216130263325, + "ewc_loss": 0.04257318750023842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1286594346747734e-05, + "grad_norm": 28.343666076660156, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.868026614189148, + "num_tokens": 481011213.0, + "step": 12610 + }, + { + "epoch": 1.604248823304923, + "ewc_loss": 0.04256458953022957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.128229425579775e-05, + "grad_norm": 28.303438186645508, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8622298240661621, + "num_tokens": 481045273.0, + "step": 12611 + }, + { + "epoch": 1.6043760335835135, + "ewc_loss": 0.042527444660663605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1263722373987548e-05, + "grad_norm": 28.363378524780273, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.848272442817688, + "num_tokens": 481084567.0, + "step": 12612 + }, + { + "epoch": 1.604503243862104, + "ewc_loss": 0.04259016364812851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.129508175130468e-05, + "grad_norm": 28.19835662841797, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8681648373603821, + "num_tokens": 481119455.0, + "step": 12613 + }, + { + "epoch": 1.6046304541406946, + "ewc_loss": 0.042587727308273315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1293863028404303e-05, + "grad_norm": 28.389299392700195, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8521531820297241, + "num_tokens": 481154718.0, + "step": 12614 + }, + { + "epoch": 1.6047576644192851, + "ewc_loss": 0.04267344996333122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1336725694709457e-05, + "grad_norm": 28.385894775390625, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8613063097000122, + "num_tokens": 481193716.0, + "step": 12615 + }, + { + "epoch": 
1.6048848746978757, + "ewc_loss": 0.04248383641242981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1241918148007244e-05, + "grad_norm": 28.31092071533203, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8437896966934204, + "num_tokens": 481231378.0, + "step": 12616 + }, + { + "epoch": 1.6050120849764662, + "ewc_loss": 0.04258355870842934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.129177846654784e-05, + "grad_norm": 28.298383712768555, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8655792474746704, + "num_tokens": 481267907.0, + "step": 12617 + }, + { + "epoch": 1.6051392952550567, + "ewc_loss": 0.042500488460063934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1250243662507273e-05, + "grad_norm": 28.306659698486328, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8681955337524414, + "num_tokens": 481309899.0, + "step": 12618 + }, + { + "epoch": 1.6052665055336472, + "ewc_loss": 0.042581818997859955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1290908989612944e-05, + "grad_norm": 28.41510772705078, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8768818378448486, + "num_tokens": 481348845.0, + "step": 12619 + }, + { + "epoch": 1.6053937158122378, + "ewc_loss": 0.04253691807389259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.126845902239438e-05, + "grad_norm": 28.283885955810547, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8573296070098877, + "num_tokens": 481384999.0, + "step": 12620 + }, + { + "epoch": 1.6055209260908283, + "ewc_loss": 0.04253531992435455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1267660486046225e-05, + "grad_norm": 28.376949310302734, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8686143159866333, + "num_tokens": 481417287.0, + "step": 12621 + }, + { + "epoch": 1.6056481363694186, + "ewc_loss": 0.04259152337908745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1295762053341605e-05, + "grad_norm": 28.32915687561035, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8713477253913879, + "num_tokens": 481455943.0, + "step": 12622 + }, + { + "epoch": 1.6057753466480091, + "ewc_loss": 0.042543310672044754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.127165498677641e-05, + "grad_norm": 28.32185173034668, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8809338212013245, + "num_tokens": 481492017.0, + "step": 12623 + }, + { + "epoch": 1.6059025569265997, + "ewc_loss": 0.04258548095822334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1292740711942315e-05, + "grad_norm": 28.349946975708008, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8599437475204468, + "num_tokens": 481535115.0, + "step": 12624 + }, + { + "epoch": 1.6060297672051902, + "ewc_loss": 0.042582862079143524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1291431039571762e-05, + "grad_norm": 28.22146987915039, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.851438045501709, + "num_tokens": 481580048.0, + "step": 12625 + }, + { + "epoch": 1.6061569774837807, + "ewc_loss": 0.04256312921643257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.128156484104693e-05, + "grad_norm": 28.33544158935547, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.877346396446228, + "num_tokens": 481616748.0, + "step": 12626 + }, + { + "epoch": 1.6062841877623713, + "ewc_loss": 0.04264890030026436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1324449335224926e-05, + "grad_norm": 28.329191207885742, + "learning_rate": 1e-06, + "loss": 0.5136, + "mean_token_accuracy": 0.8387109041213989, + "num_tokens": 481659123.0, + "step": 12627 + }, + { + "epoch": 1.6064113980409616, + "ewc_loss": 0.04253501817584038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.126750950992573e-05, + "grad_norm": 28.264875411987305, + "learning_rate": 1e-06, + "loss": 0.4747, + 
"mean_token_accuracy": 0.8578363656997681, + "num_tokens": 481704791.0, + "step": 12628 + }, + { + "epoch": 1.606538608319552, + "ewc_loss": 0.042557355016469955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1278678104863502e-05, + "grad_norm": 28.272258758544922, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8537441492080688, + "num_tokens": 481742913.0, + "step": 12629 + }, + { + "epoch": 1.6066658185981426, + "ewc_loss": 0.04254129156470299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1270645447657444e-05, + "grad_norm": 28.23026466369629, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8533472418785095, + "num_tokens": 481783549.0, + "step": 12630 + }, + { + "epoch": 1.6067930288767331, + "ewc_loss": 0.04255146160721779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1275731342029758e-05, + "grad_norm": 28.293149948120117, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8724977970123291, + "num_tokens": 481818486.0, + "step": 12631 + }, + { + "epoch": 1.6069202391553237, + "ewc_loss": 0.042650576680898666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.132528788933996e-05, + "grad_norm": 28.40213966369629, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8667293190956116, + "num_tokens": 481849969.0, + "step": 12632 + }, + { + "epoch": 1.6070474494339142, + "ewc_loss": 0.04257584363222122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1287922209012322e-05, + "grad_norm": 28.25105094909668, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8510376811027527, + "num_tokens": 481886108.0, + "step": 12633 + }, + { + "epoch": 1.6071746597125047, + "ewc_loss": 0.04258294776082039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1291474695317447e-05, + "grad_norm": 28.326663970947266, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8782290816307068, + "num_tokens": 481919389.0, + "step": 12634 + }, + { + "epoch": 
1.6073018699910953, + "ewc_loss": 0.04263807088136673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1319036022759974e-05, + "grad_norm": 28.38271141052246, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8797214031219482, + "num_tokens": 481958531.0, + "step": 12635 + }, + { + "epoch": 1.6074290802696858, + "ewc_loss": 0.04254221171140671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.127110565197654e-05, + "grad_norm": 28.336559295654297, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.862301766872406, + "num_tokens": 481999864.0, + "step": 12636 + }, + { + "epoch": 1.6075562905482763, + "ewc_loss": 0.042609408497810364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1304704205249436e-05, + "grad_norm": 28.362361907958984, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8908315896987915, + "num_tokens": 482033890.0, + "step": 12637 + }, + { + "epoch": 1.6076835008268668, + "ewc_loss": 0.042614296078681946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1307147108018398e-05, + "grad_norm": 28.305023193359375, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8728315234184265, + "num_tokens": 482071186.0, + "step": 12638 + }, + { + "epoch": 1.6078107111054574, + "ewc_loss": 0.04250844568014145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.125422361132223e-05, + "grad_norm": 28.285749435424805, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8659802675247192, + "num_tokens": 482107218.0, + "step": 12639 + }, + { + "epoch": 1.607937921384048, + "ewc_loss": 0.04261026531457901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1305133486748673e-05, + "grad_norm": 28.386560440063477, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8625216484069824, + "num_tokens": 482146101.0, + "step": 12640 + }, + { + "epoch": 1.6080651316626384, + "ewc_loss": 0.04251964017748833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1259820641716942e-05, + "grad_norm": 28.23093605041504, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8802012801170349, + "num_tokens": 482178531.0, + "step": 12641 + }, + { + "epoch": 1.608192341941229, + "ewc_loss": 0.04254508763551712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1272544472594745e-05, + "grad_norm": 28.269084930419922, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8758811354637146, + "num_tokens": 482217019.0, + "step": 12642 + }, + { + "epoch": 1.6083195522198195, + "ewc_loss": 0.04266297072172165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1331485186237842e-05, + "grad_norm": 28.393354415893555, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8674642443656921, + "num_tokens": 482250547.0, + "step": 12643 + }, + { + "epoch": 1.60844676249841, + "ewc_loss": 0.0425802543759346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.129012682416942e-05, + "grad_norm": 28.25798225402832, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8629521727561951, + "num_tokens": 482287009.0, + "step": 12644 + }, + { + "epoch": 1.6085739727770005, + "ewc_loss": 0.042647477239370346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.132373811036814e-05, + "grad_norm": 28.427433013916016, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8531705141067505, + "num_tokens": 482327373.0, + "step": 12645 + }, + { + "epoch": 1.6087011830555908, + "ewc_loss": 0.04263553023338318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1317764549166895e-05, + "grad_norm": 28.44516372680664, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8554856181144714, + "num_tokens": 482363409.0, + "step": 12646 + }, + { + "epoch": 1.6088283933341814, + "ewc_loss": 0.0425020307302475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1251014914014377e-05, + "grad_norm": 28.356016159057617, + "learning_rate": 1e-06, + "loss": 0.4533, + 
"mean_token_accuracy": 0.856523871421814, + "num_tokens": 482396499.0, + "step": 12647 + }, + { + "epoch": 1.608955603612772, + "ewc_loss": 0.04261009767651558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1305047994246706e-05, + "grad_norm": 28.343730926513672, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8682768940925598, + "num_tokens": 482435059.0, + "step": 12648 + }, + { + "epoch": 1.6090828138913624, + "ewc_loss": 0.04257586970925331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1287934941938147e-05, + "grad_norm": 28.41423988342285, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8599604964256287, + "num_tokens": 482472793.0, + "step": 12649 + }, + { + "epoch": 1.609210024169953, + "ewc_loss": 0.042563546448946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1281774024828337e-05, + "grad_norm": 28.338788986206055, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8682665824890137, + "num_tokens": 482508857.0, + "step": 12650 + }, + { + "epoch": 1.6093372344485435, + "ewc_loss": 0.04256201535463333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1281008230289444e-05, + "grad_norm": 28.355012893676758, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.8497543334960938, + "num_tokens": 482548526.0, + "step": 12651 + }, + { + "epoch": 1.6094644447271338, + "ewc_loss": 0.042502228170633316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1251114958431572e-05, + "grad_norm": 28.394176483154297, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.858397901058197, + "num_tokens": 482586461.0, + "step": 12652 + }, + { + "epoch": 1.6095916550057243, + "ewc_loss": 0.04259031265974045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1295156329870224e-05, + "grad_norm": 28.351560592651367, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8598455786705017, + "num_tokens": 482624642.0, + "step": 12653 + }, + { + "epoch": 
1.6097188652843148, + "ewc_loss": 0.042528968304395676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1264484530547634e-05, + "grad_norm": 28.360376358032227, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8656611442565918, + "num_tokens": 482664096.0, + "step": 12654 + }, + { + "epoch": 1.6098460755629054, + "ewc_loss": 0.04264020919799805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1320103769539855e-05, + "grad_norm": 28.396326065063477, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8772404193878174, + "num_tokens": 482701230.0, + "step": 12655 + }, + { + "epoch": 1.609973285841496, + "ewc_loss": 0.04258475825190544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.129237873305101e-05, + "grad_norm": 28.363258361816406, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8668478727340698, + "num_tokens": 482741611.0, + "step": 12656 + }, + { + "epoch": 1.6101004961200864, + "ewc_loss": 0.042508725076913834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12543618545169e-05, + "grad_norm": 28.203506469726562, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8764163255691528, + "num_tokens": 482779482.0, + "step": 12657 + }, + { + "epoch": 1.610227706398677, + "ewc_loss": 0.04261261224746704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.130630673491396e-05, + "grad_norm": 28.467493057250977, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8751634359359741, + "num_tokens": 482816279.0, + "step": 12658 + }, + { + "epoch": 1.6103549166772675, + "ewc_loss": 0.04261654242873192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.130827124346979e-05, + "grad_norm": 28.394433975219727, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.858184814453125, + "num_tokens": 482859941.0, + "step": 12659 + }, + { + "epoch": 1.610482126955858, + "ewc_loss": 0.04248058423399925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1240291971480474e-05, + "grad_norm": 28.425580978393555, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8713334798812866, + "num_tokens": 482894348.0, + "step": 12660 + }, + { + "epoch": 1.6106093372344485, + "ewc_loss": 0.04253099113702774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1265495888656005e-05, + "grad_norm": 28.256546020507812, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8605443239212036, + "num_tokens": 482930882.0, + "step": 12661 + }, + { + "epoch": 1.610736547513039, + "ewc_loss": 0.04249711334705353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1248557459330186e-05, + "grad_norm": 28.402612686157227, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8494411706924438, + "num_tokens": 482969515.0, + "step": 12662 + }, + { + "epoch": 1.6108637577916296, + "ewc_loss": 0.04257502779364586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1287514755385928e-05, + "grad_norm": 28.314741134643555, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8634672164916992, + "num_tokens": 483008074.0, + "step": 12663 + }, + { + "epoch": 1.6109909680702201, + "ewc_loss": 0.042479027062654495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1239513444015756e-05, + "grad_norm": 28.444177627563477, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.859455943107605, + "num_tokens": 483038920.0, + "step": 12664 + }, + { + "epoch": 1.6111181783488107, + "ewc_loss": 0.04254337027668953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1271685909596272e-05, + "grad_norm": 28.302621841430664, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8773527145385742, + "num_tokens": 483075559.0, + "step": 12665 + }, + { + "epoch": 1.6112453886274012, + "ewc_loss": 0.04255665838718414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.127832885889802e-05, + "grad_norm": 28.392717361450195, + "learning_rate": 1e-06, + "loss": 0.421, + 
"mean_token_accuracy": 0.8702956438064575, + "num_tokens": 483114378.0, + "step": 12666 + }, + { + "epoch": 1.6113725989059917, + "ewc_loss": 0.042528409510850906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1264204406179488e-05, + "grad_norm": 28.358003616333008, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8704974055290222, + "num_tokens": 483147911.0, + "step": 12667 + }, + { + "epoch": 1.6114998091845822, + "ewc_loss": 0.04252616688609123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1263083908706903e-05, + "grad_norm": 28.276750564575195, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8815081119537354, + "num_tokens": 483188275.0, + "step": 12668 + }, + { + "epoch": 1.6116270194631728, + "ewc_loss": 0.04263212904334068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.131606379407458e-05, + "grad_norm": 28.483179092407227, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8779430389404297, + "num_tokens": 483230804.0, + "step": 12669 + }, + { + "epoch": 1.6117542297417633, + "ewc_loss": 0.04253533482551575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.126766776200384e-05, + "grad_norm": 28.39727020263672, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8694548010826111, + "num_tokens": 483262368.0, + "step": 12670 + }, + { + "epoch": 1.6118814400203536, + "ewc_loss": 0.042558979243040085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1279489374137484e-05, + "grad_norm": 28.327274322509766, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8547700643539429, + "num_tokens": 483300894.0, + "step": 12671 + }, + { + "epoch": 1.6120086502989441, + "ewc_loss": 0.04258302226662636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1291511075105518e-05, + "grad_norm": 28.44581413269043, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8728762865066528, + "num_tokens": 483337377.0, + "step": 12672 + }, + { + "epoch": 
1.6121358605775347, + "ewc_loss": 0.0426042266190052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1302113964338787e-05, + "grad_norm": 28.366811752319336, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8658105134963989, + "num_tokens": 483376572.0, + "step": 12673 + }, + { + "epoch": 1.6122630708561252, + "ewc_loss": 0.042528752237558365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.126437539118342e-05, + "grad_norm": 28.32605743408203, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8548840880393982, + "num_tokens": 483415584.0, + "step": 12674 + }, + { + "epoch": 1.6123902811347157, + "ewc_loss": 0.042616620659828186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1308311261236668e-05, + "grad_norm": 28.389789581298828, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8676822185516357, + "num_tokens": 483459584.0, + "step": 12675 + }, + { + "epoch": 1.6125174914133062, + "ewc_loss": 0.04259654879570007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12982740777079e-05, + "grad_norm": 28.32436752319336, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8606893420219421, + "num_tokens": 483504260.0, + "step": 12676 + }, + { + "epoch": 1.6126447016918966, + "ewc_loss": 0.04256657138466835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1283285605022684e-05, + "grad_norm": 28.487407684326172, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8692273497581482, + "num_tokens": 483547271.0, + "step": 12677 + }, + { + "epoch": 1.612771911970487, + "ewc_loss": 0.04261024296283722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1305120753822848e-05, + "grad_norm": 28.359926223754883, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8687424659729004, + "num_tokens": 483582255.0, + "step": 12678 + }, + { + "epoch": 1.6128991222490776, + "ewc_loss": 0.04261086881160736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1305433620000258e-05, + "grad_norm": 28.416156768798828, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8616561889648438, + "num_tokens": 483622290.0, + "step": 12679 + }, + { + "epoch": 1.6130263325276681, + "ewc_loss": 0.04262151941657066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.131075962097384e-05, + "grad_norm": 28.462812423706055, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8795484304428101, + "num_tokens": 483657253.0, + "step": 12680 + }, + { + "epoch": 1.6131535428062587, + "ewc_loss": 0.0425434336066246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1271716832416132e-05, + "grad_norm": 28.282377243041992, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8708118796348572, + "num_tokens": 483693062.0, + "step": 12681 + }, + { + "epoch": 1.6132807530848492, + "ewc_loss": 0.04252154007554054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1260770154185593e-05, + "grad_norm": 28.380868911743164, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.861181378364563, + "num_tokens": 483735845.0, + "step": 12682 + }, + { + "epoch": 1.6134079633634397, + "ewc_loss": 0.04252684488892555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1263422240735963e-05, + "grad_norm": 28.32463836669922, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.870763897895813, + "num_tokens": 483783833.0, + "step": 12683 + }, + { + "epoch": 1.6135351736420303, + "ewc_loss": 0.04249844700098038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1249223209451884e-05, + "grad_norm": 28.29470443725586, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.865687370300293, + "num_tokens": 483820175.0, + "step": 12684 + }, + { + "epoch": 1.6136623839206208, + "ewc_loss": 0.04255126789212227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1275633116601966e-05, + "grad_norm": 28.43381118774414, + "learning_rate": 1e-06, + "loss": 0.3992, + 
"mean_token_accuracy": 0.878182053565979, + "num_tokens": 483863454.0, + "step": 12685 + }, + { + "epoch": 1.6137895941992113, + "ewc_loss": 0.04251034930348396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1255174942780286e-05, + "grad_norm": 28.354679107666016, + "learning_rate": 1e-06, + "loss": 0.3654, + "mean_token_accuracy": 0.8868575096130371, + "num_tokens": 483900657.0, + "step": 12686 + }, + { + "epoch": 1.6139168044778018, + "ewc_loss": 0.04254964366555214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1274821847327985e-05, + "grad_norm": 28.543489456176758, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8603832721710205, + "num_tokens": 483935912.0, + "step": 12687 + }, + { + "epoch": 1.6140440147563924, + "ewc_loss": 0.042472511529922485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1236255633994006e-05, + "grad_norm": 28.29024314880371, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8677419424057007, + "num_tokens": 483968594.0, + "step": 12688 + }, + { + "epoch": 1.614171225034983, + "ewc_loss": 0.04248052462935448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1240262867650017e-05, + "grad_norm": 28.452409744262695, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8756459951400757, + "num_tokens": 484010866.0, + "step": 12689 + }, + { + "epoch": 1.6142984353135734, + "ewc_loss": 0.0425649993121624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1282499801600352e-05, + "grad_norm": 28.4219970703125, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8612010478973389, + "num_tokens": 484051355.0, + "step": 12690 + }, + { + "epoch": 1.614425645592164, + "ewc_loss": 0.04241790995001793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1208954422036186e-05, + "grad_norm": 28.42406463623047, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8765301704406738, + "num_tokens": 484087049.0, + "step": 12691 + }, + { + "epoch": 
1.6145528558707545, + "ewc_loss": 0.04246030002832413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1230149286566302e-05, + "grad_norm": 28.33098793029785, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8564245700836182, + "num_tokens": 484124914.0, + "step": 12692 + }, + { + "epoch": 1.614680066149345, + "ewc_loss": 0.04253493249416351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1267465854180045e-05, + "grad_norm": 28.519060134887695, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8842575550079346, + "num_tokens": 484154524.0, + "step": 12693 + }, + { + "epoch": 1.6148072764279355, + "ewc_loss": 0.04249219223856926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.124609636666719e-05, + "grad_norm": 28.364599227905273, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8665042519569397, + "num_tokens": 484195091.0, + "step": 12694 + }, + { + "epoch": 1.6149344867065258, + "ewc_loss": 0.042433179914951324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.121659053955227e-05, + "grad_norm": 28.479013442993164, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8687506914138794, + "num_tokens": 484236787.0, + "step": 12695 + }, + { + "epoch": 1.6150616969851164, + "ewc_loss": 0.04243636876344681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1218183974269778e-05, + "grad_norm": 28.25873565673828, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8624202013015747, + "num_tokens": 484281748.0, + "step": 12696 + }, + { + "epoch": 1.615188907263707, + "ewc_loss": 0.042446862906217575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1223431758699007e-05, + "grad_norm": 28.358966827392578, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8734899163246155, + "num_tokens": 484321181.0, + "step": 12697 + }, + { + "epoch": 1.6153161175422974, + "ewc_loss": 0.042529068887233734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.126453364326153e-05, + "grad_norm": 28.279787063598633, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8644270896911621, + "num_tokens": 484360476.0, + "step": 12698 + }, + { + "epoch": 1.615443327820888, + "ewc_loss": 0.04247166961431503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1235835447441787e-05, + "grad_norm": 28.334396362304688, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8678171038627625, + "num_tokens": 484395065.0, + "step": 12699 + }, + { + "epoch": 1.6155705380994785, + "ewc_loss": 0.04247427359223366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1237136024865322e-05, + "grad_norm": 28.25733757019043, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8687598705291748, + "num_tokens": 484439309.0, + "step": 12700 + }, + { + "epoch": 1.6156977483780688, + "ewc_loss": 0.042498644441366196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1249321434879676e-05, + "grad_norm": 28.420616149902344, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8676002621650696, + "num_tokens": 484473018.0, + "step": 12701 + }, + { + "epoch": 1.6158249586566593, + "ewc_loss": 0.04258495569229126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12924769584788e-05, + "grad_norm": 28.326648712158203, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8745578527450562, + "num_tokens": 484518306.0, + "step": 12702 + }, + { + "epoch": 1.6159521689352498, + "ewc_loss": 0.04259294271469116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1296471459208988e-05, + "grad_norm": 28.407922744750977, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.852948009967804, + "num_tokens": 484556348.0, + "step": 12703 + }, + { + "epoch": 1.6160793792138404, + "ewc_loss": 0.04256003722548485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1280018700053915e-05, + "grad_norm": 28.312284469604492, + "learning_rate": 1e-06, + "loss": 0.395, + 
"mean_token_accuracy": 0.8786829710006714, + "num_tokens": 484593136.0, + "step": 12704 + }, + { + "epoch": 1.616206589492431, + "ewc_loss": 0.04250459372997284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1252297301543877e-05, + "grad_norm": 28.34975814819336, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8766388297080994, + "num_tokens": 484628025.0, + "step": 12705 + }, + { + "epoch": 1.6163337997710214, + "ewc_loss": 0.04258221015334129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1291105440468527e-05, + "grad_norm": 28.35133934020996, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8730524778366089, + "num_tokens": 484666703.0, + "step": 12706 + }, + { + "epoch": 1.616461010049612, + "ewc_loss": 0.042573507875204086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1286754417815246e-05, + "grad_norm": 28.354578018188477, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8765554428100586, + "num_tokens": 484703417.0, + "step": 12707 + }, + { + "epoch": 1.6165882203282025, + "ewc_loss": 0.04265202581882477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1326013666111976e-05, + "grad_norm": 28.543594360351562, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8813221454620361, + "num_tokens": 484741830.0, + "step": 12708 + }, + { + "epoch": 1.616715430606793, + "ewc_loss": 0.04250700771808624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1253503291518427e-05, + "grad_norm": 28.437955856323242, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8652430176734924, + "num_tokens": 484780140.0, + "step": 12709 + }, + { + "epoch": 1.6168426408853835, + "ewc_loss": 0.04257563501596451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1287816707626916e-05, + "grad_norm": 28.46930503845215, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8660650253295898, + "num_tokens": 484813856.0, + "step": 12710 + }, + { + "epoch": 
1.616969851163974, + "ewc_loss": 0.04251159727573395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1255798856145702e-05, + "grad_norm": 28.412391662597656, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.877423107624054, + "num_tokens": 484851383.0, + "step": 12711 + }, + { + "epoch": 1.6170970614425646, + "ewc_loss": 0.04257655888795853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1288278730935417e-05, + "grad_norm": 28.48041534423828, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8755336999893188, + "num_tokens": 484884361.0, + "step": 12712 + }, + { + "epoch": 1.6172242717211551, + "ewc_loss": 0.04257296398282051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1286481569404714e-05, + "grad_norm": 28.456287384033203, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8465859293937683, + "num_tokens": 484922106.0, + "step": 12713 + }, + { + "epoch": 1.6173514819997457, + "ewc_loss": 0.042477089911699295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.123854574165307e-05, + "grad_norm": 28.389116287231445, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8684508800506592, + "num_tokens": 484963100.0, + "step": 12714 + }, + { + "epoch": 1.6174786922783362, + "ewc_loss": 0.042565468698740005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.128273445123341e-05, + "grad_norm": 28.434072494506836, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8787577152252197, + "num_tokens": 485006896.0, + "step": 12715 + }, + { + "epoch": 1.6176059025569267, + "ewc_loss": 0.042455051094293594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.122752630384639e-05, + "grad_norm": 28.399473190307617, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8680078387260437, + "num_tokens": 485045968.0, + "step": 12716 + }, + { + "epoch": 1.6177331128355172, + "ewc_loss": 0.042494043707847595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1247022232273594e-05, + "grad_norm": 28.329702377319336, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8645367622375488, + "num_tokens": 485084736.0, + "step": 12717 + }, + { + "epoch": 1.6178603231141078, + "ewc_loss": 0.04243047535419464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1215237211436033e-05, + "grad_norm": 28.392627716064453, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8757381439208984, + "num_tokens": 485123130.0, + "step": 12718 + }, + { + "epoch": 1.6179875333926983, + "ewc_loss": 0.04253889620304108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.126944855262991e-05, + "grad_norm": 28.291217803955078, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8650147914886475, + "num_tokens": 485158072.0, + "step": 12719 + }, + { + "epoch": 1.6181147436712886, + "ewc_loss": 0.04247820004820824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.123910053342115e-05, + "grad_norm": 28.359657287597656, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8761941194534302, + "num_tokens": 485196783.0, + "step": 12720 + }, + { + "epoch": 1.6182419539498791, + "ewc_loss": 0.04251236468553543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.125618266290985e-05, + "grad_norm": 28.387338638305664, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8736376166343689, + "num_tokens": 485231433.0, + "step": 12721 + }, + { + "epoch": 1.6183691642284697, + "ewc_loss": 0.042528215795755386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12641079997411e-05, + "grad_norm": 28.37912368774414, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8657006621360779, + "num_tokens": 485269016.0, + "step": 12722 + }, + { + "epoch": 1.6184963745070602, + "ewc_loss": 0.042532265186309814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1266132534947246e-05, + "grad_norm": 28.424266815185547, + "learning_rate": 1e-06, + "loss": 0.4541, + 
"mean_token_accuracy": 0.856521725654602, + "num_tokens": 485308293.0, + "step": 12723 + }, + { + "epoch": 1.6186235847856507, + "ewc_loss": 0.04248196631669998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.124098318745382e-05, + "grad_norm": 28.389511108398438, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8769875764846802, + "num_tokens": 485340758.0, + "step": 12724 + }, + { + "epoch": 1.6187507950642412, + "ewc_loss": 0.042494140565395355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1247069525998086e-05, + "grad_norm": 28.449115753173828, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.882646918296814, + "num_tokens": 485384047.0, + "step": 12725 + }, + { + "epoch": 1.6188780053428315, + "ewc_loss": 0.04256578907370567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.128289452230092e-05, + "grad_norm": 28.437211990356445, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8707666993141174, + "num_tokens": 485416909.0, + "step": 12726 + }, + { + "epoch": 1.619005215621422, + "ewc_loss": 0.04254717007279396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1273584934533574e-05, + "grad_norm": 28.39026641845703, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8689806461334229, + "num_tokens": 485449532.0, + "step": 12727 + }, + { + "epoch": 1.6191324259000126, + "ewc_loss": 0.04244463890790939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1222318537184037e-05, + "grad_norm": 28.28744125366211, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8783189654350281, + "num_tokens": 485491171.0, + "step": 12728 + }, + { + "epoch": 1.6192596361786031, + "ewc_loss": 0.04264092445373535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1320462110452354e-05, + "grad_norm": 28.40061378479004, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8636850118637085, + "num_tokens": 485525511.0, + "step": 12729 + }, + { + "epoch": 
1.6193868464571937, + "ewc_loss": 0.04261460527777672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.13073017221177e-05, + "grad_norm": 28.389015197753906, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8640642166137695, + "num_tokens": 485562517.0, + "step": 12730 + }, + { + "epoch": 1.6195140567357842, + "ewc_loss": 0.04259437322616577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.129718632204458e-05, + "grad_norm": 28.37626075744629, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8685377836227417, + "num_tokens": 485601058.0, + "step": 12731 + }, + { + "epoch": 1.6196412670143747, + "ewc_loss": 0.04259052872657776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1295263650245033e-05, + "grad_norm": 28.4500732421875, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8812626600265503, + "num_tokens": 485634737.0, + "step": 12732 + }, + { + "epoch": 1.6197684772929652, + "ewc_loss": 0.04260478541254997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.130239226971753e-05, + "grad_norm": 28.429737091064453, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8715659379959106, + "num_tokens": 485671300.0, + "step": 12733 + }, + { + "epoch": 1.6198956875715558, + "ewc_loss": 0.04255065694451332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1275329345371574e-05, + "grad_norm": 28.327177047729492, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.876736581325531, + "num_tokens": 485708712.0, + "step": 12734 + }, + { + "epoch": 1.6200228978501463, + "ewc_loss": 0.04262206703424454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1311034288373776e-05, + "grad_norm": 28.429716110229492, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8801772594451904, + "num_tokens": 485746820.0, + "step": 12735 + }, + { + "epoch": 1.6201501081287368, + "ewc_loss": 0.042625971138477325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.131298606400378e-05, + "grad_norm": 28.346721649169922, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.868071436882019, + "num_tokens": 485785921.0, + "step": 12736 + }, + { + "epoch": 1.6202773184073274, + "ewc_loss": 0.04259312525391579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1296562408679165e-05, + "grad_norm": 28.450353622436523, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8690813779830933, + "num_tokens": 485825766.0, + "step": 12737 + }, + { + "epoch": 1.6204045286859179, + "ewc_loss": 0.04257962852716446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.128981395799201e-05, + "grad_norm": 28.32095718383789, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8629962205886841, + "num_tokens": 485862288.0, + "step": 12738 + }, + { + "epoch": 1.6205317389645084, + "ewc_loss": 0.042576391249895096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1288195057422854e-05, + "grad_norm": 28.40043067932129, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8614601492881775, + "num_tokens": 485903582.0, + "step": 12739 + }, + { + "epoch": 1.620658949243099, + "ewc_loss": 0.042658302932977676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1329151422833093e-05, + "grad_norm": 28.355518341064453, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8653761148452759, + "num_tokens": 485950931.0, + "step": 12740 + }, + { + "epoch": 1.6207861595216895, + "ewc_loss": 0.042584363371133804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1292182282195427e-05, + "grad_norm": 28.431835174560547, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.869918167591095, + "num_tokens": 485984185.0, + "step": 12741 + }, + { + "epoch": 1.62091336980028, + "ewc_loss": 0.042612943798303604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1306472262949683e-05, + "grad_norm": 28.402172088623047, + "learning_rate": 1e-06, + "loss": 0.4066, + 
"mean_token_accuracy": 0.8727133274078369, + "num_tokens": 486018796.0, + "step": 12742 + }, + { + "epoch": 1.6210405800788705, + "ewc_loss": 0.042550597339868546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1275298422551714e-05, + "grad_norm": 28.345600128173828, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8553350567817688, + "num_tokens": 486055328.0, + "step": 12743 + }, + { + "epoch": 1.6211677903574608, + "ewc_loss": 0.04258788377046585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1293941244948655e-05, + "grad_norm": 28.306095123291016, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8628003597259521, + "num_tokens": 486092340.0, + "step": 12744 + }, + { + "epoch": 1.6212950006360514, + "ewc_loss": 0.0425659641623497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.128298183379229e-05, + "grad_norm": 28.40067481994629, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8556967377662659, + "num_tokens": 486123195.0, + "step": 12745 + }, + { + "epoch": 1.621422210914642, + "ewc_loss": 0.04269127547740936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134563692379743e-05, + "grad_norm": 28.38037872314453, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8778958320617676, + "num_tokens": 486167145.0, + "step": 12746 + }, + { + "epoch": 1.6215494211932324, + "ewc_loss": 0.04254823178052902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.127411607943941e-05, + "grad_norm": 28.36478614807129, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8601442575454712, + "num_tokens": 486207925.0, + "step": 12747 + }, + { + "epoch": 1.621676631471823, + "ewc_loss": 0.04264381527900696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1321908207028173e-05, + "grad_norm": 28.454914093017578, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8734160661697388, + "num_tokens": 486245446.0, + "step": 12748 + }, + { + "epoch": 
1.6218038417504135, + "ewc_loss": 0.042610056698322296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1305027985363267e-05, + "grad_norm": 28.429962158203125, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.867770791053772, + "num_tokens": 486280461.0, + "step": 12749 + }, + { + "epoch": 1.6219310520290038, + "ewc_loss": 0.04261425882577896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1307128918124363e-05, + "grad_norm": 28.301759719848633, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8562576174736023, + "num_tokens": 486323926.0, + "step": 12750 + }, + { + "epoch": 1.6220582623075943, + "ewc_loss": 0.04254208505153656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1271041987347417e-05, + "grad_norm": 28.415863037109375, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8614765405654907, + "num_tokens": 486360016.0, + "step": 12751 + }, + { + "epoch": 1.6221854725861848, + "ewc_loss": 0.042644016444683075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1322008251445368e-05, + "grad_norm": 28.428836822509766, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8715147972106934, + "num_tokens": 486394994.0, + "step": 12752 + }, + { + "epoch": 1.6223126828647754, + "ewc_loss": 0.04266401380300522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.133200723619666e-05, + "grad_norm": 28.333024978637695, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.867235541343689, + "num_tokens": 486436909.0, + "step": 12753 + }, + { + "epoch": 1.622439893143366, + "ewc_loss": 0.04259732738137245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.129866334144026e-05, + "grad_norm": 28.521554946899414, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8738597631454468, + "num_tokens": 486473201.0, + "step": 12754 + }, + { + "epoch": 1.6225671034219564, + "ewc_loss": 0.04265280067920685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.132640111085493e-05, + "grad_norm": 28.25868797302246, + "learning_rate": 1e-06, + "loss": 0.4786, + "mean_token_accuracy": 0.8488914966583252, + "num_tokens": 486510733.0, + "step": 12755 + }, + { + "epoch": 1.622694313700547, + "ewc_loss": 0.04261540621519089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1307703718775883e-05, + "grad_norm": 28.429834365844727, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8838202357292175, + "num_tokens": 486546666.0, + "step": 12756 + }, + { + "epoch": 1.6228215239791375, + "ewc_loss": 0.04274837672710419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1374187781475484e-05, + "grad_norm": 28.393962860107422, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.849639892578125, + "num_tokens": 486585639.0, + "step": 12757 + }, + { + "epoch": 1.622948734257728, + "ewc_loss": 0.04258308187127113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1291540178935975e-05, + "grad_norm": 28.310739517211914, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8722684979438782, + "num_tokens": 486623028.0, + "step": 12758 + }, + { + "epoch": 1.6230759445363185, + "ewc_loss": 0.042742419987916946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.137121009582188e-05, + "grad_norm": 28.619543075561523, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8636259436607361, + "num_tokens": 486655906.0, + "step": 12759 + }, + { + "epoch": 1.623203154814909, + "ewc_loss": 0.04271653667092323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.135826798621565e-05, + "grad_norm": 28.349300384521484, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8648465871810913, + "num_tokens": 486689461.0, + "step": 12760 + }, + { + "epoch": 1.6233303650934996, + "ewc_loss": 0.04265948012471199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.132974077539984e-05, + "grad_norm": 28.483295440673828, + "learning_rate": 1e-06, + "loss": 0.481, + 
"mean_token_accuracy": 0.8520034551620483, + "num_tokens": 486728373.0, + "step": 12761 + }, + { + "epoch": 1.6234575753720901, + "ewc_loss": 0.04271822050213814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1359110178309493e-05, + "grad_norm": 28.2884578704834, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8775538802146912, + "num_tokens": 486769955.0, + "step": 12762 + }, + { + "epoch": 1.6235847856506807, + "ewc_loss": 0.04263332486152649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1316662241588347e-05, + "grad_norm": 28.350200653076172, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8715319633483887, + "num_tokens": 486806468.0, + "step": 12763 + }, + { + "epoch": 1.6237119959292712, + "ewc_loss": 0.042682673782110214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1341336832847446e-05, + "grad_norm": 28.429723739624023, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8562315702438354, + "num_tokens": 486846526.0, + "step": 12764 + }, + { + "epoch": 1.6238392062078617, + "ewc_loss": 0.04272737354040146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1363686755648814e-05, + "grad_norm": 28.339149475097656, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8852032423019409, + "num_tokens": 486886323.0, + "step": 12765 + }, + { + "epoch": 1.6239664164864522, + "ewc_loss": 0.04262939840555191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.131469955202192e-05, + "grad_norm": 28.336347579956055, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8504124879837036, + "num_tokens": 486918099.0, + "step": 12766 + }, + { + "epoch": 1.6240936267650428, + "ewc_loss": 0.04275231435894966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1376157746999525e-05, + "grad_norm": 28.421051025390625, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8818508982658386, + "num_tokens": 486953789.0, + "step": 12767 + }, + { + "epoch": 
1.6242208370436333, + "ewc_loss": 0.04271359741687775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1356798242777586e-05, + "grad_norm": 28.358797073364258, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8669878840446472, + "num_tokens": 486996165.0, + "step": 12768 + }, + { + "epoch": 1.6243480473222236, + "ewc_loss": 0.042704321444034576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1352161638787948e-05, + "grad_norm": 28.384634017944336, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8786171078681946, + "num_tokens": 487028249.0, + "step": 12769 + }, + { + "epoch": 1.6244752576008141, + "ewc_loss": 0.04275527969002724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1377640223363414e-05, + "grad_norm": 28.405025482177734, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8686812520027161, + "num_tokens": 487070099.0, + "step": 12770 + }, + { + "epoch": 1.6246024678794047, + "ewc_loss": 0.042760200798511505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1380101316026412e-05, + "grad_norm": 28.35297393798828, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8718141317367554, + "num_tokens": 487107117.0, + "step": 12771 + }, + { + "epoch": 1.6247296781579952, + "ewc_loss": 0.04270593822002411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1352969270083122e-05, + "grad_norm": 28.49569320678711, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8566105365753174, + "num_tokens": 487146024.0, + "step": 12772 + }, + { + "epoch": 1.6248568884365857, + "ewc_loss": 0.04274744540452957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1373722120188177e-05, + "grad_norm": 28.40407371520996, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8551715612411499, + "num_tokens": 487189864.0, + "step": 12773 + }, + { + "epoch": 1.6249840987151762, + "ewc_loss": 0.042622875422239304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1311438104021363e-05, + "grad_norm": 28.4091796875, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8711256980895996, + "num_tokens": 487230307.0, + "step": 12774 + }, + { + "epoch": 1.6251113089937665, + "ewc_loss": 0.04279477521777153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1397387172328308e-05, + "grad_norm": 28.503725051879883, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8712265491485596, + "num_tokens": 487261387.0, + "step": 12775 + }, + { + "epoch": 1.625238519272357, + "ewc_loss": 0.04268486425280571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134243186446838e-05, + "grad_norm": 28.4284725189209, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8563275933265686, + "num_tokens": 487298611.0, + "step": 12776 + }, + { + "epoch": 1.6253657295509476, + "ewc_loss": 0.04262532666325569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1312664102879353e-05, + "grad_norm": 28.330663681030273, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8677359223365784, + "num_tokens": 487338172.0, + "step": 12777 + }, + { + "epoch": 1.6254929398295381, + "ewc_loss": 0.04263070970773697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1315354388207197e-05, + "grad_norm": 28.29446792602539, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8696988821029663, + "num_tokens": 487375126.0, + "step": 12778 + }, + { + "epoch": 1.6256201501081287, + "ewc_loss": 0.04271623119711876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1358115191105753e-05, + "grad_norm": 28.453006744384766, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.878957211971283, + "num_tokens": 487416539.0, + "step": 12779 + }, + { + "epoch": 1.6257473603867192, + "ewc_loss": 0.042682282626628876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1341140381991863e-05, + "grad_norm": 28.313730239868164, + "learning_rate": 1e-06, + "loss": 0.4132, + 
"mean_token_accuracy": 0.8701896071434021, + "num_tokens": 487448429.0, + "step": 12780 + }, + { + "epoch": 1.6258745706653097, + "ewc_loss": 0.04267524182796478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1337620637496002e-05, + "grad_norm": 28.46869659423828, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8678516745567322, + "num_tokens": 487486054.0, + "step": 12781 + }, + { + "epoch": 1.6260017809439002, + "ewc_loss": 0.04271191358566284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1355956050683744e-05, + "grad_norm": 28.314517974853516, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8741642236709595, + "num_tokens": 487523771.0, + "step": 12782 + }, + { + "epoch": 1.6261289912224908, + "ewc_loss": 0.042645130306482315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1322564862202853e-05, + "grad_norm": 28.427358627319336, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8737244606018066, + "num_tokens": 487561334.0, + "step": 12783 + }, + { + "epoch": 1.6262562015010813, + "ewc_loss": 0.042668260633945465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1334129996830598e-05, + "grad_norm": 28.315258026123047, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8691623210906982, + "num_tokens": 487606382.0, + "step": 12784 + }, + { + "epoch": 1.6263834117796718, + "ewc_loss": 0.04269733652472496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134866917913314e-05, + "grad_norm": 28.380210876464844, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8640556931495667, + "num_tokens": 487647640.0, + "step": 12785 + }, + { + "epoch": 1.6265106220582624, + "ewc_loss": 0.04268788546323776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1343943444662727e-05, + "grad_norm": 28.488079071044922, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8655797243118286, + "num_tokens": 487682604.0, + "step": 12786 + }, + { + "epoch": 
1.6266378323368529, + "ewc_loss": 0.042667992413043976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1333995391614735e-05, + "grad_norm": 28.47246742248535, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8708381652832031, + "num_tokens": 487718271.0, + "step": 12787 + }, + { + "epoch": 1.6267650426154434, + "ewc_loss": 0.04263646900653839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.131823384843301e-05, + "grad_norm": 28.33700180053711, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8555234670639038, + "num_tokens": 487752872.0, + "step": 12788 + }, + { + "epoch": 1.626892252894034, + "ewc_loss": 0.04267815127968788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.133907582901884e-05, + "grad_norm": 28.36644744873047, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.858084499835968, + "num_tokens": 487798309.0, + "step": 12789 + }, + { + "epoch": 1.6270194631726245, + "ewc_loss": 0.04268540441989899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134270289388951e-05, + "grad_norm": 28.284791946411133, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8561986088752747, + "num_tokens": 487834052.0, + "step": 12790 + }, + { + "epoch": 1.627146673451215, + "ewc_loss": 0.042688049376010895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1344025299185887e-05, + "grad_norm": 28.40018653869629, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.856564998626709, + "num_tokens": 487868662.0, + "step": 12791 + }, + { + "epoch": 1.6272738837298055, + "ewc_loss": 0.04269720986485481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1348605514504015e-05, + "grad_norm": 28.24777603149414, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8652937412261963, + "num_tokens": 487907425.0, + "step": 12792 + }, + { + "epoch": 1.6274010940083958, + "ewc_loss": 0.04263663291931152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.131831570295617e-05, + "grad_norm": 28.391847610473633, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.875360906124115, + "num_tokens": 487942979.0, + "step": 12793 + }, + { + "epoch": 1.6275283042869864, + "ewc_loss": 0.04275278374552727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1376392396632582e-05, + "grad_norm": 28.36031723022461, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8721101880073547, + "num_tokens": 487982402.0, + "step": 12794 + }, + { + "epoch": 1.6276555145655769, + "ewc_loss": 0.042658206075429916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1329102310119197e-05, + "grad_norm": 28.39328384399414, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.875368058681488, + "num_tokens": 488020071.0, + "step": 12795 + }, + { + "epoch": 1.6277827248441674, + "ewc_loss": 0.04268550127744675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1342750187614e-05, + "grad_norm": 28.250741958618164, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8548046350479126, + "num_tokens": 488057457.0, + "step": 12796 + }, + { + "epoch": 1.627909935122758, + "ewc_loss": 0.042677830904722214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1338915757951327e-05, + "grad_norm": 28.37982940673828, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8644697070121765, + "num_tokens": 488094002.0, + "step": 12797 + }, + { + "epoch": 1.6280371454013485, + "ewc_loss": 0.04276404529809952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1382022168836556e-05, + "grad_norm": 28.435470581054688, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8528010845184326, + "num_tokens": 488134995.0, + "step": 12798 + }, + { + "epoch": 1.6281643556799388, + "ewc_loss": 0.04268665611743927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134332862624433e-05, + "grad_norm": 28.44626235961914, + "learning_rate": 1e-06, + "loss": 0.412, + 
"mean_token_accuracy": 0.87126624584198, + "num_tokens": 488169942.0, + "step": 12799 + }, + { + "epoch": 1.6282915659585293, + "ewc_loss": 0.04271592199802399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1357960577006452e-05, + "grad_norm": 28.3269100189209, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8697457313537598, + "num_tokens": 488210852.0, + "step": 12800 + }, + { + "epoch": 1.6284187762371198, + "ewc_loss": 0.042681120336055756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1340560124372132e-05, + "grad_norm": 28.45221710205078, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8709179759025574, + "num_tokens": 488253067.0, + "step": 12801 + }, + { + "epoch": 1.6285459865157104, + "ewc_loss": 0.0426999069750309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1349953385652043e-05, + "grad_norm": 28.2938175201416, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8763823509216309, + "num_tokens": 488293498.0, + "step": 12802 + }, + { + "epoch": 1.628673196794301, + "ewc_loss": 0.04268398508429527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1341991669032723e-05, + "grad_norm": 28.44466209411621, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8699076175689697, + "num_tokens": 488333931.0, + "step": 12803 + }, + { + "epoch": 1.6288004070728914, + "ewc_loss": 0.04264049232006073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1320245650713332e-05, + "grad_norm": 28.286283493041992, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.872803270816803, + "num_tokens": 488371724.0, + "step": 12804 + }, + { + "epoch": 1.628927617351482, + "ewc_loss": 0.04278510436415672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1392552298493683e-05, + "grad_norm": 28.433170318603516, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8627106547355652, + "num_tokens": 488415171.0, + "step": 12805 + }, + { + "epoch": 
1.6290548276300725, + "ewc_loss": 0.042674701660871506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1337351427064277e-05, + "grad_norm": 28.337162017822266, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8617900609970093, + "num_tokens": 488451672.0, + "step": 12806 + }, + { + "epoch": 1.629182037908663, + "ewc_loss": 0.04274383559823036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.137191768269986e-05, + "grad_norm": 28.40586280822754, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8818262815475464, + "num_tokens": 488492657.0, + "step": 12807 + }, + { + "epoch": 1.6293092481872535, + "ewc_loss": 0.04271994158625603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.135997056029737e-05, + "grad_norm": 28.397998809814453, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8748749494552612, + "num_tokens": 488530681.0, + "step": 12808 + }, + { + "epoch": 1.629436458465844, + "ewc_loss": 0.04265688732266426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1328443835955113e-05, + "grad_norm": 28.41318130493164, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8784386515617371, + "num_tokens": 488560542.0, + "step": 12809 + }, + { + "epoch": 1.6295636687444346, + "ewc_loss": 0.042678456753492355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1339228624128737e-05, + "grad_norm": 28.383638381958008, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8695892095565796, + "num_tokens": 488597491.0, + "step": 12810 + }, + { + "epoch": 1.6296908790230251, + "ewc_loss": 0.04269179701805115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134589885827154e-05, + "grad_norm": 28.393640518188477, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.875556468963623, + "num_tokens": 488634556.0, + "step": 12811 + }, + { + "epoch": 1.6298180893016156, + "ewc_loss": 0.042738404124975204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1369201931520365e-05, + "grad_norm": 28.498554229736328, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8645109534263611, + "num_tokens": 488668174.0, + "step": 12812 + }, + { + "epoch": 1.6299452995802062, + "ewc_loss": 0.04275747016072273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.137873525498435e-05, + "grad_norm": 28.43174171447754, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8710222244262695, + "num_tokens": 488705994.0, + "step": 12813 + }, + { + "epoch": 1.6300725098587967, + "ewc_loss": 0.04259059205651283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1295296392054297e-05, + "grad_norm": 28.254594802856445, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8569369316101074, + "num_tokens": 488746908.0, + "step": 12814 + }, + { + "epoch": 1.6301997201373872, + "ewc_loss": 0.042765676975250244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.138283889507875e-05, + "grad_norm": 28.499774932861328, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8684641122817993, + "num_tokens": 488785607.0, + "step": 12815 + }, + { + "epoch": 1.6303269304159778, + "ewc_loss": 0.04275861755013466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1379308236646466e-05, + "grad_norm": 28.36586570739746, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8654384613037109, + "num_tokens": 488823831.0, + "step": 12816 + }, + { + "epoch": 1.630454140694568, + "ewc_loss": 0.042676012963056564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1338006263249554e-05, + "grad_norm": 28.466957092285156, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8754153847694397, + "num_tokens": 488859944.0, + "step": 12817 + }, + { + "epoch": 1.6305813509731586, + "ewc_loss": 0.042749542742967606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1374771677074023e-05, + "grad_norm": 28.44736099243164, + "learning_rate": 1e-06, + "loss": 0.4721, + 
"mean_token_accuracy": 0.8526895046234131, + "num_tokens": 488903764.0, + "step": 12818 + }, + { + "epoch": 1.6307085612517491, + "ewc_loss": 0.042640309780836105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1320154701243155e-05, + "grad_norm": 28.455829620361328, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8582931756973267, + "num_tokens": 488946890.0, + "step": 12819 + }, + { + "epoch": 1.6308357715303397, + "ewc_loss": 0.0426904670894146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134523310814984e-05, + "grad_norm": 28.363079071044922, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8657735586166382, + "num_tokens": 488981901.0, + "step": 12820 + }, + { + "epoch": 1.6309629818089302, + "ewc_loss": 0.042687393724918365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1343696062103845e-05, + "grad_norm": 28.453828811645508, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8702425360679626, + "num_tokens": 489024180.0, + "step": 12821 + }, + { + "epoch": 1.6310901920875207, + "ewc_loss": 0.042796265333890915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1398132957983762e-05, + "grad_norm": 28.521461486816406, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.872109055519104, + "num_tokens": 489070971.0, + "step": 12822 + }, + { + "epoch": 1.6312174023661112, + "ewc_loss": 0.04263187572360039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1315938283805735e-05, + "grad_norm": 28.42914390563965, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8780505657196045, + "num_tokens": 489111636.0, + "step": 12823 + }, + { + "epoch": 1.6313446126447015, + "ewc_loss": 0.04262658208608627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1313291654223576e-05, + "grad_norm": 28.457691192626953, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.866631031036377, + "num_tokens": 489148677.0, + "step": 12824 + }, + { + "epoch": 
1.631471822923292, + "ewc_loss": 0.04269006475806236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134503301931545e-05, + "grad_norm": 28.43939971923828, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8561376929283142, + "num_tokens": 489186222.0, + "step": 12825 + }, + { + "epoch": 1.6315990332018826, + "ewc_loss": 0.04255364090204239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1276820916682482e-05, + "grad_norm": 28.5266056060791, + "learning_rate": 1e-06, + "loss": 0.5023, + "mean_token_accuracy": 0.8421198129653931, + "num_tokens": 489223173.0, + "step": 12826 + }, + { + "epoch": 1.6317262434804731, + "ewc_loss": 0.042689427733421326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134471469616983e-05, + "grad_norm": 28.483844757080078, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8688793182373047, + "num_tokens": 489259236.0, + "step": 12827 + }, + { + "epoch": 1.6318534537590637, + "ewc_loss": 0.04255754128098488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1278770873323083e-05, + "grad_norm": 28.453536987304688, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8607749342918396, + "num_tokens": 489298449.0, + "step": 12828 + }, + { + "epoch": 1.6319806640376542, + "ewc_loss": 0.042633768171072006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.131688415829558e-05, + "grad_norm": 28.493404388427734, + "learning_rate": 1e-06, + "loss": 0.4954, + "mean_token_accuracy": 0.8467128276824951, + "num_tokens": 489334800.0, + "step": 12829 + }, + { + "epoch": 1.6321078743162447, + "ewc_loss": 0.04266601800918579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1333009499358013e-05, + "grad_norm": 28.34518814086914, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8737295269966125, + "num_tokens": 489378510.0, + "step": 12830 + }, + { + "epoch": 1.6322350845948352, + "ewc_loss": 0.0426916740834713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.134583701263182e-05, + "grad_norm": 28.48219871520996, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8553280234336853, + "num_tokens": 489419267.0, + "step": 12831 + }, + { + "epoch": 1.6323622948734258, + "ewc_loss": 0.04274466633796692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1372332412283868e-05, + "grad_norm": 28.45380973815918, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8710798621177673, + "num_tokens": 489454772.0, + "step": 12832 + }, + { + "epoch": 1.6324895051520163, + "ewc_loss": 0.04260160028934479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1300800653989427e-05, + "grad_norm": 28.496967315673828, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8624424934387207, + "num_tokens": 489490146.0, + "step": 12833 + }, + { + "epoch": 1.6326167154306068, + "ewc_loss": 0.04267343878746033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1336720237741247e-05, + "grad_norm": 28.45577621459961, + "learning_rate": 1e-06, + "loss": 0.5233, + "mean_token_accuracy": 0.8383991718292236, + "num_tokens": 489532754.0, + "step": 12834 + }, + { + "epoch": 1.6327439257091974, + "ewc_loss": 0.04262786731123924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1313933757483028e-05, + "grad_norm": 28.47260284423828, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8660666942596436, + "num_tokens": 489570378.0, + "step": 12835 + }, + { + "epoch": 1.6328711359877879, + "ewc_loss": 0.042669087648391724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1334544726414606e-05, + "grad_norm": 28.393329620361328, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8593475222587585, + "num_tokens": 489604562.0, + "step": 12836 + }, + { + "epoch": 1.6329983462663784, + "ewc_loss": 0.04266543686389923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1332718461053446e-05, + "grad_norm": 28.496177673339844, + "learning_rate": 1e-06, + "loss": 0.4063, + 
"mean_token_accuracy": 0.8750858902931213, + "num_tokens": 489644060.0, + "step": 12837 + }, + { + "epoch": 1.633125556544969, + "ewc_loss": 0.04273432120680809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1367161025409587e-05, + "grad_norm": 28.54303741455078, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.868889570236206, + "num_tokens": 489682693.0, + "step": 12838 + }, + { + "epoch": 1.6332527668235595, + "ewc_loss": 0.04257099702954292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.12854993151268e-05, + "grad_norm": 28.552751541137695, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8753401637077332, + "num_tokens": 489720786.0, + "step": 12839 + }, + { + "epoch": 1.63337997710215, + "ewc_loss": 0.04266837239265442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1334186385502107e-05, + "grad_norm": 28.37939453125, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8725775480270386, + "num_tokens": 489758773.0, + "step": 12840 + }, + { + "epoch": 1.6335071873807405, + "ewc_loss": 0.04265637323260307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1328187358449213e-05, + "grad_norm": 28.45321273803711, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8678038716316223, + "num_tokens": 489793436.0, + "step": 12841 + }, + { + "epoch": 1.6336343976593308, + "ewc_loss": 0.04267968237400055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.133984162355773e-05, + "grad_norm": 28.502429962158203, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8862491846084595, + "num_tokens": 489828634.0, + "step": 12842 + }, + { + "epoch": 1.6337616079379214, + "ewc_loss": 0.04263976961374283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.131988549081143e-05, + "grad_norm": 28.352703094482422, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8714264035224915, + "num_tokens": 489866335.0, + "step": 12843 + }, + { + "epoch": 
1.6338888182165119, + "ewc_loss": 0.04256472736597061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1282363377395086e-05, + "grad_norm": 28.288488388061523, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.876593828201294, + "num_tokens": 489903059.0, + "step": 12844 + }, + { + "epoch": 1.6340160284951024, + "ewc_loss": 0.04273436591029167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.136718285328243e-05, + "grad_norm": 28.4468994140625, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8747597932815552, + "num_tokens": 489940186.0, + "step": 12845 + }, + { + "epoch": 1.634143238773693, + "ewc_loss": 0.04269156605005264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1345782442949712e-05, + "grad_norm": 28.368553161621094, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8800835013389587, + "num_tokens": 489984732.0, + "step": 12846 + }, + { + "epoch": 1.6342704490522835, + "ewc_loss": 0.04262993857264519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1314968762453645e-05, + "grad_norm": 28.380020141601562, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.869261622428894, + "num_tokens": 490016699.0, + "step": 12847 + }, + { + "epoch": 1.6343976593308738, + "ewc_loss": 0.0427422821521759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1371140974224545e-05, + "grad_norm": 28.435487747192383, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8601679801940918, + "num_tokens": 490051351.0, + "step": 12848 + }, + { + "epoch": 1.6345248696094643, + "ewc_loss": 0.04275988042354584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1379939425969496e-05, + "grad_norm": 28.474946975708008, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8761286735534668, + "num_tokens": 490089668.0, + "step": 12849 + }, + { + "epoch": 1.6346520798880548, + "ewc_loss": 0.04278213903307915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1391069822129793e-05, + "grad_norm": 28.435476303100586, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8687986135482788, + "num_tokens": 490129651.0, + "step": 12850 + }, + { + "epoch": 1.6347792901666454, + "ewc_loss": 0.04269002377986908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134501119144261e-05, + "grad_norm": 28.43390655517578, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8614039421081543, + "num_tokens": 490168441.0, + "step": 12851 + }, + { + "epoch": 1.6349065004452359, + "ewc_loss": 0.04279002919793129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1395015210146084e-05, + "grad_norm": 28.481319427490234, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8773626685142517, + "num_tokens": 490204754.0, + "step": 12852 + }, + { + "epoch": 1.6350337107238264, + "ewc_loss": 0.04273246228694916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1366231521824375e-05, + "grad_norm": 28.374345779418945, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8828119039535522, + "num_tokens": 490244178.0, + "step": 12853 + }, + { + "epoch": 1.635160921002417, + "ewc_loss": 0.042735565453767776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.13677831197856e-05, + "grad_norm": 28.384174346923828, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8639954924583435, + "num_tokens": 490281161.0, + "step": 12854 + }, + { + "epoch": 1.6352881312810075, + "ewc_loss": 0.042814239859580994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1407120584626682e-05, + "grad_norm": 28.443334579467773, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8692967891693115, + "num_tokens": 490314892.0, + "step": 12855 + }, + { + "epoch": 1.635415341559598, + "ewc_loss": 0.04277443140745163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.138721538358368e-05, + "grad_norm": 28.522022247314453, + "learning_rate": 1e-06, + "loss": 0.4617, + 
"mean_token_accuracy": 0.8563969135284424, + "num_tokens": 490352600.0, + "step": 12856 + }, + { + "epoch": 1.6355425518381885, + "ewc_loss": 0.04280262812972069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1401314370450564e-05, + "grad_norm": 28.44979476928711, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.879499077796936, + "num_tokens": 490384487.0, + "step": 12857 + }, + { + "epoch": 1.635669762116779, + "ewc_loss": 0.04275006800889969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1375033611548133e-05, + "grad_norm": 28.489643096923828, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8472310900688171, + "num_tokens": 490422199.0, + "step": 12858 + }, + { + "epoch": 1.6357969723953696, + "ewc_loss": 0.042773615568876266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1386807929957286e-05, + "grad_norm": 28.52716636657715, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8602526783943176, + "num_tokens": 490458058.0, + "step": 12859 + }, + { + "epoch": 1.6359241826739601, + "ewc_loss": 0.042699359357357025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.134968053724151e-05, + "grad_norm": 28.4314022064209, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8792120218276978, + "num_tokens": 490498447.0, + "step": 12860 + }, + { + "epoch": 1.6360513929525506, + "ewc_loss": 0.04273073375225067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.136536750185769e-05, + "grad_norm": 28.311588287353516, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8554170727729797, + "num_tokens": 490538502.0, + "step": 12861 + }, + { + "epoch": 1.6361786032311412, + "ewc_loss": 0.04278823733329773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1394118448370136e-05, + "grad_norm": 28.367393493652344, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8710821866989136, + "num_tokens": 490575752.0, + "step": 12862 + }, + { + "epoch": 
1.6363058135097317, + "ewc_loss": 0.04281071573495865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1405357983894646e-05, + "grad_norm": 28.354612350463867, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8671584129333496, + "num_tokens": 490613502.0, + "step": 12863 + }, + { + "epoch": 1.6364330237883222, + "ewc_loss": 0.04280327633023262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1401638150564395e-05, + "grad_norm": 28.348060607910156, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8625211715698242, + "num_tokens": 490653820.0, + "step": 12864 + }, + { + "epoch": 1.6365602340669128, + "ewc_loss": 0.04277196526527405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1385982108768076e-05, + "grad_norm": 28.252023696899414, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.8519375324249268, + "num_tokens": 490693332.0, + "step": 12865 + }, + { + "epoch": 1.636687444345503, + "ewc_loss": 0.04286332428455353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1431662389659323e-05, + "grad_norm": 28.45899200439453, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8654503226280212, + "num_tokens": 490733135.0, + "step": 12866 + }, + { + "epoch": 1.6368146546240936, + "ewc_loss": 0.042985815554857254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1492907762876712e-05, + "grad_norm": 28.5362491607666, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.871323823928833, + "num_tokens": 490775505.0, + "step": 12867 + }, + { + "epoch": 1.6369418649026841, + "ewc_loss": 0.04287954792380333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1439773263409734e-05, + "grad_norm": 28.440231323242188, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8670364618301392, + "num_tokens": 490817783.0, + "step": 12868 + }, + { + "epoch": 1.6370690751812746, + "ewc_loss": 0.04282841831445694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.14142091863323e-05, + "grad_norm": 28.41511344909668, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8628848195075989, + "num_tokens": 490849464.0, + "step": 12869 + }, + { + "epoch": 1.6371962854598652, + "ewc_loss": 0.0428258441388607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.141292134183459e-05, + "grad_norm": 28.454681396484375, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8616266250610352, + "num_tokens": 490891925.0, + "step": 12870 + }, + { + "epoch": 1.6373234957384557, + "ewc_loss": 0.042853519320487976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1426760213216767e-05, + "grad_norm": 28.532255172729492, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8610506057739258, + "num_tokens": 490926221.0, + "step": 12871 + }, + { + "epoch": 1.6374507060170462, + "ewc_loss": 0.04284479096531868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1422394638648257e-05, + "grad_norm": 28.343502044677734, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8585156798362732, + "num_tokens": 490964141.0, + "step": 12872 + }, + { + "epoch": 1.6375779162956365, + "ewc_loss": 0.04282251000404358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1411255147540942e-05, + "grad_norm": 28.54078483581543, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8642270565032959, + "num_tokens": 490997917.0, + "step": 12873 + }, + { + "epoch": 1.637705126574227, + "ewc_loss": 0.04283171519637108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1415857190731913e-05, + "grad_norm": 28.32122802734375, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8817330002784729, + "num_tokens": 491029244.0, + "step": 12874 + }, + { + "epoch": 1.6378323368528176, + "ewc_loss": 0.042835086584091187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1417543393909e-05, + "grad_norm": 28.531251907348633, + "learning_rate": 1e-06, + "loss": 0.4225, + 
"mean_token_accuracy": 0.8669692277908325, + "num_tokens": 491066591.0, + "step": 12875 + }, + { + "epoch": 1.6379595471314081, + "ewc_loss": 0.04292035102844238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1460175048559904e-05, + "grad_norm": 28.358312606811523, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8662391901016235, + "num_tokens": 491106546.0, + "step": 12876 + }, + { + "epoch": 1.6380867574099987, + "ewc_loss": 0.04280173033475876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1400865080067888e-05, + "grad_norm": 28.451135635375977, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8600817918777466, + "num_tokens": 491145512.0, + "step": 12877 + }, + { + "epoch": 1.6382139676885892, + "ewc_loss": 0.04299456998705864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1497284251381643e-05, + "grad_norm": 28.58591651916504, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8657585382461548, + "num_tokens": 491185624.0, + "step": 12878 + }, + { + "epoch": 1.6383411779671797, + "ewc_loss": 0.04284810274839401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1424051737994887e-05, + "grad_norm": 28.536270141601562, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8606817126274109, + "num_tokens": 491220982.0, + "step": 12879 + }, + { + "epoch": 1.6384683882457702, + "ewc_loss": 0.042812034487724304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.140601645805873e-05, + "grad_norm": 28.508827209472656, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8691153526306152, + "num_tokens": 491262501.0, + "step": 12880 + }, + { + "epoch": 1.6385955985243608, + "ewc_loss": 0.04281623661518097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1408117390819825e-05, + "grad_norm": 28.509441375732422, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8776322603225708, + "num_tokens": 491298078.0, + "step": 12881 + }, + { + "epoch": 
1.6387228088029513, + "ewc_loss": 0.0427529513835907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1376476070145145e-05, + "grad_norm": 28.424182891845703, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8743425011634827, + "num_tokens": 491336203.0, + "step": 12882 + }, + { + "epoch": 1.6388500190815418, + "ewc_loss": 0.04278557375073433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.139278694812674e-05, + "grad_norm": 28.44774055480957, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8555471301078796, + "num_tokens": 491374723.0, + "step": 12883 + }, + { + "epoch": 1.6389772293601323, + "ewc_loss": 0.042815595865249634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.14077972486848e-05, + "grad_norm": 28.403091430664062, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8690788149833679, + "num_tokens": 491410546.0, + "step": 12884 + }, + { + "epoch": 1.6391044396387229, + "ewc_loss": 0.0428287535905838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1414376533357427e-05, + "grad_norm": 28.542316436767578, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8631356358528137, + "num_tokens": 491450642.0, + "step": 12885 + }, + { + "epoch": 1.6392316499173134, + "ewc_loss": 0.04287005215883255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143502570106648e-05, + "grad_norm": 28.467477798461914, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8648260831832886, + "num_tokens": 491489893.0, + "step": 12886 + }, + { + "epoch": 1.639358860195904, + "ewc_loss": 0.04276734218001366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1383671992225572e-05, + "grad_norm": 28.45648956298828, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8575661778450012, + "num_tokens": 491535870.0, + "step": 12887 + }, + { + "epoch": 1.6394860704744945, + "ewc_loss": 0.04281122237443924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.140561082342174e-05, + "grad_norm": 28.555763244628906, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8655644655227661, + "num_tokens": 491575988.0, + "step": 12888 + }, + { + "epoch": 1.639613280753085, + "ewc_loss": 0.04280220717191696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1401103367679752e-05, + "grad_norm": 28.50795555114746, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8692176342010498, + "num_tokens": 491616822.0, + "step": 12889 + }, + { + "epoch": 1.6397404910316755, + "ewc_loss": 0.04282430186867714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1412150090327486e-05, + "grad_norm": 28.497344970703125, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8880902528762817, + "num_tokens": 491654132.0, + "step": 12890 + }, + { + "epoch": 1.6398677013102658, + "ewc_loss": 0.042700354009866714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.135017712134868e-05, + "grad_norm": 28.553274154663086, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8543962836265564, + "num_tokens": 491697215.0, + "step": 12891 + }, + { + "epoch": 1.6399949115888564, + "ewc_loss": 0.0427144430577755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1357222067308612e-05, + "grad_norm": 28.5487003326416, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8456149101257324, + "num_tokens": 491731338.0, + "step": 12892 + }, + { + "epoch": 1.6401221218674469, + "ewc_loss": 0.042739175260066986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1369587557273917e-05, + "grad_norm": 28.446399688720703, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8655475974082947, + "num_tokens": 491770808.0, + "step": 12893 + }, + { + "epoch": 1.6402493321460374, + "ewc_loss": 0.04279559105634689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1397794625954702e-05, + "grad_norm": 28.534000396728516, + "learning_rate": 1e-06, + "loss": 0.3946, + 
"mean_token_accuracy": 0.8749028444290161, + "num_tokens": 491804167.0, + "step": 12894 + }, + { + "epoch": 1.640376542424628, + "ewc_loss": 0.042719606310129166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1359803213272244e-05, + "grad_norm": 28.35887908935547, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8689769506454468, + "num_tokens": 491840094.0, + "step": 12895 + }, + { + "epoch": 1.6405037527032185, + "ewc_loss": 0.042683910578489304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1341955289244652e-05, + "grad_norm": 28.4322452545166, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8537126779556274, + "num_tokens": 491881643.0, + "step": 12896 + }, + { + "epoch": 1.6406309629818088, + "ewc_loss": 0.04276729002594948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.138364470738452e-05, + "grad_norm": 28.46868133544922, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8549362421035767, + "num_tokens": 491920951.0, + "step": 12897 + }, + { + "epoch": 1.6407581732603993, + "ewc_loss": 0.042718805372714996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1359403035603464e-05, + "grad_norm": 28.38776206970215, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8630653619766235, + "num_tokens": 491954599.0, + "step": 12898 + }, + { + "epoch": 1.6408853835389898, + "ewc_loss": 0.04275258630514145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1376292352215387e-05, + "grad_norm": 28.48011589050293, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8689577579498291, + "num_tokens": 491995375.0, + "step": 12899 + }, + { + "epoch": 1.6410125938175804, + "ewc_loss": 0.04281486198306084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.140743163181469e-05, + "grad_norm": 28.358013153076172, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8625258207321167, + "num_tokens": 492035297.0, + "step": 12900 + }, + { + "epoch": 
1.6411398040961709, + "ewc_loss": 0.0426640585064888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1332029064069502e-05, + "grad_norm": 28.450239181518555, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.86733478307724, + "num_tokens": 492071003.0, + "step": 12901 + }, + { + "epoch": 1.6412670143747614, + "ewc_loss": 0.042768895626068115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1384446881711483e-05, + "grad_norm": 28.340551376342773, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8661619424819946, + "num_tokens": 492106046.0, + "step": 12902 + }, + { + "epoch": 1.641394224653352, + "ewc_loss": 0.042762186378240585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1381092665251344e-05, + "grad_norm": 28.358400344848633, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8784887790679932, + "num_tokens": 492147252.0, + "step": 12903 + }, + { + "epoch": 1.6415214349319425, + "ewc_loss": 0.04283479228615761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1417396055767313e-05, + "grad_norm": 28.572420120239258, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8578140139579773, + "num_tokens": 492188185.0, + "step": 12904 + }, + { + "epoch": 1.641648645210533, + "ewc_loss": 0.042814336717128754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1407167878351174e-05, + "grad_norm": 28.53375244140625, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8726346492767334, + "num_tokens": 492219630.0, + "step": 12905 + }, + { + "epoch": 1.6417758554891235, + "ewc_loss": 0.042786166071891785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.139308344339952e-05, + "grad_norm": 28.3601016998291, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8685807585716248, + "num_tokens": 492251955.0, + "step": 12906 + }, + { + "epoch": 1.641903065767714, + "ewc_loss": 0.04281550273299217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1407751773949713e-05, + "grad_norm": 28.49766731262207, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8694190382957458, + "num_tokens": 492293484.0, + "step": 12907 + }, + { + "epoch": 1.6420302760463046, + "ewc_loss": 0.04282902181148529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.141451113857329e-05, + "grad_norm": 28.33791732788086, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8534864187240601, + "num_tokens": 492331169.0, + "step": 12908 + }, + { + "epoch": 1.642157486324895, + "ewc_loss": 0.04285689815878868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1428448235383257e-05, + "grad_norm": 28.560977935791016, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8746957778930664, + "num_tokens": 492366071.0, + "step": 12909 + }, + { + "epoch": 1.6422846966034856, + "ewc_loss": 0.042777203023433685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1388601453509182e-05, + "grad_norm": 28.406497955322266, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8871350884437561, + "num_tokens": 492396820.0, + "step": 12910 + }, + { + "epoch": 1.6424119068820762, + "ewc_loss": 0.042864229530096054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1432115318020806e-05, + "grad_norm": 28.549882888793945, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8741666078567505, + "num_tokens": 492435282.0, + "step": 12911 + }, + { + "epoch": 1.6425391171606667, + "ewc_loss": 0.04285305738449097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1426529201562516e-05, + "grad_norm": 28.449796676635742, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8749895691871643, + "num_tokens": 492470195.0, + "step": 12912 + }, + { + "epoch": 1.6426663274392572, + "ewc_loss": 0.042794860899448395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1397430828073993e-05, + "grad_norm": 28.477216720581055, + "learning_rate": 1e-06, + "loss": 0.4496, + 
"mean_token_accuracy": 0.861337423324585, + "num_tokens": 492514265.0, + "step": 12913 + }, + { + "epoch": 1.6427935377178478, + "ewc_loss": 0.042850200086832047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.142509947589133e-05, + "grad_norm": 28.450897216796875, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.869763195514679, + "num_tokens": 492550348.0, + "step": 12914 + }, + { + "epoch": 1.642920747996438, + "ewc_loss": 0.04278786852955818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1393934730440378e-05, + "grad_norm": 28.497188568115234, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8697863817214966, + "num_tokens": 492590648.0, + "step": 12915 + }, + { + "epoch": 1.6430479582750286, + "ewc_loss": 0.04293007031083107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.146503538824618e-05, + "grad_norm": 28.572023391723633, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8697818517684937, + "num_tokens": 492629523.0, + "step": 12916 + }, + { + "epoch": 1.6431751685536191, + "ewc_loss": 0.04277291148900986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1386455046012998e-05, + "grad_norm": 28.43399429321289, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8626108169555664, + "num_tokens": 492668392.0, + "step": 12917 + }, + { + "epoch": 1.6433023788322096, + "ewc_loss": 0.04274608567357063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1373043637140654e-05, + "grad_norm": 28.441919326782227, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8562067747116089, + "num_tokens": 492696073.0, + "step": 12918 + }, + { + "epoch": 1.6434295891108002, + "ewc_loss": 0.04283180460333824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1415902665467e-05, + "grad_norm": 28.455400466918945, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8696263432502747, + "num_tokens": 492728590.0, + "step": 12919 + }, + { + "epoch": 
1.6435567993893907, + "ewc_loss": 0.04280601441860199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1403007849585265e-05, + "grad_norm": 28.441938400268555, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8603920340538025, + "num_tokens": 492767290.0, + "step": 12920 + }, + { + "epoch": 1.6436840096679812, + "ewc_loss": 0.042802631855010986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1401316189439967e-05, + "grad_norm": 28.406749725341797, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8782437443733215, + "num_tokens": 492801019.0, + "step": 12921 + }, + { + "epoch": 1.6438112199465715, + "ewc_loss": 0.042760878801345825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.138043964805547e-05, + "grad_norm": 28.349050521850586, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8669198155403137, + "num_tokens": 492841220.0, + "step": 12922 + }, + { + "epoch": 1.643938430225162, + "ewc_loss": 0.04286924749612808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1434623704408295e-05, + "grad_norm": 28.39844512939453, + "learning_rate": 1e-06, + "loss": 0.4686, + "mean_token_accuracy": 0.8576270341873169, + "num_tokens": 492885076.0, + "step": 12923 + }, + { + "epoch": 1.6440656405037526, + "ewc_loss": 0.0428752563893795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1437628674902953e-05, + "grad_norm": 28.385128021240234, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8623366951942444, + "num_tokens": 492918554.0, + "step": 12924 + }, + { + "epoch": 1.6441928507823431, + "ewc_loss": 0.042910538613796234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.145526923413854e-05, + "grad_norm": 28.45697784423828, + "learning_rate": 1e-06, + "loss": 0.3785, + "mean_token_accuracy": 0.8826711773872375, + "num_tokens": 492961890.0, + "step": 12925 + }, + { + "epoch": 1.6443200610609336, + "ewc_loss": 0.0428294911980629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1414745788206346e-05, + "grad_norm": 28.439538955688477, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8772953748703003, + "num_tokens": 492997768.0, + "step": 12926 + }, + { + "epoch": 1.6444472713395242, + "ewc_loss": 0.04291355237364769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1456775357364677e-05, + "grad_norm": 28.500411987304688, + "learning_rate": 1e-06, + "loss": 0.3545, + "mean_token_accuracy": 0.8927841782569885, + "num_tokens": 493037017.0, + "step": 12927 + }, + { + "epoch": 1.6445744816181147, + "ewc_loss": 0.04280304163694382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1401521735242568e-05, + "grad_norm": 28.315458297729492, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8771945834159851, + "num_tokens": 493074579.0, + "step": 12928 + }, + { + "epoch": 1.6447016918967052, + "ewc_loss": 0.04281105101108551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1405525330919772e-05, + "grad_norm": 28.44881248474121, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8680722713470459, + "num_tokens": 493111249.0, + "step": 12929 + }, + { + "epoch": 1.6448289021752958, + "ewc_loss": 0.042986854910850525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1493427993846126e-05, + "grad_norm": 28.495716094970703, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8514646291732788, + "num_tokens": 493150723.0, + "step": 12930 + }, + { + "epoch": 1.6449561124538863, + "ewc_loss": 0.04285506159067154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.142753146472387e-05, + "grad_norm": 28.53990364074707, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8654993772506714, + "num_tokens": 493189530.0, + "step": 12931 + }, + { + "epoch": 1.6450833227324768, + "ewc_loss": 0.04287216067314148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1436080714920536e-05, + "grad_norm": 28.469860076904297, + "learning_rate": 1e-06, + "loss": 0.4044, + 
"mean_token_accuracy": 0.8747727274894714, + "num_tokens": 493231369.0, + "step": 12932 + }, + { + "epoch": 1.6452105330110673, + "ewc_loss": 0.04279749467968941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.139874777640216e-05, + "grad_norm": 28.492162704467773, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8538123369216919, + "num_tokens": 493278430.0, + "step": 12933 + }, + { + "epoch": 1.6453377432896579, + "ewc_loss": 0.042860567569732666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1430283595691435e-05, + "grad_norm": 28.50773048400879, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8726717233657837, + "num_tokens": 493312258.0, + "step": 12934 + }, + { + "epoch": 1.6454649535682484, + "ewc_loss": 0.042834650725126266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1417325115180574e-05, + "grad_norm": 28.58197784423828, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8671183586120605, + "num_tokens": 493347725.0, + "step": 12935 + }, + { + "epoch": 1.645592163846839, + "ewc_loss": 0.04286051541566849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1430258129839785e-05, + "grad_norm": 28.36476707458496, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8707388043403625, + "num_tokens": 493388119.0, + "step": 12936 + }, + { + "epoch": 1.6457193741254295, + "ewc_loss": 0.04280407354235649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.140203650924377e-05, + "grad_norm": 28.512590408325195, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8676170706748962, + "num_tokens": 493424607.0, + "step": 12937 + }, + { + "epoch": 1.64584658440402, + "ewc_loss": 0.04289238899946213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.144619429600425e-05, + "grad_norm": 28.485496520996094, + "learning_rate": 1e-06, + "loss": 0.4984, + "mean_token_accuracy": 0.8452622890472412, + "num_tokens": 493459546.0, + "step": 12938 + }, + { + "epoch": 
1.6459737946826105, + "ewc_loss": 0.04280376434326172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.140188189514447e-05, + "grad_norm": 28.50152015686035, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.871813178062439, + "num_tokens": 493494162.0, + "step": 12939 + }, + { + "epoch": 1.6461010049612008, + "ewc_loss": 0.04289174824953079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1445874153869227e-05, + "grad_norm": 28.602527618408203, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8644525408744812, + "num_tokens": 493537523.0, + "step": 12940 + }, + { + "epoch": 1.6462282152397913, + "ewc_loss": 0.042760323733091354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.138016134267673e-05, + "grad_norm": 28.429346084594727, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8661546111106873, + "num_tokens": 493580065.0, + "step": 12941 + }, + { + "epoch": 1.6463554255183819, + "ewc_loss": 0.04282589256763458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.141294680768624e-05, + "grad_norm": 28.507780075073242, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.873988687992096, + "num_tokens": 493626717.0, + "step": 12942 + }, + { + "epoch": 1.6464826357969724, + "ewc_loss": 0.04289288446307182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1446441678563133e-05, + "grad_norm": 28.558496475219727, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8731927871704102, + "num_tokens": 493666266.0, + "step": 12943 + }, + { + "epoch": 1.646609846075563, + "ewc_loss": 0.04277966916561127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1389834728324786e-05, + "grad_norm": 28.419673919677734, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8579807281494141, + "num_tokens": 493703184.0, + "step": 12944 + }, + { + "epoch": 1.6467370563541535, + "ewc_loss": 0.04276915267109871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1384576029959135e-05, + "grad_norm": 28.42816925048828, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8833611607551575, + "num_tokens": 493744061.0, + "step": 12945 + }, + { + "epoch": 1.6468642666327438, + "ewc_loss": 0.042869988828897476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143499477824662e-05, + "grad_norm": 28.517623901367188, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.867626428604126, + "num_tokens": 493783585.0, + "step": 12946 + }, + { + "epoch": 1.6469914769113343, + "ewc_loss": 0.0427805557847023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.139027856173925e-05, + "grad_norm": 28.43165397644043, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8737926483154297, + "num_tokens": 493827781.0, + "step": 12947 + }, + { + "epoch": 1.6471186871899248, + "ewc_loss": 0.04279584065079689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1397920136223547e-05, + "grad_norm": 28.543745040893555, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.871860921382904, + "num_tokens": 493864418.0, + "step": 12948 + }, + { + "epoch": 1.6472458974685154, + "ewc_loss": 0.04278375580906868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1391877453424968e-05, + "grad_norm": 28.34918212890625, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8677258491516113, + "num_tokens": 493895226.0, + "step": 12949 + }, + { + "epoch": 1.6473731077471059, + "ewc_loss": 0.0429081991314888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.145409962395206e-05, + "grad_norm": 28.658540725708008, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8592885732650757, + "num_tokens": 493935210.0, + "step": 12950 + }, + { + "epoch": 1.6475003180256964, + "ewc_loss": 0.042833469808101654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1416735762613826e-05, + "grad_norm": 28.419139862060547, + "learning_rate": 1e-06, + "loss": 0.423, + 
"mean_token_accuracy": 0.8663297891616821, + "num_tokens": 493966368.0, + "step": 12951 + }, + { + "epoch": 1.647627528304287, + "ewc_loss": 0.04282483458518982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1412417481769808e-05, + "grad_norm": 28.452436447143555, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8608797788619995, + "num_tokens": 494004304.0, + "step": 12952 + }, + { + "epoch": 1.6477547385828775, + "ewc_loss": 0.042851515114307404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1425757950055413e-05, + "grad_norm": 28.417909622192383, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8748055696487427, + "num_tokens": 494044758.0, + "step": 12953 + }, + { + "epoch": 1.647881948861468, + "ewc_loss": 0.04278077185153961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.139038588211406e-05, + "grad_norm": 28.42434310913086, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8520492315292358, + "num_tokens": 494080624.0, + "step": 12954 + }, + { + "epoch": 1.6480091591400585, + "ewc_loss": 0.04285481572151184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.142740777344443e-05, + "grad_norm": 28.502819061279297, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8735251426696777, + "num_tokens": 494116215.0, + "step": 12955 + }, + { + "epoch": 1.648136369418649, + "ewc_loss": 0.04284931719303131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.142465928045567e-05, + "grad_norm": 28.40214729309082, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8668546676635742, + "num_tokens": 494150317.0, + "step": 12956 + }, + { + "epoch": 1.6482635796972396, + "ewc_loss": 0.04288172721862793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1440862838062458e-05, + "grad_norm": 28.4390926361084, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8807435631752014, + "num_tokens": 494187273.0, + "step": 12957 + }, + { + "epoch": 
1.64839078997583, + "ewc_loss": 0.0429484099149704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1474204913829453e-05, + "grad_norm": 28.533489227294922, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8640305995941162, + "num_tokens": 494226921.0, + "step": 12958 + }, + { + "epoch": 1.6485180002544206, + "ewc_loss": 0.042915064841508865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1457532056956552e-05, + "grad_norm": 28.50871467590332, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8738484978675842, + "num_tokens": 494263141.0, + "step": 12959 + }, + { + "epoch": 1.6486452105330112, + "ewc_loss": 0.04281661659479141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1408308384707198e-05, + "grad_norm": 28.350126266479492, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8624001741409302, + "num_tokens": 494307925.0, + "step": 12960 + }, + { + "epoch": 1.6487724208116017, + "ewc_loss": 0.042897433042526245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.144871723430697e-05, + "grad_norm": 28.490928649902344, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8616412878036499, + "num_tokens": 494353428.0, + "step": 12961 + }, + { + "epoch": 1.6488996310901922, + "ewc_loss": 0.0428721122443676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1436055249068886e-05, + "grad_norm": 28.464149475097656, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.855732798576355, + "num_tokens": 494383559.0, + "step": 12962 + }, + { + "epoch": 1.6490268413687827, + "ewc_loss": 0.04289992153644562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1449961423058994e-05, + "grad_norm": 28.471410751342773, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8705408573150635, + "num_tokens": 494419564.0, + "step": 12963 + }, + { + "epoch": 1.649154051647373, + "ewc_loss": 0.04292641952633858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.146320912288502e-05, + "grad_norm": 28.53125, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8706241846084595, + "num_tokens": 494466886.0, + "step": 12964 + }, + { + "epoch": 1.6492812619259636, + "ewc_loss": 0.04289261996746063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1446310711326078e-05, + "grad_norm": 28.412721633911133, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8503985404968262, + "num_tokens": 494506672.0, + "step": 12965 + }, + { + "epoch": 1.649408472204554, + "ewc_loss": 0.042979080229997635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.148954081349075e-05, + "grad_norm": 28.60310935974121, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8681443333625793, + "num_tokens": 494541881.0, + "step": 12966 + }, + { + "epoch": 1.6495356824831446, + "ewc_loss": 0.042889129370450974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1444564481498674e-05, + "grad_norm": 28.333520889282227, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8719919919967651, + "num_tokens": 494583546.0, + "step": 12967 + }, + { + "epoch": 1.6496628927617352, + "ewc_loss": 0.04287797212600708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1438985640998e-05, + "grad_norm": 28.55495834350586, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8559447526931763, + "num_tokens": 494616625.0, + "step": 12968 + }, + { + "epoch": 1.6497901030403257, + "ewc_loss": 0.04296833276748657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.148416569980327e-05, + "grad_norm": 28.393735885620117, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8799055218696594, + "num_tokens": 494654590.0, + "step": 12969 + }, + { + "epoch": 1.6499173133189162, + "ewc_loss": 0.04290549084544182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1452746295835823e-05, + "grad_norm": 28.533971786499023, + "learning_rate": 1e-06, + "loss": 0.3796, + 
"mean_token_accuracy": 0.8823760747909546, + "num_tokens": 494692471.0, + "step": 12970 + }, + { + "epoch": 1.6500445235975065, + "ewc_loss": 0.04298140108585358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.149070132873021e-05, + "grad_norm": 28.543981552124023, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8596562743186951, + "num_tokens": 494730347.0, + "step": 12971 + }, + { + "epoch": 1.650171733876097, + "ewc_loss": 0.04281432181596756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.140716060239356e-05, + "grad_norm": 28.38163185119629, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8559097051620483, + "num_tokens": 494763753.0, + "step": 12972 + }, + { + "epoch": 1.6502989441546876, + "ewc_loss": 0.042954765260219574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1477382688317448e-05, + "grad_norm": 28.55104637145996, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8625584244728088, + "num_tokens": 494800416.0, + "step": 12973 + }, + { + "epoch": 1.6504261544332781, + "ewc_loss": 0.04287560656666756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1437803297885694e-05, + "grad_norm": 28.35366439819336, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8657183647155762, + "num_tokens": 494842733.0, + "step": 12974 + }, + { + "epoch": 1.6505533647118686, + "ewc_loss": 0.04287593066692352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143796518794261e-05, + "grad_norm": 28.39898681640625, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8604394197463989, + "num_tokens": 494878661.0, + "step": 12975 + }, + { + "epoch": 1.6506805749904592, + "ewc_loss": 0.04296565800905228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1482828742591664e-05, + "grad_norm": 28.468460083007812, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8602116107940674, + "num_tokens": 494918142.0, + "step": 12976 + }, + { + "epoch": 
1.6508077852690497, + "ewc_loss": 0.04295286536216736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1476433175848797e-05, + "grad_norm": 28.41452407836914, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8671510219573975, + "num_tokens": 494956947.0, + "step": 12977 + }, + { + "epoch": 1.6509349955476402, + "ewc_loss": 0.0429999977350235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1499998183571734e-05, + "grad_norm": 28.618663787841797, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.861972451210022, + "num_tokens": 494989850.0, + "step": 12978 + }, + { + "epoch": 1.6510622058262308, + "ewc_loss": 0.04298609495162964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1493047825060785e-05, + "grad_norm": 28.390687942504883, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8594754934310913, + "num_tokens": 495026770.0, + "step": 12979 + }, + { + "epoch": 1.6511894161048213, + "ewc_loss": 0.042897891253232956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1448946426971816e-05, + "grad_norm": 28.462026596069336, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8784396648406982, + "num_tokens": 495065009.0, + "step": 12980 + }, + { + "epoch": 1.6513166263834118, + "ewc_loss": 0.04299401119351387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.14970059460029e-05, + "grad_norm": 28.517654418945312, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.868779182434082, + "num_tokens": 495096914.0, + "step": 12981 + }, + { + "epoch": 1.6514438366620023, + "ewc_loss": 0.043003715574741364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1501857190742157e-05, + "grad_norm": 28.589862823486328, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8708910346031189, + "num_tokens": 495138348.0, + "step": 12982 + }, + { + "epoch": 1.6515710469405929, + "ewc_loss": 0.0429418571293354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1470928913913667e-05, + "grad_norm": 28.456092834472656, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8607938885688782, + "num_tokens": 495177026.0, + "step": 12983 + }, + { + "epoch": 1.6516982572191834, + "ewc_loss": 0.04291461035609245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.145730468328111e-05, + "grad_norm": 28.570615768432617, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8692023754119873, + "num_tokens": 495213850.0, + "step": 12984 + }, + { + "epoch": 1.651825467497774, + "ewc_loss": 0.042913563549518585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.145678263332229e-05, + "grad_norm": 28.334060668945312, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8765961527824402, + "num_tokens": 495257213.0, + "step": 12985 + }, + { + "epoch": 1.6519526777763645, + "ewc_loss": 0.04296942055225372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.148470957763493e-05, + "grad_norm": 28.491477966308594, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.857770562171936, + "num_tokens": 495300018.0, + "step": 12986 + }, + { + "epoch": 1.652079888054955, + "ewc_loss": 0.042965929955244064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.148296516679693e-05, + "grad_norm": 28.461708068847656, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.866213858127594, + "num_tokens": 495336471.0, + "step": 12987 + }, + { + "epoch": 1.6522070983335455, + "ewc_loss": 0.04294582083821297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.147290979337413e-05, + "grad_norm": 28.531015396118164, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8707104921340942, + "num_tokens": 495374680.0, + "step": 12988 + }, + { + "epoch": 1.6523343086121358, + "ewc_loss": 0.04292779415845871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.146389670087956e-05, + "grad_norm": 28.43895721435547, + "learning_rate": 1e-06, + "loss": 0.4153, + 
"mean_token_accuracy": 0.8704208731651306, + "num_tokens": 495414096.0, + "step": 12989 + }, + { + "epoch": 1.6524615188907263, + "ewc_loss": 0.04289083555340767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1445417587528937e-05, + "grad_norm": 28.388010025024414, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8553445935249329, + "num_tokens": 495452955.0, + "step": 12990 + }, + { + "epoch": 1.6525887291693169, + "ewc_loss": 0.04294528439640999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1472642401931807e-05, + "grad_norm": 28.53856658935547, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8647465109825134, + "num_tokens": 495490091.0, + "step": 12991 + }, + { + "epoch": 1.6527159394479074, + "ewc_loss": 0.042896948754787445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1448473489726894e-05, + "grad_norm": 28.495100021362305, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8753355741500854, + "num_tokens": 495524543.0, + "step": 12992 + }, + { + "epoch": 1.652843149726498, + "ewc_loss": 0.04286991059780121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143495476047974e-05, + "grad_norm": 28.413015365600586, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8578997850418091, + "num_tokens": 495562068.0, + "step": 12993 + }, + { + "epoch": 1.6529703600050885, + "ewc_loss": 0.042945582419633865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1472791559062898e-05, + "grad_norm": 28.544206619262695, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8750158548355103, + "num_tokens": 495598355.0, + "step": 12994 + }, + { + "epoch": 1.6530975702836788, + "ewc_loss": 0.04297002777457237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1485013348865323e-05, + "grad_norm": 28.52166748046875, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8536223769187927, + "num_tokens": 495634735.0, + "step": 12995 + }, + { + "epoch": 
1.6532247805622693, + "ewc_loss": 0.04286285862326622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143142955901567e-05, + "grad_norm": 28.41399383544922, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8669899702072144, + "num_tokens": 495668804.0, + "step": 12996 + }, + { + "epoch": 1.6533519908408598, + "ewc_loss": 0.04292340949177742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1461704818648286e-05, + "grad_norm": 28.461334228515625, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8719062805175781, + "num_tokens": 495707129.0, + "step": 12997 + }, + { + "epoch": 1.6534792011194503, + "ewc_loss": 0.04295871779322624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.14793581108097e-05, + "grad_norm": 28.512203216552734, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.8460359573364258, + "num_tokens": 495746501.0, + "step": 12998 + }, + { + "epoch": 1.6536064113980409, + "ewc_loss": 0.04295830801129341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.14791543839965e-05, + "grad_norm": 28.52975845336914, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8838824033737183, + "num_tokens": 495789723.0, + "step": 12999 + }, + { + "epoch": 1.6537336216766314, + "ewc_loss": 0.042897287756204605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1448644474730827e-05, + "grad_norm": 28.39975929260254, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8583294749259949, + "num_tokens": 495827073.0, + "step": 13000 + }, + { + "epoch": 1.653860831955222, + "ewc_loss": 0.042933229357004166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1466614271048456e-05, + "grad_norm": 28.519811630249023, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8565329909324646, + "num_tokens": 495862362.0, + "step": 13001 + }, + { + "epoch": 1.6539880422338125, + "ewc_loss": 0.04296194389462471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1480971554410644e-05, + "grad_norm": 28.39213752746582, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8703315854072571, + "num_tokens": 495905546.0, + "step": 13002 + }, + { + "epoch": 1.654115252512403, + "ewc_loss": 0.04302607476711273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.151303669961635e-05, + "grad_norm": 28.49109649658203, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8653320670127869, + "num_tokens": 495942291.0, + "step": 13003 + }, + { + "epoch": 1.6542424627909935, + "ewc_loss": 0.04299573972821236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1497869965969585e-05, + "grad_norm": 28.551050186157227, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8565702438354492, + "num_tokens": 495978631.0, + "step": 13004 + }, + { + "epoch": 1.654369673069584, + "ewc_loss": 0.042952004820108414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1476002075360157e-05, + "grad_norm": 28.459951400756836, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8614163398742676, + "num_tokens": 496016626.0, + "step": 13005 + }, + { + "epoch": 1.6544968833481746, + "ewc_loss": 0.04299934580922127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.14996725844685e-05, + "grad_norm": 28.451629638671875, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8587371110916138, + "num_tokens": 496051949.0, + "step": 13006 + }, + { + "epoch": 1.654624093626765, + "ewc_loss": 0.04301407188177109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1507035853574052e-05, + "grad_norm": 28.433740615844727, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8726692199707031, + "num_tokens": 496092693.0, + "step": 13007 + }, + { + "epoch": 1.6547513039053556, + "ewc_loss": 0.04309682548046112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154841240553651e-05, + "grad_norm": 28.542770385742188, + "learning_rate": 1e-06, + "loss": 0.4448, + 
"mean_token_accuracy": 0.8612425923347473, + "num_tokens": 496132853.0, + "step": 13008 + }, + { + "epoch": 1.6548785141839462, + "ewc_loss": 0.043069932609796524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15349664358655e-05, + "grad_norm": 28.569416046142578, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.852143406867981, + "num_tokens": 496168303.0, + "step": 13009 + }, + { + "epoch": 1.6550057244625367, + "ewc_loss": 0.04301831126213074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1509154976229183e-05, + "grad_norm": 28.422834396362305, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8542327880859375, + "num_tokens": 496206132.0, + "step": 13010 + }, + { + "epoch": 1.6551329347411272, + "ewc_loss": 0.04300454631447792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.150227373931557e-05, + "grad_norm": 28.45114517211914, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8606998920440674, + "num_tokens": 496247542.0, + "step": 13011 + }, + { + "epoch": 1.6552601450197177, + "ewc_loss": 0.043090857565402985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1545429262914695e-05, + "grad_norm": 28.422855377197266, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8760977387428284, + "num_tokens": 496285698.0, + "step": 13012 + }, + { + "epoch": 1.655387355298308, + "ewc_loss": 0.04299868270754814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1499341528397053e-05, + "grad_norm": 28.414791107177734, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8693369626998901, + "num_tokens": 496322216.0, + "step": 13013 + }, + { + "epoch": 1.6555145655768986, + "ewc_loss": 0.04316312074661255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1581559849437326e-05, + "grad_norm": 28.643571853637695, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8641254305839539, + "num_tokens": 496357687.0, + "step": 13014 + }, + { + "epoch": 
1.655641775855489, + "ewc_loss": 0.042948681861162186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.147434133803472e-05, + "grad_norm": 28.304880142211914, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8782640695571899, + "num_tokens": 496400875.0, + "step": 13015 + }, + { + "epoch": 1.6557689861340796, + "ewc_loss": 0.04302055016160011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1510275473701768e-05, + "grad_norm": 28.573244094848633, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8698300123214722, + "num_tokens": 496432076.0, + "step": 13016 + }, + { + "epoch": 1.6558961964126702, + "ewc_loss": 0.043044041842222214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1522020688280463e-05, + "grad_norm": 28.375810623168945, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.863824725151062, + "num_tokens": 496466852.0, + "step": 13017 + }, + { + "epoch": 1.6560234066912607, + "ewc_loss": 0.0430532768368721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1526639102376066e-05, + "grad_norm": 28.603961944580078, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8651725649833679, + "num_tokens": 496508890.0, + "step": 13018 + }, + { + "epoch": 1.6561506169698512, + "ewc_loss": 0.04307631775736809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1538158762268722e-05, + "grad_norm": 28.405471801757812, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8821744322776794, + "num_tokens": 496537964.0, + "step": 13019 + }, + { + "epoch": 1.6562778272484415, + "ewc_loss": 0.0429343543946743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1467176338774152e-05, + "grad_norm": 28.434206008911133, + "learning_rate": 1e-06, + "loss": 0.4947, + "mean_token_accuracy": 0.8503240346908569, + "num_tokens": 496571263.0, + "step": 13020 + }, + { + "epoch": 1.656405037527032, + "ewc_loss": 0.04316357150673866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1581785404123366e-05, + "grad_norm": 28.585609436035156, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8748893737792969, + "num_tokens": 496610058.0, + "step": 13021 + }, + { + "epoch": 1.6565322478056226, + "ewc_loss": 0.0430479422211647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1523970644921064e-05, + "grad_norm": 28.55974578857422, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8602890372276306, + "num_tokens": 496640542.0, + "step": 13022 + }, + { + "epoch": 1.656659458084213, + "ewc_loss": 0.04300077632069588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1500387447304092e-05, + "grad_norm": 28.421571731567383, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8660821914672852, + "num_tokens": 496678030.0, + "step": 13023 + }, + { + "epoch": 1.6567866683628036, + "ewc_loss": 0.04299558326601982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1497791749425232e-05, + "grad_norm": 28.4752254486084, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8634715676307678, + "num_tokens": 496714057.0, + "step": 13024 + }, + { + "epoch": 1.6569138786413942, + "ewc_loss": 0.04307374358177185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153687091777101e-05, + "grad_norm": 28.4777774810791, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.881639838218689, + "num_tokens": 496744501.0, + "step": 13025 + }, + { + "epoch": 1.6570410889199847, + "ewc_loss": 0.04310277849435806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1551390091190115e-05, + "grad_norm": 28.570486068725586, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8711457252502441, + "num_tokens": 496780233.0, + "step": 13026 + }, + { + "epoch": 1.6571682991985752, + "ewc_loss": 0.043089110404253006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154455432901159e-05, + "grad_norm": 28.492237091064453, + "learning_rate": 1e-06, + "loss": 0.4235, + 
"mean_token_accuracy": 0.8676365613937378, + "num_tokens": 496809645.0, + "step": 13027 + }, + { + "epoch": 1.6572955094771658, + "ewc_loss": 0.04304337874054909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1521689632209018e-05, + "grad_norm": 28.473987579345703, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8668988347053528, + "num_tokens": 496854871.0, + "step": 13028 + }, + { + "epoch": 1.6574227197557563, + "ewc_loss": 0.04309709370136261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1548547010752372e-05, + "grad_norm": 28.5207462310791, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8833533525466919, + "num_tokens": 496891958.0, + "step": 13029 + }, + { + "epoch": 1.6575499300343468, + "ewc_loss": 0.0430811308324337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1540565285249613e-05, + "grad_norm": 28.512475967407227, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8593984842300415, + "num_tokens": 496928550.0, + "step": 13030 + }, + { + "epoch": 1.6576771403129373, + "ewc_loss": 0.0430290587246418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.151453008991666e-05, + "grad_norm": 28.621679306030273, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8655068278312683, + "num_tokens": 496972009.0, + "step": 13031 + }, + { + "epoch": 1.6578043505915279, + "ewc_loss": 0.043052226305007935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.152611341443844e-05, + "grad_norm": 28.5218448638916, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8651916980743408, + "num_tokens": 497013605.0, + "step": 13032 + }, + { + "epoch": 1.6579315608701184, + "ewc_loss": 0.04298683628439903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1493418898899108e-05, + "grad_norm": 28.700244903564453, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.867798924446106, + "num_tokens": 497048712.0, + "step": 13033 + }, + { + "epoch": 
1.658058771148709, + "ewc_loss": 0.04295504093170166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1477520931512117e-05, + "grad_norm": 28.329790115356445, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8806017637252808, + "num_tokens": 497089161.0, + "step": 13034 + }, + { + "epoch": 1.6581859814272994, + "ewc_loss": 0.042933352291584015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1466676116688177e-05, + "grad_norm": 28.625, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8391289710998535, + "num_tokens": 497127253.0, + "step": 13035 + }, + { + "epoch": 1.65831319170589, + "ewc_loss": 0.043087951838970184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1543975890381262e-05, + "grad_norm": 28.43452262878418, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8634668588638306, + "num_tokens": 497167637.0, + "step": 13036 + }, + { + "epoch": 1.6584404019844805, + "ewc_loss": 0.04299577698111534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.149788815586362e-05, + "grad_norm": 28.536434173583984, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8664695024490356, + "num_tokens": 497205663.0, + "step": 13037 + }, + { + "epoch": 1.6585676122630708, + "ewc_loss": 0.04313749074935913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1568745069089346e-05, + "grad_norm": 28.561473846435547, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.882817804813385, + "num_tokens": 497242146.0, + "step": 13038 + }, + { + "epoch": 1.6586948225416613, + "ewc_loss": 0.04300286993384361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1501435185200535e-05, + "grad_norm": 28.450075149536133, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8724524974822998, + "num_tokens": 497279339.0, + "step": 13039 + }, + { + "epoch": 1.6588220328202519, + "ewc_loss": 0.043034229427576065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15171148738591e-05, + 
"grad_norm": 28.654232025146484, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8584626913070679, + "num_tokens": 497318530.0, + "step": 13040 + }, + { + "epoch": 1.6589492430988424, + "ewc_loss": 0.0430130772292614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1506539269466884e-05, + "grad_norm": 28.52865219116211, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8609635233879089, + "num_tokens": 497352943.0, + "step": 13041 + }, + { + "epoch": 1.659076453377433, + "ewc_loss": 0.04283649101853371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.141824552381877e-05, + "grad_norm": 28.6192569732666, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8608145713806152, + "num_tokens": 497387729.0, + "step": 13042 + }, + { + "epoch": 1.6592036636560235, + "ewc_loss": 0.04304509982466698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1522550014196895e-05, + "grad_norm": 28.5701961517334, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.868370771408081, + "num_tokens": 497427875.0, + "step": 13043 + }, + { + "epoch": 1.6593308739346138, + "ewc_loss": 0.04288731887936592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1443658624775708e-05, + "grad_norm": 28.532981872558594, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8589113354682922, + "num_tokens": 497469076.0, + "step": 13044 + }, + { + "epoch": 1.6594580842132043, + "ewc_loss": 0.04297694191336632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1488471247721463e-05, + "grad_norm": 28.522865295410156, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.868710994720459, + "num_tokens": 497509556.0, + "step": 13045 + }, + { + "epoch": 1.6595852944917948, + "ewc_loss": 0.04287972301244736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1439862393890508e-05, + "grad_norm": 28.50816535949707, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8591747283935547, 
+ "num_tokens": 497548151.0, + "step": 13046 + }, + { + "epoch": 1.6597125047703853, + "ewc_loss": 0.042877811938524246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1438905605464242e-05, + "grad_norm": 28.386131286621094, + "learning_rate": 1e-06, + "loss": 0.5168, + "mean_token_accuracy": 0.8406499624252319, + "num_tokens": 497589088.0, + "step": 13047 + }, + { + "epoch": 1.6598397150489759, + "ewc_loss": 0.04302240535616875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1511203158297576e-05, + "grad_norm": 28.48773193359375, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8760424852371216, + "num_tokens": 497632095.0, + "step": 13048 + }, + { + "epoch": 1.6599669253275664, + "ewc_loss": 0.04292837902903557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.146418955817353e-05, + "grad_norm": 28.5208740234375, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8787585496902466, + "num_tokens": 497665148.0, + "step": 13049 + }, + { + "epoch": 1.660094135606157, + "ewc_loss": 0.04300021007657051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1500105503946543e-05, + "grad_norm": 28.49700355529785, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8613714575767517, + "num_tokens": 497700134.0, + "step": 13050 + }, + { + "epoch": 1.6602213458847475, + "ewc_loss": 0.04293569177389145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1467845726874657e-05, + "grad_norm": 28.440357208251953, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8716531991958618, + "num_tokens": 497743101.0, + "step": 13051 + }, + { + "epoch": 1.660348556163338, + "ewc_loss": 0.0429573580622673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1478679627762176e-05, + "grad_norm": 28.49846839904785, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8605263233184814, + "num_tokens": 497775112.0, + "step": 13052 + }, + { + "epoch": 1.6604757664419285, + "ewc_loss": 
0.043043266981840134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1521633243537508e-05, + "grad_norm": 28.637203216552734, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8777284622192383, + "num_tokens": 497813320.0, + "step": 13053 + }, + { + "epoch": 1.660602976720519, + "ewc_loss": 0.042984578758478165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1492289306479506e-05, + "grad_norm": 28.39940643310547, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8619594573974609, + "num_tokens": 497852780.0, + "step": 13054 + }, + { + "epoch": 1.6607301869991096, + "ewc_loss": 0.04292107746005058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1460538846440613e-05, + "grad_norm": 28.520864486694336, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8690666556358337, + "num_tokens": 497886313.0, + "step": 13055 + }, + { + "epoch": 1.6608573972777, + "ewc_loss": 0.042971089482307434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1485544493771158e-05, + "grad_norm": 28.44375228881836, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.865404486656189, + "num_tokens": 497924108.0, + "step": 13056 + }, + { + "epoch": 1.6609846075562906, + "ewc_loss": 0.04298253729939461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1491268853424117e-05, + "grad_norm": 28.551555633544922, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8555721640586853, + "num_tokens": 497964656.0, + "step": 13057 + }, + { + "epoch": 1.6611118178348812, + "ewc_loss": 0.0430278554558754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1513928004424088e-05, + "grad_norm": 28.509502410888672, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8532527089118958, + "num_tokens": 498005436.0, + "step": 13058 + }, + { + "epoch": 1.6612390281134717, + "ewc_loss": 0.042955685406923294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1477842892636545e-05, + "grad_norm": 
28.515100479125977, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8630995750427246, + "num_tokens": 498042237.0, + "step": 13059 + }, + { + "epoch": 1.6613662383920622, + "ewc_loss": 0.0429886132478714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.149430656572804e-05, + "grad_norm": 28.49155616760254, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8655222654342651, + "num_tokens": 498083008.0, + "step": 13060 + }, + { + "epoch": 1.6614934486706527, + "ewc_loss": 0.04301586374640465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.150793261535e-05, + "grad_norm": 28.473072052001953, + "learning_rate": 1e-06, + "loss": 0.5065, + "mean_token_accuracy": 0.8438156247138977, + "num_tokens": 498120550.0, + "step": 13061 + }, + { + "epoch": 1.661620658949243, + "ewc_loss": 0.043044958263635635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1522479073610157e-05, + "grad_norm": 28.46477508544922, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8631792068481445, + "num_tokens": 498153033.0, + "step": 13062 + }, + { + "epoch": 1.6617478692278336, + "ewc_loss": 0.043050043284893036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1525022020796314e-05, + "grad_norm": 28.5145263671875, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8689572811126709, + "num_tokens": 498190491.0, + "step": 13063 + }, + { + "epoch": 1.661875079506424, + "ewc_loss": 0.043048858642578125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1524429030250758e-05, + "grad_norm": 28.475034713745117, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8595150113105774, + "num_tokens": 498235889.0, + "step": 13064 + }, + { + "epoch": 1.6620022897850146, + "ewc_loss": 0.04302606359124184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.151303124264814e-05, + "grad_norm": 28.51153564453125, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8639005422592163, + 
"num_tokens": 498277191.0, + "step": 13065 + }, + { + "epoch": 1.6621295000636052, + "ewc_loss": 0.043146438896656036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.157321978302207e-05, + "grad_norm": 28.536659240722656, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8553037047386169, + "num_tokens": 498318443.0, + "step": 13066 + }, + { + "epoch": 1.6622567103421957, + "ewc_loss": 0.043041881173849106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1520940208574757e-05, + "grad_norm": 28.50747299194336, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8545349836349487, + "num_tokens": 498355955.0, + "step": 13067 + }, + { + "epoch": 1.662383920620786, + "ewc_loss": 0.04305889457464218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.152944762201514e-05, + "grad_norm": 28.48175621032715, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.867658793926239, + "num_tokens": 498388588.0, + "step": 13068 + }, + { + "epoch": 1.6625111308993765, + "ewc_loss": 0.04301759973168373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1508800273295492e-05, + "grad_norm": 28.464487075805664, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8645251989364624, + "num_tokens": 498430661.0, + "step": 13069 + }, + { + "epoch": 1.662638341177967, + "ewc_loss": 0.0430588461458683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.152942215616349e-05, + "grad_norm": 28.509769439697266, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8542890548706055, + "num_tokens": 498470428.0, + "step": 13070 + }, + { + "epoch": 1.6627655514565576, + "ewc_loss": 0.043034039437770844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1517020286410116e-05, + "grad_norm": 28.55034828186035, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8662917613983154, + "num_tokens": 498512031.0, + "step": 13071 + }, + { + "epoch": 1.662892761735148, + "ewc_loss": 
0.043091244995594025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154562207579147e-05, + "grad_norm": 28.608348846435547, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8888810276985168, + "num_tokens": 498548344.0, + "step": 13072 + }, + { + "epoch": 1.6630199720137386, + "ewc_loss": 0.043076325207948685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153816240024753e-05, + "grad_norm": 28.640586853027344, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8797155618667603, + "num_tokens": 498590115.0, + "step": 13073 + }, + { + "epoch": 1.6631471822923292, + "ewc_loss": 0.043007224798202515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.150361251551658e-05, + "grad_norm": 28.533132553100586, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8661172389984131, + "num_tokens": 498634034.0, + "step": 13074 + }, + { + "epoch": 1.6632743925709197, + "ewc_loss": 0.042917750775814056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.145887447113637e-05, + "grad_norm": 28.571334838867188, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8591398596763611, + "num_tokens": 498675086.0, + "step": 13075 + }, + { + "epoch": 1.6634016028495102, + "ewc_loss": 0.04292265698313713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1461328287841752e-05, + "grad_norm": 28.47814178466797, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8726944923400879, + "num_tokens": 498711374.0, + "step": 13076 + }, + { + "epoch": 1.6635288131281007, + "ewc_loss": 0.042943913489580154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.147195664292667e-05, + "grad_norm": 28.528823852539062, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8711725473403931, + "num_tokens": 498753050.0, + "step": 13077 + }, + { + "epoch": 1.6636560234066913, + "ewc_loss": 0.04294567555189133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1472837033797987e-05, + "grad_norm": 
28.511751174926758, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8678567409515381, + "num_tokens": 498790406.0, + "step": 13078 + }, + { + "epoch": 1.6637832336852818, + "ewc_loss": 0.04300905764102936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1504529286175966e-05, + "grad_norm": 28.526391983032227, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8655335307121277, + "num_tokens": 498827712.0, + "step": 13079 + }, + { + "epoch": 1.6639104439638723, + "ewc_loss": 0.042939212173223495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.146960650861729e-05, + "grad_norm": 28.516258239746094, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8708422183990479, + "num_tokens": 498862283.0, + "step": 13080 + }, + { + "epoch": 1.6640376542424629, + "ewc_loss": 0.043058130890131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1529065634240396e-05, + "grad_norm": 28.688499450683594, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8675307035446167, + "num_tokens": 498902686.0, + "step": 13081 + }, + { + "epoch": 1.6641648645210534, + "ewc_loss": 0.04287334904074669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143667370546609e-05, + "grad_norm": 28.548189163208008, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8557084798812866, + "num_tokens": 498942785.0, + "step": 13082 + }, + { + "epoch": 1.664292074799644, + "ewc_loss": 0.04296896606683731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1484482203959487e-05, + "grad_norm": 28.638835906982422, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8718353509902954, + "num_tokens": 498982761.0, + "step": 13083 + }, + { + "epoch": 1.6644192850782344, + "ewc_loss": 0.04285752773284912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1428764739539474e-05, + "grad_norm": 28.495113372802734, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8487826585769653, + 
"num_tokens": 499025762.0, + "step": 13084 + }, + { + "epoch": 1.664546495356825, + "ewc_loss": 0.04290525242686272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.145262624253519e-05, + "grad_norm": 28.66584587097168, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8578691482543945, + "num_tokens": 499063394.0, + "step": 13085 + }, + { + "epoch": 1.6646737056354155, + "ewc_loss": 0.042874086648225784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.143704296031501e-05, + "grad_norm": 28.529335021972656, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8773680925369263, + "num_tokens": 499105824.0, + "step": 13086 + }, + { + "epoch": 1.6648009159140058, + "ewc_loss": 0.04293428361415863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1467141777975485e-05, + "grad_norm": 28.653772354125977, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8742856979370117, + "num_tokens": 499140935.0, + "step": 13087 + }, + { + "epoch": 1.6649281261925963, + "ewc_loss": 0.042911309748888016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1455654859892093e-05, + "grad_norm": 28.515695571899414, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.866110622882843, + "num_tokens": 499179492.0, + "step": 13088 + }, + { + "epoch": 1.6650553364711869, + "ewc_loss": 0.04289735108613968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1448675397550687e-05, + "grad_norm": 28.653104782104492, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8774327039718628, + "num_tokens": 499214716.0, + "step": 13089 + }, + { + "epoch": 1.6651825467497774, + "ewc_loss": 0.042918749153614044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1459374693222344e-05, + "grad_norm": 28.544307708740234, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8734372854232788, + "num_tokens": 499250337.0, + "step": 13090 + }, + { + "epoch": 1.665309757028368, + "ewc_loss": 
0.042891159653663635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1445579477585852e-05, + "grad_norm": 28.538761138916016, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.871558666229248, + "num_tokens": 499283675.0, + "step": 13091 + }, + { + "epoch": 1.6654369673069584, + "ewc_loss": 0.04291906952857971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1459534764289856e-05, + "grad_norm": 28.602033615112305, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8715575933456421, + "num_tokens": 499319153.0, + "step": 13092 + }, + { + "epoch": 1.6655641775855488, + "ewc_loss": 0.042871471494436264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1435735106933862e-05, + "grad_norm": 28.543357849121094, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8638916015625, + "num_tokens": 499356675.0, + "step": 13093 + }, + { + "epoch": 1.6656913878641393, + "ewc_loss": 0.042951565235853195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1475781977642328e-05, + "grad_norm": 28.651369094848633, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8813327550888062, + "num_tokens": 499392129.0, + "step": 13094 + }, + { + "epoch": 1.6658185981427298, + "ewc_loss": 0.04292362183332443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1461810320033692e-05, + "grad_norm": 28.50244903564453, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8786402940750122, + "num_tokens": 499424958.0, + "step": 13095 + }, + { + "epoch": 1.6659458084213203, + "ewc_loss": 0.04297091066837311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1485455363290384e-05, + "grad_norm": 28.61804962158203, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8759772777557373, + "num_tokens": 499467769.0, + "step": 13096 + }, + { + "epoch": 1.6660730186999109, + "ewc_loss": 0.042925067245960236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1462534277816303e-05, + "grad_norm": 
28.540719985961914, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8574100136756897, + "num_tokens": 499513828.0, + "step": 13097 + }, + { + "epoch": 1.6662002289785014, + "ewc_loss": 0.042966704815626144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1483352611539885e-05, + "grad_norm": 28.51009178161621, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.87229323387146, + "num_tokens": 499548219.0, + "step": 13098 + }, + { + "epoch": 1.666327439257092, + "ewc_loss": 0.04299675673246384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1498377464013174e-05, + "grad_norm": 28.52245330810547, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8499237895011902, + "num_tokens": 499587158.0, + "step": 13099 + }, + { + "epoch": 1.6664546495356825, + "ewc_loss": 0.04300672560930252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1503363313968293e-05, + "grad_norm": 28.557743072509766, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8696072101593018, + "num_tokens": 499625235.0, + "step": 13100 + }, + { + "epoch": 1.666581859814273, + "ewc_loss": 0.04294225201010704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.147112536476925e-05, + "grad_norm": 28.450963973999023, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8470203280448914, + "num_tokens": 499662842.0, + "step": 13101 + }, + { + "epoch": 1.6667090700928635, + "ewc_loss": 0.04302661120891571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1513305910048075e-05, + "grad_norm": 28.67354965209961, + "learning_rate": 1e-06, + "loss": 0.5022, + "mean_token_accuracy": 0.8438968658447266, + "num_tokens": 499703464.0, + "step": 13102 + }, + { + "epoch": 1.666836280371454, + "ewc_loss": 0.042998652905225754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1499326976481825e-05, + "grad_norm": 28.52653694152832, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8660730719566345, + 
"num_tokens": 499739160.0, + "step": 13103 + }, + { + "epoch": 1.6669634906500446, + "ewc_loss": 0.042942747473716736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1471374566317536e-05, + "grad_norm": 28.620704650878906, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8534975051879883, + "num_tokens": 499775024.0, + "step": 13104 + }, + { + "epoch": 1.667090700928635, + "ewc_loss": 0.04297667369246483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.14883366425056e-05, + "grad_norm": 28.60076904296875, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.869648277759552, + "num_tokens": 499816479.0, + "step": 13105 + }, + { + "epoch": 1.6672179112072256, + "ewc_loss": 0.042940590530633926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1470295905601233e-05, + "grad_norm": 28.40433120727539, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.868019700050354, + "num_tokens": 499848790.0, + "step": 13106 + }, + { + "epoch": 1.6673451214858162, + "ewc_loss": 0.04298194497823715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.149097235815134e-05, + "grad_norm": 28.378602981567383, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.886040210723877, + "num_tokens": 499890955.0, + "step": 13107 + }, + { + "epoch": 1.6674723317644067, + "ewc_loss": 0.04312580078840256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1562900656135753e-05, + "grad_norm": 28.55686378479004, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8757649660110474, + "num_tokens": 499924813.0, + "step": 13108 + }, + { + "epoch": 1.6675995420429972, + "ewc_loss": 0.04301284998655319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.150642467313446e-05, + "grad_norm": 28.36720085144043, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8638851642608643, + "num_tokens": 499960032.0, + "step": 13109 + }, + { + "epoch": 1.6677267523215877, + "ewc_loss": 0.043060846626758575, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153042260033544e-05, + "grad_norm": 28.577865600585938, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8804677724838257, + "num_tokens": 499995868.0, + "step": 13110 + }, + { + "epoch": 1.667853962600178, + "ewc_loss": 0.04325370118021965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1626850866596214e-05, + "grad_norm": 28.39632225036621, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8713675737380981, + "num_tokens": 500031662.0, + "step": 13111 + }, + { + "epoch": 1.6679811728787686, + "ewc_loss": 0.043059371411800385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1529685909627005e-05, + "grad_norm": 28.610044479370117, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.858124852180481, + "num_tokens": 500070189.0, + "step": 13112 + }, + { + "epoch": 1.668108383157359, + "ewc_loss": 0.04325571656227112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.162785858672578e-05, + "grad_norm": 28.393592834472656, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8637347221374512, + "num_tokens": 500109999.0, + "step": 13113 + }, + { + "epoch": 1.6682355934359496, + "ewc_loss": 0.04303668066859245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.151834087271709e-05, + "grad_norm": 28.57855987548828, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.8507573008537292, + "num_tokens": 500151473.0, + "step": 13114 + }, + { + "epoch": 1.6683628037145402, + "ewc_loss": 0.04324598237872124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.162299097108189e-05, + "grad_norm": 28.557106018066406, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8653567433357239, + "num_tokens": 500187436.0, + "step": 13115 + }, + { + "epoch": 1.6684900139931307, + "ewc_loss": 0.04305240511894226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1526202544919215e-05, + "grad_norm": 28.45197868347168, + 
"learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8667750954627991, + "num_tokens": 500222433.0, + "step": 13116 + }, + { + "epoch": 1.668617224271721, + "ewc_loss": 0.04318895563483238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159447831218131e-05, + "grad_norm": 28.590208053588867, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8673665523529053, + "num_tokens": 500257390.0, + "step": 13117 + }, + { + "epoch": 1.6687444345503115, + "ewc_loss": 0.04316694661974907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1583473426289856e-05, + "grad_norm": 28.57683563232422, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8666598796844482, + "num_tokens": 500301260.0, + "step": 13118 + }, + { + "epoch": 1.668871644828902, + "ewc_loss": 0.04307844862341881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15392246900592e-05, + "grad_norm": 28.4903564453125, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8722269535064697, + "num_tokens": 500343453.0, + "step": 13119 + }, + { + "epoch": 1.6689988551074926, + "ewc_loss": 0.043117910623550415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1558955268119462e-05, + "grad_norm": 28.530717849731445, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8725796341896057, + "num_tokens": 500375133.0, + "step": 13120 + }, + { + "epoch": 1.669126065386083, + "ewc_loss": 0.04314175248146057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15708769246703e-05, + "grad_norm": 28.49884796142578, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8623529076576233, + "num_tokens": 500408540.0, + "step": 13121 + }, + { + "epoch": 1.6692532756646736, + "ewc_loss": 0.04310404881834984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1552024918491952e-05, + "grad_norm": 28.477237701416016, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.88014155626297, + "num_tokens": 500446243.0, + 
"step": 13122 + }, + { + "epoch": 1.6693804859432642, + "ewc_loss": 0.04318326711654663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1591633412754163e-05, + "grad_norm": 28.554481506347656, + "learning_rate": 1e-06, + "loss": 0.5174, + "mean_token_accuracy": 0.8424346446990967, + "num_tokens": 500484313.0, + "step": 13123 + }, + { + "epoch": 1.6695076962218547, + "ewc_loss": 0.0432063527405262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1603176719509065e-05, + "grad_norm": 28.570817947387695, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8783097267150879, + "num_tokens": 500525566.0, + "step": 13124 + }, + { + "epoch": 1.6696349065004452, + "ewc_loss": 0.043162375688552856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15811869566096e-05, + "grad_norm": 28.62163543701172, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.872596800327301, + "num_tokens": 500566274.0, + "step": 13125 + }, + { + "epoch": 1.6697621167790357, + "ewc_loss": 0.043145108968019485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.157255403290037e-05, + "grad_norm": 28.561399459838867, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8599148988723755, + "num_tokens": 500602769.0, + "step": 13126 + }, + { + "epoch": 1.6698893270576263, + "ewc_loss": 0.04311469942331314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155734910047613e-05, + "grad_norm": 28.472497940063477, + "learning_rate": 1e-06, + "loss": 0.4896, + "mean_token_accuracy": 0.8501768112182617, + "num_tokens": 500642895.0, + "step": 13127 + }, + { + "epoch": 1.6700165373362168, + "ewc_loss": 0.04313213750720024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.156606933567673e-05, + "grad_norm": 28.63734245300293, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8653370141983032, + "num_tokens": 500681461.0, + "step": 13128 + }, + { + "epoch": 1.6701437476148073, + "ewc_loss": 0.04313841462135315, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.1569207092397846e-05, + "grad_norm": 28.50848388671875, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8504621982574463, + "num_tokens": 500723395.0, + "step": 13129 + }, + { + "epoch": 1.6702709578933979, + "ewc_loss": 0.04305117577314377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1525587726500817e-05, + "grad_norm": 28.643367767333984, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8672245740890503, + "num_tokens": 500760571.0, + "step": 13130 + }, + { + "epoch": 1.6703981681719884, + "ewc_loss": 0.04316339269280434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1581696273642592e-05, + "grad_norm": 28.570663452148438, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8572668433189392, + "num_tokens": 500797858.0, + "step": 13131 + }, + { + "epoch": 1.670525378450579, + "ewc_loss": 0.0430162213742733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1508110876311548e-05, + "grad_norm": 28.589784622192383, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8720327615737915, + "num_tokens": 500840199.0, + "step": 13132 + }, + { + "epoch": 1.6706525887291694, + "ewc_loss": 0.04304654896259308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1523273971979506e-05, + "grad_norm": 28.520343780517578, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8604629635810852, + "num_tokens": 500883290.0, + "step": 13133 + }, + { + "epoch": 1.67077979900776, + "ewc_loss": 0.04305361211299896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1526806449401192e-05, + "grad_norm": 28.559263229370117, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8714303970336914, + "num_tokens": 500923072.0, + "step": 13134 + }, + { + "epoch": 1.6709070092863505, + "ewc_loss": 0.043077461421489716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1538729924941435e-05, + "grad_norm": 28.498865127563477, + "learning_rate": 1e-06, + 
"loss": 0.4161, + "mean_token_accuracy": 0.8713429570198059, + "num_tokens": 500958726.0, + "step": 13135 + }, + { + "epoch": 1.6710342195649408, + "ewc_loss": 0.04311726614832878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1558633306995034e-05, + "grad_norm": 28.55941390991211, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8627915382385254, + "num_tokens": 500996746.0, + "step": 13136 + }, + { + "epoch": 1.6711614298435313, + "ewc_loss": 0.04310641065239906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1553205442614853e-05, + "grad_norm": 28.550949096679688, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8593126535415649, + "num_tokens": 501034135.0, + "step": 13137 + }, + { + "epoch": 1.6712886401221219, + "ewc_loss": 0.04304426163434982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.152213164663408e-05, + "grad_norm": 28.478282928466797, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8689584732055664, + "num_tokens": 501069810.0, + "step": 13138 + }, + { + "epoch": 1.6714158504007124, + "ewc_loss": 0.043051645159721375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1525822376133874e-05, + "grad_norm": 28.627721786499023, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8617092967033386, + "num_tokens": 501110597.0, + "step": 13139 + }, + { + "epoch": 1.671543060679303, + "ewc_loss": 0.04312092810869217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1560463210335e-05, + "grad_norm": 28.542394638061523, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8567193746566772, + "num_tokens": 501145927.0, + "step": 13140 + }, + { + "epoch": 1.6716702709578934, + "ewc_loss": 0.04301220551133156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1506102712010033e-05, + "grad_norm": 28.554628372192383, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8477044105529785, + "num_tokens": 501188473.0, + "step": 13141 + }, + { 
+ "epoch": 1.6717974812364838, + "ewc_loss": 0.04312834143638611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1564170310739428e-05, + "grad_norm": 28.584854125976562, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8788959383964539, + "num_tokens": 501226143.0, + "step": 13142 + }, + { + "epoch": 1.6719246915150743, + "ewc_loss": 0.04305592179298401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1527961507672444e-05, + "grad_norm": 28.569957733154297, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.857575535774231, + "num_tokens": 501266980.0, + "step": 13143 + }, + { + "epoch": 1.6720519017936648, + "ewc_loss": 0.043075770139694214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153788591385819e-05, + "grad_norm": 28.548513412475586, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8627141118049622, + "num_tokens": 501305414.0, + "step": 13144 + }, + { + "epoch": 1.6721791120722553, + "ewc_loss": 0.04307131469249725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1535657651838847e-05, + "grad_norm": 28.518253326416016, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8585979342460632, + "num_tokens": 501343009.0, + "step": 13145 + }, + { + "epoch": 1.6723063223508459, + "ewc_loss": 0.04306582361459732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1532910977839492e-05, + "grad_norm": 28.501834869384766, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8730717897415161, + "num_tokens": 501383571.0, + "step": 13146 + }, + { + "epoch": 1.6724335326294364, + "ewc_loss": 0.04304180666804314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1520903828786686e-05, + "grad_norm": 28.49787139892578, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8676227331161499, + "num_tokens": 501425444.0, + "step": 13147 + }, + { + "epoch": 1.672560742908027, + "ewc_loss": 0.04309454560279846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.154727371816989e-05, + "grad_norm": 28.53787612915039, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8794553279876709, + "num_tokens": 501466536.0, + "step": 13148 + }, + { + "epoch": 1.6726879531866174, + "ewc_loss": 0.04311202093958855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155601032427512e-05, + "grad_norm": 28.517213821411133, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.869205117225647, + "num_tokens": 501507346.0, + "step": 13149 + }, + { + "epoch": 1.672815163465208, + "ewc_loss": 0.04311954230070114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1559771994361654e-05, + "grad_norm": 28.59002113342285, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8732694387435913, + "num_tokens": 501545515.0, + "step": 13150 + }, + { + "epoch": 1.6729423737437985, + "ewc_loss": 0.043074507266283035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1537252905545756e-05, + "grad_norm": 28.537254333496094, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8605674505233765, + "num_tokens": 501588793.0, + "step": 13151 + }, + { + "epoch": 1.673069584022389, + "ewc_loss": 0.043071404099464417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153570130758453e-05, + "grad_norm": 28.521503448486328, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.86304771900177, + "num_tokens": 501624384.0, + "step": 13152 + }, + { + "epoch": 1.6731967943009796, + "ewc_loss": 0.04311579465866089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1557896616286598e-05, + "grad_norm": 28.544416427612305, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8665046095848083, + "num_tokens": 501663150.0, + "step": 13153 + }, + { + "epoch": 1.67332400457957, + "ewc_loss": 0.043098513036966324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1549256416619755e-05, + "grad_norm": 28.557334899902344, + "learning_rate": 1e-06, + "loss": 0.4626, + 
"mean_token_accuracy": 0.8597435355186462, + "num_tokens": 501706177.0, + "step": 13154 + }, + { + "epoch": 1.6734512148581606, + "ewc_loss": 0.043128300458192825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.156415030185599e-05, + "grad_norm": 28.613080978393555, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.878045916557312, + "num_tokens": 501745267.0, + "step": 13155 + }, + { + "epoch": 1.6735784251367511, + "ewc_loss": 0.04306085780262947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1530428057303652e-05, + "grad_norm": 28.50834846496582, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8614517450332642, + "num_tokens": 501783591.0, + "step": 13156 + }, + { + "epoch": 1.6737056354153417, + "ewc_loss": 0.04303234815597534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1516174456337467e-05, + "grad_norm": 28.4398136138916, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8570604920387268, + "num_tokens": 501825190.0, + "step": 13157 + }, + { + "epoch": 1.6738328456939322, + "ewc_loss": 0.04307648912072182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153824425477069e-05, + "grad_norm": 28.596843719482422, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8666898608207703, + "num_tokens": 501855813.0, + "step": 13158 + }, + { + "epoch": 1.6739600559725227, + "ewc_loss": 0.0431072898209095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155364563805051e-05, + "grad_norm": 28.572494506835938, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8632282018661499, + "num_tokens": 501890106.0, + "step": 13159 + }, + { + "epoch": 1.674087266251113, + "ewc_loss": 0.04305258020758629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1526289856410585e-05, + "grad_norm": 28.687231063842773, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.852828860282898, + "num_tokens": 501935437.0, + "step": 13160 + }, + { + "epoch": 
1.6742144765297036, + "ewc_loss": 0.04306831210851669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1534155166591518e-05, + "grad_norm": 28.451915740966797, + "learning_rate": 1e-06, + "loss": 0.4693, + "mean_token_accuracy": 0.8536819815635681, + "num_tokens": 501980134.0, + "step": 13161 + }, + { + "epoch": 1.674341686808294, + "ewc_loss": 0.042976733297109604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.148836756532546e-05, + "grad_norm": 28.582773208618164, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8596060276031494, + "num_tokens": 502015977.0, + "step": 13162 + }, + { + "epoch": 1.6744688970868846, + "ewc_loss": 0.04309587553143501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1547937649302185e-05, + "grad_norm": 28.522708892822266, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.869316041469574, + "num_tokens": 502061717.0, + "step": 13163 + }, + { + "epoch": 1.6745961073654752, + "ewc_loss": 0.0430527999997139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15264008147642e-05, + "grad_norm": 28.685949325561523, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8648852109909058, + "num_tokens": 502101594.0, + "step": 13164 + }, + { + "epoch": 1.6747233176440657, + "ewc_loss": 0.04311713948845863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155856964236591e-05, + "grad_norm": 28.57282066345215, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8669765591621399, + "num_tokens": 502136521.0, + "step": 13165 + }, + { + "epoch": 1.674850527922656, + "ewc_loss": 0.043034542351961136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1517271306947805e-05, + "grad_norm": 28.60678482055664, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.8461292386054993, + "num_tokens": 502174821.0, + "step": 13166 + }, + { + "epoch": 1.6749777382012465, + "ewc_loss": 0.04306592047214508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1532960090553388e-05, + "grad_norm": 28.583847045898438, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8603250980377197, + "num_tokens": 502210615.0, + "step": 13167 + }, + { + "epoch": 1.675104948479837, + "ewc_loss": 0.04302726313471794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.151363150915131e-05, + "grad_norm": 28.48306655883789, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8658020496368408, + "num_tokens": 502248220.0, + "step": 13168 + }, + { + "epoch": 1.6752321587584276, + "ewc_loss": 0.043075986206531525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1537993234233e-05, + "grad_norm": 28.497278213500977, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8636188507080078, + "num_tokens": 502290686.0, + "step": 13169 + }, + { + "epoch": 1.675359369037018, + "ewc_loss": 0.043114740401506424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1557370928348973e-05, + "grad_norm": 28.586566925048828, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8547155261039734, + "num_tokens": 502331789.0, + "step": 13170 + }, + { + "epoch": 1.6754865793156086, + "ewc_loss": 0.043092161417007446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1546080461121164e-05, + "grad_norm": 28.442459106445312, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8716718554496765, + "num_tokens": 502371490.0, + "step": 13171 + }, + { + "epoch": 1.6756137895941992, + "ewc_loss": 0.04297730326652527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1488651327672414e-05, + "grad_norm": 28.543237686157227, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.865250825881958, + "num_tokens": 502415606.0, + "step": 13172 + }, + { + "epoch": 1.6757409998727897, + "ewc_loss": 0.04319646582007408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1598232706310228e-05, + "grad_norm": 28.492326736450195, + "learning_rate": 1e-06, + "loss": 0.4395, + 
"mean_token_accuracy": 0.8643258810043335, + "num_tokens": 502456203.0, + "step": 13173 + }, + { + "epoch": 1.6758682101513802, + "ewc_loss": 0.043071720749139786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153585955966264e-05, + "grad_norm": 28.509502410888672, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8815553188323975, + "num_tokens": 502491508.0, + "step": 13174 + }, + { + "epoch": 1.6759954204299707, + "ewc_loss": 0.04318970441818237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159485302399844e-05, + "grad_norm": 28.54083824157715, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8539519906044006, + "num_tokens": 502533408.0, + "step": 13175 + }, + { + "epoch": 1.6761226307085613, + "ewc_loss": 0.043063245713710785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1531623133341782e-05, + "grad_norm": 28.539958953857422, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8592332005500793, + "num_tokens": 502569824.0, + "step": 13176 + }, + { + "epoch": 1.6762498409871518, + "ewc_loss": 0.04306071624159813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1530358935706317e-05, + "grad_norm": 28.41104507446289, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8737994432449341, + "num_tokens": 502604776.0, + "step": 13177 + }, + { + "epoch": 1.6763770512657423, + "ewc_loss": 0.043127089738845825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1563544578384608e-05, + "grad_norm": 28.5628719329834, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8482929468154907, + "num_tokens": 502645732.0, + "step": 13178 + }, + { + "epoch": 1.6765042615443329, + "ewc_loss": 0.043113354593515396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1556677893386222e-05, + "grad_norm": 28.434410095214844, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8692748546600342, + "num_tokens": 502686044.0, + "step": 13179 + }, + { + "epoch": 
1.6766314718229234, + "ewc_loss": 0.04303080588579178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1515403204830363e-05, + "grad_norm": 28.47663116455078, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.856818437576294, + "num_tokens": 502728288.0, + "step": 13180 + }, + { + "epoch": 1.676758682101514, + "ewc_loss": 0.04320938512682915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.160469193768222e-05, + "grad_norm": 28.651784896850586, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8581815958023071, + "num_tokens": 502760772.0, + "step": 13181 + }, + { + "epoch": 1.6768858923801044, + "ewc_loss": 0.04297836124897003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1489180653588846e-05, + "grad_norm": 28.49078369140625, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8652170896530151, + "num_tokens": 502791585.0, + "step": 13182 + }, + { + "epoch": 1.677013102658695, + "ewc_loss": 0.04312768206000328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1563841073657386e-05, + "grad_norm": 28.628910064697266, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8825809955596924, + "num_tokens": 502826339.0, + "step": 13183 + }, + { + "epoch": 1.6771403129372855, + "ewc_loss": 0.0431598462164402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1579922758974135e-05, + "grad_norm": 28.587844848632812, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8827111124992371, + "num_tokens": 502868716.0, + "step": 13184 + }, + { + "epoch": 1.6772675232158758, + "ewc_loss": 0.043015189468860626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.150759428332094e-05, + "grad_norm": 28.605255126953125, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8654999732971191, + "num_tokens": 502907992.0, + "step": 13185 + }, + { + "epoch": 1.6773947334944663, + "ewc_loss": 0.043109141290187836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1554571503656916e-05, + "grad_norm": 28.59742546081543, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8605421185493469, + "num_tokens": 502947197.0, + "step": 13186 + }, + { + "epoch": 1.6775219437730569, + "ewc_loss": 0.043028756976127625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1514379113796167e-05, + "grad_norm": 28.48383903503418, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.875474214553833, + "num_tokens": 502985890.0, + "step": 13187 + }, + { + "epoch": 1.6776491540516474, + "ewc_loss": 0.04308445751667023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1542229660553858e-05, + "grad_norm": 28.53789710998535, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8600714206695557, + "num_tokens": 503033476.0, + "step": 13188 + }, + { + "epoch": 1.677776364330238, + "ewc_loss": 0.04305571690201759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1527857825276442e-05, + "grad_norm": 28.556169509887695, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8742958307266235, + "num_tokens": 503068002.0, + "step": 13189 + }, + { + "epoch": 1.6779035746088284, + "ewc_loss": 0.04310048744082451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155024412786588e-05, + "grad_norm": 28.47210121154785, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8588351011276245, + "num_tokens": 503110059.0, + "step": 13190 + }, + { + "epoch": 1.6780307848874187, + "ewc_loss": 0.04305845871567726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1529229343286715e-05, + "grad_norm": 28.556209564208984, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8590317964553833, + "num_tokens": 503146617.0, + "step": 13191 + }, + { + "epoch": 1.6781579951660093, + "ewc_loss": 0.04313620552420616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1568102965829894e-05, + "grad_norm": 28.615510940551758, + "learning_rate": 1e-06, + "loss": 0.439, + 
"mean_token_accuracy": 0.8627043962478638, + "num_tokens": 503186436.0, + "step": 13192 + }, + { + "epoch": 1.6782852054445998, + "ewc_loss": 0.043129730969667435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1564865164691582e-05, + "grad_norm": 28.58708953857422, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8710634112358093, + "num_tokens": 503227983.0, + "step": 13193 + }, + { + "epoch": 1.6784124157231903, + "ewc_loss": 0.04302925616502762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1514628315344453e-05, + "grad_norm": 28.609760284423828, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8529778122901917, + "num_tokens": 503269301.0, + "step": 13194 + }, + { + "epoch": 1.6785396260017809, + "ewc_loss": 0.04309966787695885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154983303626068e-05, + "grad_norm": 28.534713745117188, + "learning_rate": 1e-06, + "loss": 0.4952, + "mean_token_accuracy": 0.8474780321121216, + "num_tokens": 503308919.0, + "step": 13195 + }, + { + "epoch": 1.6786668362803714, + "ewc_loss": 0.04302125796675682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1510628357646056e-05, + "grad_norm": 28.586095809936523, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8752878904342651, + "num_tokens": 503345030.0, + "step": 13196 + }, + { + "epoch": 1.678794046558962, + "ewc_loss": 0.04310827702283859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1554138584178872e-05, + "grad_norm": 28.68766975402832, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8580429553985596, + "num_tokens": 503379142.0, + "step": 13197 + }, + { + "epoch": 1.6789212568375524, + "ewc_loss": 0.043041981756687164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1520991140278056e-05, + "grad_norm": 28.55137825012207, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8662620186805725, + "num_tokens": 503411810.0, + "step": 13198 + }, + { + "epoch": 
1.679048467116143, + "ewc_loss": 0.0430925227701664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1546260541072115e-05, + "grad_norm": 28.615652084350586, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8619056940078735, + "num_tokens": 503450266.0, + "step": 13199 + }, + { + "epoch": 1.6791756773947335, + "ewc_loss": 0.043095167726278305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1547584765357897e-05, + "grad_norm": 28.481834411621094, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8734787702560425, + "num_tokens": 503491015.0, + "step": 13200 + }, + { + "epoch": 1.679302887673324, + "ewc_loss": 0.043071430176496506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153571585949976e-05, + "grad_norm": 28.555639266967773, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8745689392089844, + "num_tokens": 503531964.0, + "step": 13201 + }, + { + "epoch": 1.6794300979519146, + "ewc_loss": 0.04312513396143913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1562567781074904e-05, + "grad_norm": 28.526805877685547, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8695845007896423, + "num_tokens": 503576069.0, + "step": 13202 + }, + { + "epoch": 1.679557308230505, + "ewc_loss": 0.04310069978237152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1550349629251286e-05, + "grad_norm": 28.630157470703125, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8668782711029053, + "num_tokens": 503607895.0, + "step": 13203 + }, + { + "epoch": 1.6796845185090956, + "ewc_loss": 0.0432036817073822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1601841581286862e-05, + "grad_norm": 28.549610137939453, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8603202104568481, + "num_tokens": 503636891.0, + "step": 13204 + }, + { + "epoch": 1.6798117287876861, + "ewc_loss": 0.043120238929986954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.156011942133773e-05, + "grad_norm": 28.550447463989258, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8797954320907593, + "num_tokens": 503682037.0, + "step": 13205 + }, + { + "epoch": 1.6799389390662767, + "ewc_loss": 0.04313060641288757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1565303541137837e-05, + "grad_norm": 28.551410675048828, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8687899112701416, + "num_tokens": 503724008.0, + "step": 13206 + }, + { + "epoch": 1.6800661493448672, + "ewc_loss": 0.043157100677490234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.157855124096386e-05, + "grad_norm": 28.621721267700195, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.873860239982605, + "num_tokens": 503759008.0, + "step": 13207 + }, + { + "epoch": 1.6801933596234577, + "ewc_loss": 0.043103210628032684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1551604731939733e-05, + "grad_norm": 28.5373477935791, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.862180233001709, + "num_tokens": 503796599.0, + "step": 13208 + }, + { + "epoch": 1.680320569902048, + "ewc_loss": 0.043219584971666336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.160979238396976e-05, + "grad_norm": 28.642065048217773, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.863624095916748, + "num_tokens": 503835539.0, + "step": 13209 + }, + { + "epoch": 1.6804477801806386, + "ewc_loss": 0.04316315799951553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1581579858320765e-05, + "grad_norm": 28.615341186523438, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8626543283462524, + "num_tokens": 503875933.0, + "step": 13210 + }, + { + "epoch": 1.680574990459229, + "ewc_loss": 0.043034862726926804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1517431378015317e-05, + "grad_norm": 28.530439376831055, + "learning_rate": 1e-06, + "loss": 0.4405, + 
"mean_token_accuracy": 0.862729549407959, + "num_tokens": 503913740.0, + "step": 13211 + }, + { + "epoch": 1.6807022007378196, + "ewc_loss": 0.04310145229101181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155072616005782e-05, + "grad_norm": 28.57815933227539, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8692618608474731, + "num_tokens": 503955105.0, + "step": 13212 + }, + { + "epoch": 1.6808294110164101, + "ewc_loss": 0.04314400255680084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1572001060121693e-05, + "grad_norm": 28.589780807495117, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8655552268028259, + "num_tokens": 503987641.0, + "step": 13213 + }, + { + "epoch": 1.6809566212950007, + "ewc_loss": 0.043153055012226105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1576526705757715e-05, + "grad_norm": 28.55791473388672, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8571633100509644, + "num_tokens": 504028699.0, + "step": 13214 + }, + { + "epoch": 1.681083831573591, + "ewc_loss": 0.04319646954536438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159823452529963e-05, + "grad_norm": 28.67367172241211, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8597074747085571, + "num_tokens": 504065597.0, + "step": 13215 + }, + { + "epoch": 1.6812110418521815, + "ewc_loss": 0.04313569515943527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1567848307313398e-05, + "grad_norm": 28.572711944580078, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8517369031906128, + "num_tokens": 504105968.0, + "step": 13216 + }, + { + "epoch": 1.681338252130772, + "ewc_loss": 0.04304949939250946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1524749172385782e-05, + "grad_norm": 28.583534240722656, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8682759404182434, + "num_tokens": 504143743.0, + "step": 13217 + }, + { + "epoch": 
1.6814654624093626, + "ewc_loss": 0.04313677176833153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1568386728176847e-05, + "grad_norm": 28.6020450592041, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8677821159362793, + "num_tokens": 504177023.0, + "step": 13218 + }, + { + "epoch": 1.681592672687953, + "ewc_loss": 0.04309866577386856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1549332814174704e-05, + "grad_norm": 28.443361282348633, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8630998134613037, + "num_tokens": 504210492.0, + "step": 13219 + }, + { + "epoch": 1.6817198829665436, + "ewc_loss": 0.04313064366579056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1565321731031872e-05, + "grad_norm": 28.637361526489258, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8663042187690735, + "num_tokens": 504248322.0, + "step": 13220 + }, + { + "epoch": 1.6818470932451342, + "ewc_loss": 0.043213315308094025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1606658265227452e-05, + "grad_norm": 28.62672233581543, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.850892186164856, + "num_tokens": 504285915.0, + "step": 13221 + }, + { + "epoch": 1.6819743035237247, + "ewc_loss": 0.04315037652850151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1575187929556705e-05, + "grad_norm": 28.562742233276367, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.863264799118042, + "num_tokens": 504325578.0, + "step": 13222 + }, + { + "epoch": 1.6821015138023152, + "ewc_loss": 0.0431937538087368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1596877559204586e-05, + "grad_norm": 28.643577575683594, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8857103586196899, + "num_tokens": 504362876.0, + "step": 13223 + }, + { + "epoch": 1.6822287240809057, + "ewc_loss": 0.043208006769418716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1604002540698275e-05, + "grad_norm": 28.68740463256836, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.863377034664154, + "num_tokens": 504402303.0, + "step": 13224 + }, + { + "epoch": 1.6823559343594963, + "ewc_loss": 0.043139249086380005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1569623640971258e-05, + "grad_norm": 28.67710304260254, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8665335178375244, + "num_tokens": 504440039.0, + "step": 13225 + }, + { + "epoch": 1.6824831446380868, + "ewc_loss": 0.04316670447587967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.158335155399982e-05, + "grad_norm": 28.65266227722168, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8564746379852295, + "num_tokens": 504485102.0, + "step": 13226 + }, + { + "epoch": 1.6826103549166773, + "ewc_loss": 0.04318605363368988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159302675863728e-05, + "grad_norm": 28.60475730895996, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8652861714363098, + "num_tokens": 504522762.0, + "step": 13227 + }, + { + "epoch": 1.6827375651952678, + "ewc_loss": 0.04308057576417923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154028697987087e-05, + "grad_norm": 28.71227264404297, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8707067966461182, + "num_tokens": 504562818.0, + "step": 13228 + }, + { + "epoch": 1.6828647754738584, + "ewc_loss": 0.04310924932360649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155462425434962e-05, + "grad_norm": 28.5821475982666, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8555260896682739, + "num_tokens": 504602914.0, + "step": 13229 + }, + { + "epoch": 1.682991985752449, + "ewc_loss": 0.04311049357056618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1555246348725632e-05, + "grad_norm": 28.699676513671875, + "learning_rate": 1e-06, + "loss": 0.4161, + 
"mean_token_accuracy": 0.8726159334182739, + "num_tokens": 504641376.0, + "step": 13230 + }, + { + "epoch": 1.6831191960310394, + "ewc_loss": 0.04312627762556076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1563138943747617e-05, + "grad_norm": 28.59044647216797, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8728514909744263, + "num_tokens": 504675031.0, + "step": 13231 + }, + { + "epoch": 1.68324640630963, + "ewc_loss": 0.04305226728320122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.152613342332188e-05, + "grad_norm": 28.654356002807617, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8717049360275269, + "num_tokens": 504709624.0, + "step": 13232 + }, + { + "epoch": 1.6833736165882205, + "ewc_loss": 0.04314778745174408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1573894628090784e-05, + "grad_norm": 28.660877227783203, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8698312640190125, + "num_tokens": 504749310.0, + "step": 13233 + }, + { + "epoch": 1.6835008268668108, + "ewc_loss": 0.04307517036795616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1537585780606605e-05, + "grad_norm": 28.59193992614746, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8669489622116089, + "num_tokens": 504791700.0, + "step": 13234 + }, + { + "epoch": 1.6836280371454013, + "ewc_loss": 0.04308591037988663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1542955437325872e-05, + "grad_norm": 28.577911376953125, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8544539213180542, + "num_tokens": 504833246.0, + "step": 13235 + }, + { + "epoch": 1.6837552474239919, + "ewc_loss": 0.043086398392915726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1543199181905948e-05, + "grad_norm": 28.62082290649414, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8701214790344238, + "num_tokens": 504872901.0, + "step": 13236 + }, + { + "epoch": 
1.6838824577025824, + "ewc_loss": 0.04310612380504608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1553061742451973e-05, + "grad_norm": 28.649904251098633, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8605138659477234, + "num_tokens": 504906138.0, + "step": 13237 + }, + { + "epoch": 1.684009667981173, + "ewc_loss": 0.04308290034532547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1541449314099737e-05, + "grad_norm": 28.621858596801758, + "learning_rate": 1e-06, + "loss": 0.5068, + "mean_token_accuracy": 0.8455753326416016, + "num_tokens": 504947147.0, + "step": 13238 + }, + { + "epoch": 1.6841368782597634, + "ewc_loss": 0.04305023327469826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1525116608245298e-05, + "grad_norm": 28.47738265991211, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8675296306610107, + "num_tokens": 504985332.0, + "step": 13239 + }, + { + "epoch": 1.6842640885383537, + "ewc_loss": 0.043137077242136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1568539523286745e-05, + "grad_norm": 28.724061965942383, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.857792854309082, + "num_tokens": 505027995.0, + "step": 13240 + }, + { + "epoch": 1.6843912988169443, + "ewc_loss": 0.043149836361408234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.157491871912498e-05, + "grad_norm": 28.463329315185547, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8693303465843201, + "num_tokens": 505065311.0, + "step": 13241 + }, + { + "epoch": 1.6845185090955348, + "ewc_loss": 0.043123260140419006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1561629182542674e-05, + "grad_norm": 28.588748931884766, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.861614465713501, + "num_tokens": 505105810.0, + "step": 13242 + }, + { + "epoch": 1.6846457193741253, + "ewc_loss": 0.04323040321469307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1615202058455907e-05, + "grad_norm": 28.560443878173828, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8713250160217285, + "num_tokens": 505141003.0, + "step": 13243 + }, + { + "epoch": 1.6847729296527159, + "ewc_loss": 0.04311623051762581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1558114895015024e-05, + "grad_norm": 28.610177993774414, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8552894592285156, + "num_tokens": 505188025.0, + "step": 13244 + }, + { + "epoch": 1.6849001399313064, + "ewc_loss": 0.04316508769989014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1582543922704645e-05, + "grad_norm": 28.652502059936523, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8794247508049011, + "num_tokens": 505226101.0, + "step": 13245 + }, + { + "epoch": 1.685027350209897, + "ewc_loss": 0.04317013919353485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.158507049898617e-05, + "grad_norm": 28.629440307617188, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8554635047912598, + "num_tokens": 505264880.0, + "step": 13246 + }, + { + "epoch": 1.6851545604884874, + "ewc_loss": 0.04314976930618286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1574884158326313e-05, + "grad_norm": 28.606220245361328, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8590108752250671, + "num_tokens": 505306208.0, + "step": 13247 + }, + { + "epoch": 1.685281770767078, + "ewc_loss": 0.04316982626914978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1584914065897465e-05, + "grad_norm": 28.76480484008789, + "learning_rate": 1e-06, + "loss": 0.3661, + "mean_token_accuracy": 0.8868769407272339, + "num_tokens": 505346024.0, + "step": 13248 + }, + { + "epoch": 1.6854089810456685, + "ewc_loss": 0.04312858358025551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1564292183029465e-05, + "grad_norm": 28.557546615600586, + "learning_rate": 1e-06, + "loss": 0.3617, + 
"mean_token_accuracy": 0.8884262442588806, + "num_tokens": 505383644.0, + "step": 13249 + }, + { + "epoch": 1.685536191324259, + "ewc_loss": 0.04302242025732994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.151121043425519e-05, + "grad_norm": 28.5689697265625, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8695249557495117, + "num_tokens": 505420365.0, + "step": 13250 + }, + { + "epoch": 1.6856634016028496, + "ewc_loss": 0.04316147416830063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1580737666226923e-05, + "grad_norm": 28.606595993041992, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8673238158226013, + "num_tokens": 505458211.0, + "step": 13251 + }, + { + "epoch": 1.68579061188144, + "ewc_loss": 0.043119609355926514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1559804736170918e-05, + "grad_norm": 28.662185668945312, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8606648445129395, + "num_tokens": 505501649.0, + "step": 13252 + }, + { + "epoch": 1.6859178221600306, + "ewc_loss": 0.0431935079395771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1596753867925145e-05, + "grad_norm": 28.597579956054688, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8676261305809021, + "num_tokens": 505538763.0, + "step": 13253 + }, + { + "epoch": 1.6860450324386211, + "ewc_loss": 0.04306617006659508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1533085600822233e-05, + "grad_norm": 28.678396224975586, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8880168795585632, + "num_tokens": 505574562.0, + "step": 13254 + }, + { + "epoch": 1.6861722427172117, + "ewc_loss": 0.04310610890388489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155305446649436e-05, + "grad_norm": 28.490079879760742, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8792029619216919, + "num_tokens": 505609718.0, + "step": 13255 + }, + { + "epoch": 
1.6862994529958022, + "ewc_loss": 0.04304729774594307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1523648683796637e-05, + "grad_norm": 28.627273559570312, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8791231513023376, + "num_tokens": 505644337.0, + "step": 13256 + }, + { + "epoch": 1.6864266632743927, + "ewc_loss": 0.04309072345495224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1545361960306764e-05, + "grad_norm": 28.443571090698242, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8764075636863708, + "num_tokens": 505681929.0, + "step": 13257 + }, + { + "epoch": 1.686553873552983, + "ewc_loss": 0.04308446869254112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154223511752207e-05, + "grad_norm": 28.68303871154785, + "learning_rate": 1e-06, + "loss": 0.5027, + "mean_token_accuracy": 0.8437456488609314, + "num_tokens": 505719373.0, + "step": 13258 + }, + { + "epoch": 1.6866810838315736, + "ewc_loss": 0.04320518672466278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1602592823910527e-05, + "grad_norm": 28.41470718383789, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8516526818275452, + "num_tokens": 505757767.0, + "step": 13259 + }, + { + "epoch": 1.686808294110164, + "ewc_loss": 0.04313623905181885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1568119336734526e-05, + "grad_norm": 28.749744415283203, + "learning_rate": 1e-06, + "loss": 0.5595, + "mean_token_accuracy": 0.8360986709594727, + "num_tokens": 505794777.0, + "step": 13260 + }, + { + "epoch": 1.6869355043887546, + "ewc_loss": 0.04326950013637543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1634750737575814e-05, + "grad_norm": 28.49860191345215, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.872873067855835, + "num_tokens": 505832672.0, + "step": 13261 + }, + { + "epoch": 1.6870627146673451, + "ewc_loss": 0.04303276538848877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.151638182112947e-05, + "grad_norm": 28.582490921020508, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.854712963104248, + "num_tokens": 505873142.0, + "step": 13262 + }, + { + "epoch": 1.6871899249459357, + "ewc_loss": 0.043196436017751694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1598218154395e-05, + "grad_norm": 28.51080894470215, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8692948222160339, + "num_tokens": 505911994.0, + "step": 13263 + }, + { + "epoch": 1.687317135224526, + "ewc_loss": 0.04314931854605675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1574658603640273e-05, + "grad_norm": 28.670330047607422, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8702691793441772, + "num_tokens": 505948874.0, + "step": 13264 + }, + { + "epoch": 1.6874443455031165, + "ewc_loss": 0.043178707361221313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.158935421903152e-05, + "grad_norm": 28.531885147094727, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8724815249443054, + "num_tokens": 505989679.0, + "step": 13265 + }, + { + "epoch": 1.687571555781707, + "ewc_loss": 0.043133340775966644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15666696021799e-05, + "grad_norm": 28.657859802246094, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8737542629241943, + "num_tokens": 506033209.0, + "step": 13266 + }, + { + "epoch": 1.6876987660602976, + "ewc_loss": 0.043209485709667206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1604742869385518e-05, + "grad_norm": 28.615415573120117, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8754905462265015, + "num_tokens": 506065379.0, + "step": 13267 + }, + { + "epoch": 1.687825976338888, + "ewc_loss": 0.04312889277935028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1564446797128767e-05, + "grad_norm": 28.625905990600586, + "learning_rate": 1e-06, + "loss": 0.4587, + 
"mean_token_accuracy": 0.8576826453208923, + "num_tokens": 506099484.0, + "step": 13268 + }, + { + "epoch": 1.6879531866174786, + "ewc_loss": 0.043320003896951675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1660001948475838e-05, + "grad_norm": 28.703598022460938, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8503006100654602, + "num_tokens": 506136559.0, + "step": 13269 + }, + { + "epoch": 1.6880803968960691, + "ewc_loss": 0.04313935711979866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1569678210653365e-05, + "grad_norm": 28.549354553222656, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8646467328071594, + "num_tokens": 506172997.0, + "step": 13270 + }, + { + "epoch": 1.6882076071746597, + "ewc_loss": 0.043154604732990265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1577301595243625e-05, + "grad_norm": 28.56599235534668, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8800109624862671, + "num_tokens": 506209858.0, + "step": 13271 + }, + { + "epoch": 1.6883348174532502, + "ewc_loss": 0.043130770325660706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1565385395660996e-05, + "grad_norm": 28.546409606933594, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8778256177902222, + "num_tokens": 506242260.0, + "step": 13272 + }, + { + "epoch": 1.6884620277318407, + "ewc_loss": 0.043211810290813446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1605905203614384e-05, + "grad_norm": 28.59269142150879, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.853588879108429, + "num_tokens": 506276856.0, + "step": 13273 + }, + { + "epoch": 1.6885892380104313, + "ewc_loss": 0.043263137340545654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1631569325109012e-05, + "grad_norm": 28.56253433227539, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8853400349617004, + "num_tokens": 506311469.0, + "step": 13274 + }, + { + "epoch": 
1.6887164482890218, + "ewc_loss": 0.04324737936258316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1623689463012852e-05, + "grad_norm": 28.62438201904297, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8781758546829224, + "num_tokens": 506348240.0, + "step": 13275 + }, + { + "epoch": 1.6888436585676123, + "ewc_loss": 0.04323331266641617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.161665543098934e-05, + "grad_norm": 28.581884384155273, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8677685856819153, + "num_tokens": 506387203.0, + "step": 13276 + }, + { + "epoch": 1.6889708688462028, + "ewc_loss": 0.043248455971479416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.16242278838763e-05, + "grad_norm": 28.6350040435791, + "learning_rate": 1e-06, + "loss": 0.5095, + "mean_token_accuracy": 0.8446698188781738, + "num_tokens": 506421869.0, + "step": 13277 + }, + { + "epoch": 1.6890980791247934, + "ewc_loss": 0.04334721714258194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167360798921436e-05, + "grad_norm": 28.610097885131836, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.865982711315155, + "num_tokens": 506458873.0, + "step": 13278 + }, + { + "epoch": 1.689225289403384, + "ewc_loss": 0.04328960180282593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1644800654030405e-05, + "grad_norm": 28.662702560424805, + "learning_rate": 1e-06, + "loss": 0.4714, + "mean_token_accuracy": 0.8537486791610718, + "num_tokens": 506495947.0, + "step": 13279 + }, + { + "epoch": 1.6893524996819744, + "ewc_loss": 0.04335939511656761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1679697965737432e-05, + "grad_norm": 28.72091293334961, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8488616943359375, + "num_tokens": 506533098.0, + "step": 13280 + }, + { + "epoch": 1.689479709960565, + "ewc_loss": 0.04323362186551094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.161681004508864e-05, + "grad_norm": 28.628755569458008, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8772478103637695, + "num_tokens": 506571346.0, + "step": 13281 + }, + { + "epoch": 1.6896069202391555, + "ewc_loss": 0.043341461569070816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167073034797795e-05, + "grad_norm": 28.757583618164062, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8677353858947754, + "num_tokens": 506607912.0, + "step": 13282 + }, + { + "epoch": 1.6897341305177458, + "ewc_loss": 0.04325345158576965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.162672535632737e-05, + "grad_norm": 28.616168975830078, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.856503427028656, + "num_tokens": 506654258.0, + "step": 13283 + }, + { + "epoch": 1.6898613407963363, + "ewc_loss": 0.04325253516435623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1626266970997676e-05, + "grad_norm": 28.591392517089844, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8730340003967285, + "num_tokens": 506693436.0, + "step": 13284 + }, + { + "epoch": 1.6899885510749268, + "ewc_loss": 0.04325179383158684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1625897716148756e-05, + "grad_norm": 28.6843318939209, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8689348101615906, + "num_tokens": 506737383.0, + "step": 13285 + }, + { + "epoch": 1.6901157613535174, + "ewc_loss": 0.04325491189956665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1627456590067595e-05, + "grad_norm": 28.660646438598633, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8759269118309021, + "num_tokens": 506772282.0, + "step": 13286 + }, + { + "epoch": 1.690242971632108, + "ewc_loss": 0.043246787041425705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1623392967740074e-05, + "grad_norm": 28.64472007751465, + "learning_rate": 1e-06, + "loss": 0.3926, + 
"mean_token_accuracy": 0.8784126043319702, + "num_tokens": 506807208.0, + "step": 13287 + }, + { + "epoch": 1.6903701819106984, + "ewc_loss": 0.04313890263438225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.156945083697792e-05, + "grad_norm": 28.69312286376953, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8734128475189209, + "num_tokens": 506847952.0, + "step": 13288 + }, + { + "epoch": 1.6904973921892887, + "ewc_loss": 0.04323058947920799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1615294826915488e-05, + "grad_norm": 28.735193252563477, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8697003722190857, + "num_tokens": 506888265.0, + "step": 13289 + }, + { + "epoch": 1.6906246024678793, + "ewc_loss": 0.04324152320623398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1620760890073143e-05, + "grad_norm": 28.84758186340332, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8701577186584473, + "num_tokens": 506926096.0, + "step": 13290 + }, + { + "epoch": 1.6907518127464698, + "ewc_loss": 0.04311690106987953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155845140805468e-05, + "grad_norm": 28.698217391967773, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8595393896102905, + "num_tokens": 506962252.0, + "step": 13291 + }, + { + "epoch": 1.6908790230250603, + "ewc_loss": 0.043092031031847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1546014977502637e-05, + "grad_norm": 28.70918083190918, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8717731833457947, + "num_tokens": 507004709.0, + "step": 13292 + }, + { + "epoch": 1.6910062333036509, + "ewc_loss": 0.04312925040721893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1564625058090314e-05, + "grad_norm": 28.80217170715332, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8822773098945618, + "num_tokens": 507041041.0, + "step": 13293 + }, + { + "epoch": 
1.6911334435822414, + "ewc_loss": 0.04305662214756012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1528310753637925e-05, + "grad_norm": 28.657503128051758, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8602590560913086, + "num_tokens": 507080251.0, + "step": 13294 + }, + { + "epoch": 1.691260653860832, + "ewc_loss": 0.04308309033513069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1541545720538124e-05, + "grad_norm": 28.686851501464844, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8656527996063232, + "num_tokens": 507120813.0, + "step": 13295 + }, + { + "epoch": 1.6913878641394224, + "ewc_loss": 0.04306903854012489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1534518964472227e-05, + "grad_norm": 28.67961311340332, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8543683290481567, + "num_tokens": 507160284.0, + "step": 13296 + }, + { + "epoch": 1.691515074418013, + "ewc_loss": 0.043049149215221405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.152457454940304e-05, + "grad_norm": 28.632301330566406, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8593698740005493, + "num_tokens": 507205069.0, + "step": 13297 + }, + { + "epoch": 1.6916422846966035, + "ewc_loss": 0.04309580847620964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.154790490749292e-05, + "grad_norm": 28.564279556274414, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8752788305282593, + "num_tokens": 507244689.0, + "step": 13298 + }, + { + "epoch": 1.691769494975194, + "ewc_loss": 0.04306478425860405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153239256585948e-05, + "grad_norm": 28.69398307800293, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8685883283615112, + "num_tokens": 507283917.0, + "step": 13299 + }, + { + "epoch": 1.6918967052537845, + "ewc_loss": 0.04311443492770195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1557218133239076e-05, + "grad_norm": 28.599864959716797, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8691277503967285, + "num_tokens": 507327937.0, + "step": 13300 + }, + { + "epoch": 1.692023915532375, + "ewc_loss": 0.043042704463005066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1521353119169362e-05, + "grad_norm": 28.597911834716797, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8810978531837463, + "num_tokens": 507369521.0, + "step": 13301 + }, + { + "epoch": 1.6921511258109656, + "ewc_loss": 0.04309660568833351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1548303266172297e-05, + "grad_norm": 28.61858367919922, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8773916959762573, + "num_tokens": 507408157.0, + "step": 13302 + }, + { + "epoch": 1.6922783360895561, + "ewc_loss": 0.043071143329143524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.153557215933688e-05, + "grad_norm": 28.566251754760742, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8725426197052002, + "num_tokens": 507448003.0, + "step": 13303 + }, + { + "epoch": 1.6924055463681467, + "ewc_loss": 0.042987160384655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1493580788956024e-05, + "grad_norm": 28.56938362121582, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8747882843017578, + "num_tokens": 507489970.0, + "step": 13304 + }, + { + "epoch": 1.6925327566467372, + "ewc_loss": 0.0431118868291378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.155594302166719e-05, + "grad_norm": 28.593717575073242, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8690681457519531, + "num_tokens": 507524536.0, + "step": 13305 + }, + { + "epoch": 1.6926599669253277, + "ewc_loss": 0.04310053214430809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1550265955738723e-05, + "grad_norm": 28.734668731689453, + "learning_rate": 1e-06, + "loss": 0.3531, + 
"mean_token_accuracy": 0.8875365853309631, + "num_tokens": 507561575.0, + "step": 13306 + }, + { + "epoch": 1.692787177203918, + "ewc_loss": 0.0430840328335762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1542016838793643e-05, + "grad_norm": 28.691879272460938, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8695425987243652, + "num_tokens": 507597212.0, + "step": 13307 + }, + { + "epoch": 1.6929143874825086, + "ewc_loss": 0.043053749948740005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1526875570998527e-05, + "grad_norm": 28.692676544189453, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8706809282302856, + "num_tokens": 507634251.0, + "step": 13308 + }, + { + "epoch": 1.693041597761099, + "ewc_loss": 0.04300958290696144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1504791220650077e-05, + "grad_norm": 28.612552642822266, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8733485341072083, + "num_tokens": 507671345.0, + "step": 13309 + }, + { + "epoch": 1.6931688080396896, + "ewc_loss": 0.042963799089193344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1481899239006452e-05, + "grad_norm": 28.572118759155273, + "learning_rate": 1e-06, + "loss": 0.4713, + "mean_token_accuracy": 0.8561300039291382, + "num_tokens": 507709797.0, + "step": 13310 + }, + { + "epoch": 1.6932960183182801, + "ewc_loss": 0.04303136095404625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1515679691219702e-05, + "grad_norm": 28.78241539001465, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8667638301849365, + "num_tokens": 507747047.0, + "step": 13311 + }, + { + "epoch": 1.6934232285968707, + "ewc_loss": 0.04305513575673103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1527568605961278e-05, + "grad_norm": 28.530887603759766, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8673577904701233, + "num_tokens": 507782187.0, + "step": 13312 + }, + { + "epoch": 
1.693550438875461, + "ewc_loss": 0.043104227632284164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1552114048972726e-05, + "grad_norm": 28.65520668029785, + "learning_rate": 1e-06, + "loss": 0.4772, + "mean_token_accuracy": 0.8543239235877991, + "num_tokens": 507826714.0, + "step": 13313 + }, + { + "epoch": 1.6936776491540515, + "ewc_loss": 0.04318265989422798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159132964152377e-05, + "grad_norm": 28.63669204711914, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8589413166046143, + "num_tokens": 507869423.0, + "step": 13314 + }, + { + "epoch": 1.693804859432642, + "ewc_loss": 0.04302390292286873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1511950762942433e-05, + "grad_norm": 28.53387451171875, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8767683506011963, + "num_tokens": 507904711.0, + "step": 13315 + }, + { + "epoch": 1.6939320697112326, + "ewc_loss": 0.04318867623806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1594338249997236e-05, + "grad_norm": 28.65111541748047, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8622547388076782, + "num_tokens": 507947760.0, + "step": 13316 + }, + { + "epoch": 1.694059279989823, + "ewc_loss": 0.04317711666226387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1588557501672767e-05, + "grad_norm": 28.543758392333984, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8636837005615234, + "num_tokens": 507984249.0, + "step": 13317 + }, + { + "epoch": 1.6941864902684136, + "ewc_loss": 0.04314596578478813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1572983314399607e-05, + "grad_norm": 28.57085418701172, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8638731241226196, + "num_tokens": 508017206.0, + "step": 13318 + }, + { + "epoch": 1.6943137005470041, + "ewc_loss": 0.043226469308137894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.161323391192127e-05, + "grad_norm": 28.674129486083984, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8613994121551514, + "num_tokens": 508056357.0, + "step": 13319 + }, + { + "epoch": 1.6944409108255947, + "ewc_loss": 0.043231479823589325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1615740479319356e-05, + "grad_norm": 28.593204498291016, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8587435483932495, + "num_tokens": 508094870.0, + "step": 13320 + }, + { + "epoch": 1.6945681211041852, + "ewc_loss": 0.04324229434132576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1621146515826695e-05, + "grad_norm": 28.7033748626709, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8659378290176392, + "num_tokens": 508120323.0, + "step": 13321 + }, + { + "epoch": 1.6946953313827757, + "ewc_loss": 0.0431927889585495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1596393708023243e-05, + "grad_norm": 28.579830169677734, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8584383726119995, + "num_tokens": 508160018.0, + "step": 13322 + }, + { + "epoch": 1.6948225416613663, + "ewc_loss": 0.04319494962692261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159747418772895e-05, + "grad_norm": 28.604774475097656, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8847635984420776, + "num_tokens": 508200947.0, + "step": 13323 + }, + { + "epoch": 1.6949497519399568, + "ewc_loss": 0.043295614421367645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1647807443514466e-05, + "grad_norm": 28.725141525268555, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8538839817047119, + "num_tokens": 508246457.0, + "step": 13324 + }, + { + "epoch": 1.6950769622185473, + "ewc_loss": 0.04326895624399185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1634477889165282e-05, + "grad_norm": 28.709871292114258, + "learning_rate": 1e-06, + "loss": 0.4528, + 
"mean_token_accuracy": 0.8622794151306152, + "num_tokens": 508281951.0, + "step": 13325 + }, + { + "epoch": 1.6952041724971378, + "ewc_loss": 0.04320507496595383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.160253825422842e-05, + "grad_norm": 28.65030288696289, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8496153950691223, + "num_tokens": 508322571.0, + "step": 13326 + }, + { + "epoch": 1.6953313827757284, + "ewc_loss": 0.04328979179263115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.164489524147939e-05, + "grad_norm": 28.671945571899414, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8737574219703674, + "num_tokens": 508357347.0, + "step": 13327 + }, + { + "epoch": 1.695458593054319, + "ewc_loss": 0.04327893629670143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.163946737709921e-05, + "grad_norm": 28.706134796142578, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8685476779937744, + "num_tokens": 508401813.0, + "step": 13328 + }, + { + "epoch": 1.6955858033329094, + "ewc_loss": 0.043296851217746735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1648425899911672e-05, + "grad_norm": 28.5809326171875, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8553318977355957, + "num_tokens": 508440457.0, + "step": 13329 + }, + { + "epoch": 1.6957130136115, + "ewc_loss": 0.04327783361077309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1638916223309934e-05, + "grad_norm": 28.716384887695312, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8576330542564392, + "num_tokens": 508472272.0, + "step": 13330 + }, + { + "epoch": 1.6958402238900905, + "ewc_loss": 0.04330166056752205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.165083060390316e-05, + "grad_norm": 28.613628387451172, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8793361186981201, + "num_tokens": 508509352.0, + "step": 13331 + }, + { + "epoch": 
1.6959674341686808, + "ewc_loss": 0.04325246810913086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1626234229188412e-05, + "grad_norm": 28.695716857910156, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8753565549850464, + "num_tokens": 508552515.0, + "step": 13332 + }, + { + "epoch": 1.6960946444472713, + "ewc_loss": 0.043276309967041016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.163815406674985e-05, + "grad_norm": 28.711748123168945, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8791217803955078, + "num_tokens": 508591399.0, + "step": 13333 + }, + { + "epoch": 1.6962218547258618, + "ewc_loss": 0.043213002383708954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1606501832138747e-05, + "grad_norm": 28.587085723876953, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8719905018806458, + "num_tokens": 508630427.0, + "step": 13334 + }, + { + "epoch": 1.6963490650044524, + "ewc_loss": 0.043234530836343765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1617264792439528e-05, + "grad_norm": 28.694055557250977, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8741788268089294, + "num_tokens": 508667469.0, + "step": 13335 + }, + { + "epoch": 1.696476275283043, + "ewc_loss": 0.04326833039522171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1634165022987872e-05, + "grad_norm": 28.696273803710938, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8708521127700806, + "num_tokens": 508706308.0, + "step": 13336 + }, + { + "epoch": 1.6966034855616334, + "ewc_loss": 0.043258294463157654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.162914643122349e-05, + "grad_norm": 28.650747299194336, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8553597927093506, + "num_tokens": 508738968.0, + "step": 13337 + }, + { + "epoch": 1.6967306958402237, + "ewc_loss": 0.043269429355859756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1634714357787743e-05, + "grad_norm": 28.85870361328125, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8725003004074097, + "num_tokens": 508769879.0, + "step": 13338 + }, + { + "epoch": 1.6968579061188143, + "ewc_loss": 0.04325242340564728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.162621240131557e-05, + "grad_norm": 28.67363929748535, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8520700335502625, + "num_tokens": 508812277.0, + "step": 13339 + }, + { + "epoch": 1.6969851163974048, + "ewc_loss": 0.043177489191293716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1588744857581332e-05, + "grad_norm": 28.653501510620117, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.856957733631134, + "num_tokens": 508849293.0, + "step": 13340 + }, + { + "epoch": 1.6971123266759953, + "ewc_loss": 0.04318637400865555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159318682970479e-05, + "grad_norm": 28.722450256347656, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.867985188961029, + "num_tokens": 508885798.0, + "step": 13341 + }, + { + "epoch": 1.6972395369545858, + "ewc_loss": 0.043166887015104294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.15834443224594e-05, + "grad_norm": 28.6893310546875, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.864787220954895, + "num_tokens": 508920491.0, + "step": 13342 + }, + { + "epoch": 1.6973667472331764, + "ewc_loss": 0.043143317103385925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1571659090113826e-05, + "grad_norm": 28.68324851989746, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8665340542793274, + "num_tokens": 508956161.0, + "step": 13343 + }, + { + "epoch": 1.697493957511767, + "ewc_loss": 0.04319782927632332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1598914827336557e-05, + "grad_norm": 28.498701095581055, + "learning_rate": 1e-06, + "loss": 0.4504, + 
"mean_token_accuracy": 0.8631781339645386, + "num_tokens": 508995696.0, + "step": 13344 + }, + { + "epoch": 1.6976211677903574, + "ewc_loss": 0.04323193430900574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.16159678529948e-05, + "grad_norm": 28.7072811126709, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.871817946434021, + "num_tokens": 509029505.0, + "step": 13345 + }, + { + "epoch": 1.697748378068948, + "ewc_loss": 0.04331466555595398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1657333491020836e-05, + "grad_norm": 28.531339645385742, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8740524649620056, + "num_tokens": 509062815.0, + "step": 13346 + }, + { + "epoch": 1.6978755883475385, + "ewc_loss": 0.04320093244314194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1600466425297782e-05, + "grad_norm": 28.730342864990234, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.873042106628418, + "num_tokens": 509103376.0, + "step": 13347 + }, + { + "epoch": 1.698002798626129, + "ewc_loss": 0.0433066189289093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1653309886460193e-05, + "grad_norm": 28.534324645996094, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8767709732055664, + "num_tokens": 509142797.0, + "step": 13348 + }, + { + "epoch": 1.6981300089047195, + "ewc_loss": 0.043227337300777435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1613668650388718e-05, + "grad_norm": 28.58954620361328, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8555136919021606, + "num_tokens": 509182724.0, + "step": 13349 + }, + { + "epoch": 1.69825721918331, + "ewc_loss": 0.04325196146965027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.162598138966132e-05, + "grad_norm": 28.518335342407227, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8758043050765991, + "num_tokens": 509217856.0, + "step": 13350 + }, + { + "epoch": 
1.6983844294619006, + "ewc_loss": 0.043296296149492264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.164814759453293e-05, + "grad_norm": 28.610347747802734, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8676959872245789, + "num_tokens": 509251581.0, + "step": 13351 + }, + { + "epoch": 1.6985116397404911, + "ewc_loss": 0.04328369349241257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1641846615239047e-05, + "grad_norm": 28.62162208557129, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8836724162101746, + "num_tokens": 509286687.0, + "step": 13352 + }, + { + "epoch": 1.6986388500190817, + "ewc_loss": 0.04333633929491043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.166816921089776e-05, + "grad_norm": 28.53536605834961, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.86133873462677, + "num_tokens": 509323372.0, + "step": 13353 + }, + { + "epoch": 1.6987660602976722, + "ewc_loss": 0.043290406465530396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.164520265068859e-05, + "grad_norm": 28.571807861328125, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.858948826789856, + "num_tokens": 509362472.0, + "step": 13354 + }, + { + "epoch": 1.6988932705762627, + "ewc_loss": 0.043406229466199875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.170311563531868e-05, + "grad_norm": 28.611299514770508, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8676257133483887, + "num_tokens": 509397676.0, + "step": 13355 + }, + { + "epoch": 1.699020480854853, + "ewc_loss": 0.04328456148505211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1642281353706494e-05, + "grad_norm": 28.464174270629883, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8703489303588867, + "num_tokens": 509441100.0, + "step": 13356 + }, + { + "epoch": 1.6991476911334435, + "ewc_loss": 0.04332690313458443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.166345075238496e-05, + "grad_norm": 28.611684799194336, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8638278245925903, + "num_tokens": 509485430.0, + "step": 13357 + }, + { + "epoch": 1.699274901412034, + "ewc_loss": 0.043383222073316574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1691610527341254e-05, + "grad_norm": 28.570941925048828, + "learning_rate": 1e-06, + "loss": 0.5312, + "mean_token_accuracy": 0.8364307880401611, + "num_tokens": 509520102.0, + "step": 13358 + }, + { + "epoch": 1.6994021116906246, + "ewc_loss": 0.043356433510780334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1678217308362946e-05, + "grad_norm": 28.674619674682617, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8690167665481567, + "num_tokens": 509556525.0, + "step": 13359 + }, + { + "epoch": 1.6995293219692151, + "ewc_loss": 0.04336485266685486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.168242644984275e-05, + "grad_norm": 28.571598052978516, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8605052828788757, + "num_tokens": 509596899.0, + "step": 13360 + }, + { + "epoch": 1.6996565322478057, + "ewc_loss": 0.04334016516804695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167008278775029e-05, + "grad_norm": 28.703765869140625, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8673729300498962, + "num_tokens": 509632125.0, + "step": 13361 + }, + { + "epoch": 1.699783742526396, + "ewc_loss": 0.04338619485497475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.169309664168395e-05, + "grad_norm": 28.519845962524414, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.862363338470459, + "num_tokens": 509668131.0, + "step": 13362 + }, + { + "epoch": 1.6999109528049865, + "ewc_loss": 0.043342527002096176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167126331187319e-05, + "grad_norm": 28.681106567382812, + "learning_rate": 1e-06, + "loss": 0.3875, + 
"mean_token_accuracy": 0.8796195387840271, + "num_tokens": 509704078.0, + "step": 13363 + }, + { + "epoch": 1.700038163083577, + "ewc_loss": 0.043448567390441895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1724283215007745e-05, + "grad_norm": 28.594938278198242, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8800451755523682, + "num_tokens": 509739464.0, + "step": 13364 + }, + { + "epoch": 1.7001653733621676, + "ewc_loss": 0.04329356923699379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1646785171469674e-05, + "grad_norm": 28.551883697509766, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8659140467643738, + "num_tokens": 509777970.0, + "step": 13365 + }, + { + "epoch": 1.700292583640758, + "ewc_loss": 0.043461985886096954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1730993466917425e-05, + "grad_norm": 28.608549118041992, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8540509939193726, + "num_tokens": 509817010.0, + "step": 13366 + }, + { + "epoch": 1.7004197939193486, + "ewc_loss": 0.04343286529183388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171643245674204e-05, + "grad_norm": 28.65752601623535, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.872728705406189, + "num_tokens": 509854192.0, + "step": 13367 + }, + { + "epoch": 1.7005470041979391, + "ewc_loss": 0.04344207048416138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1721034499933012e-05, + "grad_norm": 28.741010665893555, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8803802728652954, + "num_tokens": 509888633.0, + "step": 13368 + }, + { + "epoch": 1.7006742144765297, + "ewc_loss": 0.04342123121023178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17106153286295e-05, + "grad_norm": 28.608272552490234, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.867560863494873, + "num_tokens": 509928474.0, + "step": 13369 + }, + { + "epoch": 
1.7008014247551202, + "ewc_loss": 0.04331471025943756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.165735531889368e-05, + "grad_norm": 28.57314682006836, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8592464923858643, + "num_tokens": 509968875.0, + "step": 13370 + }, + { + "epoch": 1.7009286350337107, + "ewc_loss": 0.04341111332178116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.170555671909824e-05, + "grad_norm": 28.535770416259766, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8674744963645935, + "num_tokens": 510006021.0, + "step": 13371 + }, + { + "epoch": 1.7010558453123013, + "ewc_loss": 0.04336220771074295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1681104044546373e-05, + "grad_norm": 28.619020462036133, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8714895844459534, + "num_tokens": 510042111.0, + "step": 13372 + }, + { + "epoch": 1.7011830555908918, + "ewc_loss": 0.04345392808318138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.172696440538857e-05, + "grad_norm": 28.630373001098633, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.873686671257019, + "num_tokens": 510074894.0, + "step": 13373 + }, + { + "epoch": 1.7013102658694823, + "ewc_loss": 0.04334346204996109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.16717307921499e-05, + "grad_norm": 28.631702423095703, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8585197925567627, + "num_tokens": 510111516.0, + "step": 13374 + }, + { + "epoch": 1.7014374761480728, + "ewc_loss": 0.04338940605521202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1694702809327282e-05, + "grad_norm": 28.602766036987305, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8695173263549805, + "num_tokens": 510151549.0, + "step": 13375 + }, + { + "epoch": 1.7015646864266634, + "ewc_loss": 0.04333050176501274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.166525155189447e-05, + "grad_norm": 28.66830825805664, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8673235177993774, + "num_tokens": 510189215.0, + "step": 13376 + }, + { + "epoch": 1.701691896705254, + "ewc_loss": 0.043334171175956726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1667085093213245e-05, + "grad_norm": 28.537097930908203, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8871086835861206, + "num_tokens": 510231189.0, + "step": 13377 + }, + { + "epoch": 1.7018191069838444, + "ewc_loss": 0.04338588938117027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1692943846574053e-05, + "grad_norm": 28.70638084411621, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.871048092842102, + "num_tokens": 510273956.0, + "step": 13378 + }, + { + "epoch": 1.701946317262435, + "ewc_loss": 0.043341897428035736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1670948626706377e-05, + "grad_norm": 28.598529815673828, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8834810256958008, + "num_tokens": 510315095.0, + "step": 13379 + }, + { + "epoch": 1.7020735275410255, + "ewc_loss": 0.04333167150616646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.166583544749301e-05, + "grad_norm": 28.643705368041992, + "learning_rate": 1e-06, + "loss": 0.5033, + "mean_token_accuracy": 0.8466658592224121, + "num_tokens": 510350753.0, + "step": 13380 + }, + { + "epoch": 1.7022007378196158, + "ewc_loss": 0.043408527970314026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.170426341763232e-05, + "grad_norm": 28.59180450439453, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8728824257850647, + "num_tokens": 510389257.0, + "step": 13381 + }, + { + "epoch": 1.7023279480982063, + "ewc_loss": 0.043338652700185776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1669326088158414e-05, + "grad_norm": 28.53523063659668, + "learning_rate": 1e-06, + "loss": 0.4233, + 
"mean_token_accuracy": 0.868861973285675, + "num_tokens": 510428879.0, + "step": 13382 + }, + { + "epoch": 1.7024551583767968, + "ewc_loss": 0.04343244060873985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1716219634981826e-05, + "grad_norm": 28.7458438873291, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8705940246582031, + "num_tokens": 510468409.0, + "step": 13383 + }, + { + "epoch": 1.7025823686553874, + "ewc_loss": 0.04337679222226143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.168839637306519e-05, + "grad_norm": 28.586280822753906, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8830081820487976, + "num_tokens": 510503988.0, + "step": 13384 + }, + { + "epoch": 1.702709578933978, + "ewc_loss": 0.04328684136271477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1643420041073114e-05, + "grad_norm": 28.600027084350586, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8657513856887817, + "num_tokens": 510539009.0, + "step": 13385 + }, + { + "epoch": 1.7028367892125684, + "ewc_loss": 0.04340200126171112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1701000150642358e-05, + "grad_norm": 28.711015701293945, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8820171356201172, + "num_tokens": 510576177.0, + "step": 13386 + }, + { + "epoch": 1.7029639994911587, + "ewc_loss": 0.04334982484579086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1674912204616703e-05, + "grad_norm": 28.582237243652344, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8667399287223816, + "num_tokens": 510622194.0, + "step": 13387 + }, + { + "epoch": 1.7030912097697493, + "ewc_loss": 0.043250922113657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1625461158691905e-05, + "grad_norm": 28.615562438964844, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8844643831253052, + "num_tokens": 510663564.0, + "step": 13388 + }, + { + "epoch": 
1.7032184200483398, + "ewc_loss": 0.043299075216054916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.164953730243724e-05, + "grad_norm": 28.684555053710938, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8771669864654541, + "num_tokens": 510702153.0, + "step": 13389 + }, + { + "epoch": 1.7033456303269303, + "ewc_loss": 0.04335835203528404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1679175915778615e-05, + "grad_norm": 28.79129409790039, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8639789819717407, + "num_tokens": 510742812.0, + "step": 13390 + }, + { + "epoch": 1.7034728406055208, + "ewc_loss": 0.04332668334245682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1663341613020748e-05, + "grad_norm": 28.693344116210938, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8611992597579956, + "num_tokens": 510783363.0, + "step": 13391 + }, + { + "epoch": 1.7036000508841114, + "ewc_loss": 0.04324296861886978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1621484847855754e-05, + "grad_norm": 28.672609329223633, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.853514552116394, + "num_tokens": 510818362.0, + "step": 13392 + }, + { + "epoch": 1.703727261162702, + "ewc_loss": 0.0433109775185585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.165548903576564e-05, + "grad_norm": 28.68977165222168, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8799943923950195, + "num_tokens": 510854732.0, + "step": 13393 + }, + { + "epoch": 1.7038544714412924, + "ewc_loss": 0.04318918287754059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.159459108952433e-05, + "grad_norm": 28.610843658447266, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8609964847564697, + "num_tokens": 510895107.0, + "step": 13394 + }, + { + "epoch": 1.703981681719883, + "ewc_loss": 0.04320991784334183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.160495932912454e-05, + "grad_norm": 28.578142166137695, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8641358613967896, + "num_tokens": 510930737.0, + "step": 13395 + }, + { + "epoch": 1.7041088919984735, + "ewc_loss": 0.043353915214538574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167695674870629e-05, + "grad_norm": 28.744585037231445, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8751802444458008, + "num_tokens": 510964646.0, + "step": 13396 + }, + { + "epoch": 1.704236102277064, + "ewc_loss": 0.04330848902463913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1654244847013615e-05, + "grad_norm": 28.578340530395508, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8715047836303711, + "num_tokens": 511000583.0, + "step": 13397 + }, + { + "epoch": 1.7043633125556545, + "ewc_loss": 0.043276578187942505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.163828867196571e-05, + "grad_norm": 28.640817642211914, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8701270818710327, + "num_tokens": 511033729.0, + "step": 13398 + }, + { + "epoch": 1.704490522834245, + "ewc_loss": 0.04336761310696602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1683807062800042e-05, + "grad_norm": 28.62035369873047, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8725541234016418, + "num_tokens": 511074099.0, + "step": 13399 + }, + { + "epoch": 1.7046177331128356, + "ewc_loss": 0.04333648085594177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1668240151484497e-05, + "grad_norm": 28.760597229003906, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8550572395324707, + "num_tokens": 511112473.0, + "step": 13400 + }, + { + "epoch": 1.7047449433914261, + "ewc_loss": 0.04329048469662666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1645242668455467e-05, + "grad_norm": 28.56416893005371, + "learning_rate": 1e-06, + "loss": 0.4207, + 
"mean_token_accuracy": 0.8671413064002991, + "num_tokens": 511149839.0, + "step": 13401 + }, + { + "epoch": 1.7048721536700167, + "ewc_loss": 0.043340012431144714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167000639019534e-05, + "grad_norm": 28.583402633666992, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8570789098739624, + "num_tokens": 511183054.0, + "step": 13402 + }, + { + "epoch": 1.7049993639486072, + "ewc_loss": 0.043422941118478775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1711470253649168e-05, + "grad_norm": 28.78654670715332, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8709642887115479, + "num_tokens": 511218498.0, + "step": 13403 + }, + { + "epoch": 1.7051265742271977, + "ewc_loss": 0.043301209807395935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.165060504921712e-05, + "grad_norm": 28.56366539001465, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8630274534225464, + "num_tokens": 511259776.0, + "step": 13404 + }, + { + "epoch": 1.705253784505788, + "ewc_loss": 0.043339669704437256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1669835405191407e-05, + "grad_norm": 28.723981857299805, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8748924732208252, + "num_tokens": 511297331.0, + "step": 13405 + }, + { + "epoch": 1.7053809947843785, + "ewc_loss": 0.043334029614925385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1667014152626507e-05, + "grad_norm": 28.63414192199707, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8791941404342651, + "num_tokens": 511331055.0, + "step": 13406 + }, + { + "epoch": 1.705508205062969, + "ewc_loss": 0.04332408681511879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1662042854586616e-05, + "grad_norm": 28.601057052612305, + "learning_rate": 1e-06, + "loss": 0.3745, + "mean_token_accuracy": 0.8857132196426392, + "num_tokens": 511368651.0, + "step": 13407 + }, + { + "epoch": 
1.7056354153415596, + "ewc_loss": 0.04342826083302498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1714129616157152e-05, + "grad_norm": 28.730348587036133, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8729008436203003, + "num_tokens": 511402163.0, + "step": 13408 + }, + { + "epoch": 1.7057626256201501, + "ewc_loss": 0.04331573098897934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.165786463592667e-05, + "grad_norm": 28.515419006347656, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8606459498405457, + "num_tokens": 511443769.0, + "step": 13409 + }, + { + "epoch": 1.7058898358987407, + "ewc_loss": 0.04341012239456177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1705061953980476e-05, + "grad_norm": 28.735397338867188, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8642199039459229, + "num_tokens": 511481038.0, + "step": 13410 + }, + { + "epoch": 1.706017046177331, + "ewc_loss": 0.04334666579961777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1673333321814425e-05, + "grad_norm": 28.476240158081055, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8696751594543457, + "num_tokens": 511522940.0, + "step": 13411 + }, + { + "epoch": 1.7061442564559215, + "ewc_loss": 0.04334109276533127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1670546630048193e-05, + "grad_norm": 28.636104583740234, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8644826412200928, + "num_tokens": 511562630.0, + "step": 13412 + }, + { + "epoch": 1.706271466734512, + "ewc_loss": 0.043388526886701584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1694262613891624e-05, + "grad_norm": 28.48670768737793, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8529871106147766, + "num_tokens": 511598986.0, + "step": 13413 + }, + { + "epoch": 1.7063986770131025, + "ewc_loss": 0.0434340164065361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1717009076382965e-05, + "grad_norm": 28.699243545532227, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8701343536376953, + "num_tokens": 511633582.0, + "step": 13414 + }, + { + "epoch": 1.706525887291693, + "ewc_loss": 0.04348831623792648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1744157493230887e-05, + "grad_norm": 28.60525131225586, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8676327466964722, + "num_tokens": 511673296.0, + "step": 13415 + }, + { + "epoch": 1.7066530975702836, + "ewc_loss": 0.04341105371713638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1705527615267783e-05, + "grad_norm": 28.660276412963867, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8595271706581116, + "num_tokens": 511707890.0, + "step": 13416 + }, + { + "epoch": 1.7067803078488741, + "ewc_loss": 0.043411243706941605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1705622202716768e-05, + "grad_norm": 28.60658073425293, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8737998008728027, + "num_tokens": 511741597.0, + "step": 13417 + }, + { + "epoch": 1.7069075181274647, + "ewc_loss": 0.04339195042848587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1695976101909764e-05, + "grad_norm": 28.60875129699707, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8718563914299011, + "num_tokens": 511776816.0, + "step": 13418 + }, + { + "epoch": 1.7070347284060552, + "ewc_loss": 0.04348071292042732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1740355805377476e-05, + "grad_norm": 28.763355255126953, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8614213466644287, + "num_tokens": 511814866.0, + "step": 13419 + }, + { + "epoch": 1.7071619386846457, + "ewc_loss": 0.0433921180665493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1696059775422327e-05, + "grad_norm": 28.619014739990234, + "learning_rate": 1e-06, + "loss": 0.419, + 
"mean_token_accuracy": 0.8685038685798645, + "num_tokens": 511852458.0, + "step": 13420 + }, + { + "epoch": 1.7072891489632362, + "ewc_loss": 0.04334088787436485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1670444766641594e-05, + "grad_norm": 28.552623748779297, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8635567426681519, + "num_tokens": 511891201.0, + "step": 13421 + }, + { + "epoch": 1.7074163592418268, + "ewc_loss": 0.04344642907381058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1723215468227863e-05, + "grad_norm": 28.73332405090332, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8676234483718872, + "num_tokens": 511930862.0, + "step": 13422 + }, + { + "epoch": 1.7075435695204173, + "ewc_loss": 0.04341081902384758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1705409380956553e-05, + "grad_norm": 28.62452507019043, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8749910593032837, + "num_tokens": 511967695.0, + "step": 13423 + }, + { + "epoch": 1.7076707797990078, + "ewc_loss": 0.04336369410157204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.168184619222302e-05, + "grad_norm": 28.576427459716797, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.881080687046051, + "num_tokens": 512005976.0, + "step": 13424 + }, + { + "epoch": 1.7077979900775984, + "ewc_loss": 0.04344406723976135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.172203312511556e-05, + "grad_norm": 28.574968338012695, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8689581155776978, + "num_tokens": 512041575.0, + "step": 13425 + }, + { + "epoch": 1.7079252003561889, + "ewc_loss": 0.04349812492728233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174906330765225e-05, + "grad_norm": 28.600173950195312, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8635450601577759, + "num_tokens": 512076171.0, + "step": 13426 + }, + { + "epoch": 
1.7080524106347794, + "ewc_loss": 0.043526481837034225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1763240511063486e-05, + "grad_norm": 28.740657806396484, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8636927604675293, + "num_tokens": 512112082.0, + "step": 13427 + }, + { + "epoch": 1.70817962091337, + "ewc_loss": 0.04342372342944145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171186133637093e-05, + "grad_norm": 28.587467193603516, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8540747165679932, + "num_tokens": 512149126.0, + "step": 13428 + }, + { + "epoch": 1.7083068311919605, + "ewc_loss": 0.04340798035264015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1703990569221787e-05, + "grad_norm": 28.593215942382812, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8699260950088501, + "num_tokens": 512182368.0, + "step": 13429 + }, + { + "epoch": 1.7084340414705508, + "ewc_loss": 0.04349811375141144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1749056031694636e-05, + "grad_norm": 28.69775390625, + "learning_rate": 1e-06, + "loss": 0.4869, + "mean_token_accuracy": 0.8483941555023193, + "num_tokens": 512218224.0, + "step": 13430 + }, + { + "epoch": 1.7085612517491413, + "ewc_loss": 0.0434303916990757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171519554394763e-05, + "grad_norm": 28.579151153564453, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8674774169921875, + "num_tokens": 512255523.0, + "step": 13431 + }, + { + "epoch": 1.7086884620277318, + "ewc_loss": 0.04348684847354889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1743424440501258e-05, + "grad_norm": 28.70899200439453, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.876901388168335, + "num_tokens": 512290514.0, + "step": 13432 + }, + { + "epoch": 1.7088156723063224, + "ewc_loss": 0.04347892105579376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.173946086259093e-05, + "grad_norm": 28.691551208496094, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8768078088760376, + "num_tokens": 512328468.0, + "step": 13433 + }, + { + "epoch": 1.708942882584913, + "ewc_loss": 0.04337998479604721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1689991626772098e-05, + "grad_norm": 28.554763793945312, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8671716451644897, + "num_tokens": 512359786.0, + "step": 13434 + }, + { + "epoch": 1.7090700928635034, + "ewc_loss": 0.043410006910562515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1705003746319562e-05, + "grad_norm": 28.584922790527344, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8463003635406494, + "num_tokens": 512395679.0, + "step": 13435 + }, + { + "epoch": 1.7091973031420937, + "ewc_loss": 0.04341978207230568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.170989137084689e-05, + "grad_norm": 28.643430709838867, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8656274676322937, + "num_tokens": 512433491.0, + "step": 13436 + }, + { + "epoch": 1.7093245134206843, + "ewc_loss": 0.04343476518988609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1717381969210692e-05, + "grad_norm": 28.62071418762207, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8616570234298706, + "num_tokens": 512470215.0, + "step": 13437 + }, + { + "epoch": 1.7094517236992748, + "ewc_loss": 0.04341007396578789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1705036488128826e-05, + "grad_norm": 28.603891372680664, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8631004095077515, + "num_tokens": 512507055.0, + "step": 13438 + }, + { + "epoch": 1.7095789339778653, + "ewc_loss": 0.04343203455209732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1716017727158032e-05, + "grad_norm": 28.710878372192383, + "learning_rate": 1e-06, + "loss": 0.4734, + 
"mean_token_accuracy": 0.8496541976928711, + "num_tokens": 512547877.0, + "step": 13439 + }, + { + "epoch": 1.7097061442564558, + "ewc_loss": 0.043489158153533936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174457949877251e-05, + "grad_norm": 28.622596740722656, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8534358739852905, + "num_tokens": 512586558.0, + "step": 13440 + }, + { + "epoch": 1.7098333545350464, + "ewc_loss": 0.04353844374418259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1769221348222345e-05, + "grad_norm": 28.757280349731445, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8651731014251709, + "num_tokens": 512631181.0, + "step": 13441 + }, + { + "epoch": 1.709960564813637, + "ewc_loss": 0.0434170626103878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1708530766773038e-05, + "grad_norm": 28.604530334472656, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8771172761917114, + "num_tokens": 512667925.0, + "step": 13442 + }, + { + "epoch": 1.7100877750922274, + "ewc_loss": 0.043468546122312546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173427310481202e-05, + "grad_norm": 28.72865867614746, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8497722148895264, + "num_tokens": 512703136.0, + "step": 13443 + }, + { + "epoch": 1.710214985370818, + "ewc_loss": 0.043452031910419464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1726016711909324e-05, + "grad_norm": 28.651023864746094, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8698426485061646, + "num_tokens": 512739001.0, + "step": 13444 + }, + { + "epoch": 1.7103421956494085, + "ewc_loss": 0.04350404068827629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1752020984422415e-05, + "grad_norm": 28.715606689453125, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8714560270309448, + "num_tokens": 512780517.0, + "step": 13445 + }, + { + "epoch": 
1.710469405927999, + "ewc_loss": 0.04343518987298012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1717594790970907e-05, + "grad_norm": 28.611032485961914, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8733228445053101, + "num_tokens": 512815964.0, + "step": 13446 + }, + { + "epoch": 1.7105966162065895, + "ewc_loss": 0.04351300001144409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1756499336333945e-05, + "grad_norm": 28.799091339111328, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8694735765457153, + "num_tokens": 512855451.0, + "step": 13447 + }, + { + "epoch": 1.71072382648518, + "ewc_loss": 0.043462321162223816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173116081394255e-05, + "grad_norm": 28.641138076782227, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8710135221481323, + "num_tokens": 512895809.0, + "step": 13448 + }, + { + "epoch": 1.7108510367637706, + "ewc_loss": 0.04342370107769966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171185042243451e-05, + "grad_norm": 28.76129150390625, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8719481229782104, + "num_tokens": 512936424.0, + "step": 13449 + }, + { + "epoch": 1.7109782470423611, + "ewc_loss": 0.043424274772405624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171213782276027e-05, + "grad_norm": 28.652727127075195, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8823555707931519, + "num_tokens": 512976486.0, + "step": 13450 + }, + { + "epoch": 1.7111054573209517, + "ewc_loss": 0.043389804661273956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1694902898161672e-05, + "grad_norm": 28.66368293762207, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8587768077850342, + "num_tokens": 513014131.0, + "step": 13451 + }, + { + "epoch": 1.7112326675995422, + "ewc_loss": 0.04348091036081314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.174045584979467e-05, + "grad_norm": 28.56630516052246, + "learning_rate": 1e-06, + "loss": 0.5149, + "mean_token_accuracy": 0.8398101329803467, + "num_tokens": 513054706.0, + "step": 13452 + }, + { + "epoch": 1.7113598778781327, + "ewc_loss": 0.0434136837720871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1706842744606547e-05, + "grad_norm": 28.68151092529297, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8724371194839478, + "num_tokens": 513094089.0, + "step": 13453 + }, + { + "epoch": 1.711487088156723, + "ewc_loss": 0.04356765374541283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1783826014143415e-05, + "grad_norm": 28.591571807861328, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8660548329353333, + "num_tokens": 513136306.0, + "step": 13454 + }, + { + "epoch": 1.7116142984353135, + "ewc_loss": 0.04337264224886894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1686320906155743e-05, + "grad_norm": 28.69411849975586, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8640762567520142, + "num_tokens": 513174376.0, + "step": 13455 + }, + { + "epoch": 1.711741508713904, + "ewc_loss": 0.04353143274784088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1765716155641712e-05, + "grad_norm": 28.701068878173828, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8705002069473267, + "num_tokens": 513211834.0, + "step": 13456 + }, + { + "epoch": 1.7118687189924946, + "ewc_loss": 0.043417613953351974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1708807253162377e-05, + "grad_norm": 28.595699310302734, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8585165739059448, + "num_tokens": 513241765.0, + "step": 13457 + }, + { + "epoch": 1.7119959292710851, + "ewc_loss": 0.04346301779150963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173150824091863e-05, + "grad_norm": 28.67061996459961, + "learning_rate": 1e-06, + "loss": 0.47, + 
"mean_token_accuracy": 0.8543960452079773, + "num_tokens": 513283510.0, + "step": 13458 + }, + { + "epoch": 1.7121231395496757, + "ewc_loss": 0.0434991717338562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1749585357611068e-05, + "grad_norm": 28.527509689331055, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8638386130332947, + "num_tokens": 513320746.0, + "step": 13459 + }, + { + "epoch": 1.712250349828266, + "ewc_loss": 0.043431270867586136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1715635739383288e-05, + "grad_norm": 28.630584716796875, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8639750480651855, + "num_tokens": 513358036.0, + "step": 13460 + }, + { + "epoch": 1.7123775601068565, + "ewc_loss": 0.04355756565928459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1778781956527382e-05, + "grad_norm": 28.663005828857422, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.872185230255127, + "num_tokens": 513395156.0, + "step": 13461 + }, + { + "epoch": 1.712504770385447, + "ewc_loss": 0.04350591450929642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.175295776396524e-05, + "grad_norm": 28.622121810913086, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8615171313285828, + "num_tokens": 513438631.0, + "step": 13462 + }, + { + "epoch": 1.7126319806640375, + "ewc_loss": 0.04352182894945145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176091402361635e-05, + "grad_norm": 28.77642059326172, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8577629327774048, + "num_tokens": 513471225.0, + "step": 13463 + }, + { + "epoch": 1.712759190942628, + "ewc_loss": 0.0435667559504509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1783378542750143e-05, + "grad_norm": 28.691659927368164, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8584476113319397, + "num_tokens": 513513824.0, + "step": 13464 + }, + { + "epoch": 
1.7128864012212186, + "ewc_loss": 0.043497297912836075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174864857806824e-05, + "grad_norm": 28.718122482299805, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8595848679542542, + "num_tokens": 513556263.0, + "step": 13465 + }, + { + "epoch": 1.7130136114998091, + "ewc_loss": 0.04347885772585869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1739428120781668e-05, + "grad_norm": 28.61326789855957, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8709089159965515, + "num_tokens": 513595093.0, + "step": 13466 + }, + { + "epoch": 1.7131408217783997, + "ewc_loss": 0.043554916977882385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17774577322416e-05, + "grad_norm": 28.61818504333496, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8597726821899414, + "num_tokens": 513633761.0, + "step": 13467 + }, + { + "epoch": 1.7132680320569902, + "ewc_loss": 0.0434596985578537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1729849322582595e-05, + "grad_norm": 28.581531524658203, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8641347289085388, + "num_tokens": 513671404.0, + "step": 13468 + }, + { + "epoch": 1.7133952423355807, + "ewc_loss": 0.04354153946042061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1770769308204763e-05, + "grad_norm": 28.68548583984375, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8647346496582031, + "num_tokens": 513713752.0, + "step": 13469 + }, + { + "epoch": 1.7135224526141712, + "ewc_loss": 0.04359372332692146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1796860892209224e-05, + "grad_norm": 28.726696014404297, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8627568483352661, + "num_tokens": 513755261.0, + "step": 13470 + }, + { + "epoch": 1.7136496628927618, + "ewc_loss": 0.04356967657804489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1784837372251786e-05, + "grad_norm": 28.739044189453125, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8716768026351929, + "num_tokens": 513799042.0, + "step": 13471 + }, + { + "epoch": 1.7137768731713523, + "ewc_loss": 0.04349526762962341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1747633581981063e-05, + "grad_norm": 28.82682991027832, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8568158149719238, + "num_tokens": 513838148.0, + "step": 13472 + }, + { + "epoch": 1.7139040834499428, + "ewc_loss": 0.04344431683421135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1722158635384403e-05, + "grad_norm": 28.614904403686523, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8723213076591492, + "num_tokens": 513878960.0, + "step": 13473 + }, + { + "epoch": 1.7140312937285334, + "ewc_loss": 0.04345269873738289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1726349586970173e-05, + "grad_norm": 28.867040634155273, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8684937357902527, + "num_tokens": 513914083.0, + "step": 13474 + }, + { + "epoch": 1.7141585040071239, + "ewc_loss": 0.04345698282122612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1728490537498146e-05, + "grad_norm": 28.614295959472656, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8553966283798218, + "num_tokens": 513951591.0, + "step": 13475 + }, + { + "epoch": 1.7142857142857144, + "ewc_loss": 0.043353188782930374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1676594769814983e-05, + "grad_norm": 28.741548538208008, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.8525377511978149, + "num_tokens": 513986127.0, + "step": 13476 + }, + { + "epoch": 1.714412924564305, + "ewc_loss": 0.04347624629735947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1738123905379325e-05, + "grad_norm": 28.575653076171875, + "learning_rate": 1e-06, + "loss": 0.455, + 
"mean_token_accuracy": 0.861367404460907, + "num_tokens": 514028248.0, + "step": 13477 + }, + { + "epoch": 1.7145401348428955, + "ewc_loss": 0.04338677600026131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.169338767998852e-05, + "grad_norm": 28.740110397338867, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8698595762252808, + "num_tokens": 514062757.0, + "step": 13478 + }, + { + "epoch": 1.7146673451214858, + "ewc_loss": 0.04346311837434769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1731559172621928e-05, + "grad_norm": 28.61880111694336, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8542630672454834, + "num_tokens": 514102635.0, + "step": 13479 + }, + { + "epoch": 1.7147945554000763, + "ewc_loss": 0.043454207479953766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1727104467572644e-05, + "grad_norm": 28.68458366394043, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8657140731811523, + "num_tokens": 514140883.0, + "step": 13480 + }, + { + "epoch": 1.7149217656786668, + "ewc_loss": 0.04355504363775253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1777521396870725e-05, + "grad_norm": 28.76055908203125, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8746045827865601, + "num_tokens": 514176536.0, + "step": 13481 + }, + { + "epoch": 1.7150489759572574, + "ewc_loss": 0.043510206043720245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1755102352472022e-05, + "grad_norm": 28.73827362060547, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8600800633430481, + "num_tokens": 514209969.0, + "step": 13482 + }, + { + "epoch": 1.7151761862358479, + "ewc_loss": 0.043392714112997055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1696356270695105e-05, + "grad_norm": 28.562952041625977, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8653429746627808, + "num_tokens": 514249679.0, + "step": 13483 + }, + { + "epoch": 
1.7153033965144384, + "ewc_loss": 0.043467193841934204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17335964407539e-05, + "grad_norm": 28.773656845092773, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8611800670623779, + "num_tokens": 514292719.0, + "step": 13484 + }, + { + "epoch": 1.7154306067930287, + "ewc_loss": 0.043567247688770294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.178362410631962e-05, + "grad_norm": 28.73561668395996, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8594768047332764, + "num_tokens": 514323396.0, + "step": 13485 + }, + { + "epoch": 1.7155578170716193, + "ewc_loss": 0.04339181259274483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.169590698031243e-05, + "grad_norm": 28.618085861206055, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8716804385185242, + "num_tokens": 514360951.0, + "step": 13486 + }, + { + "epoch": 1.7156850273502098, + "ewc_loss": 0.043537236750125885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176861926272977e-05, + "grad_norm": 28.90796661376953, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8502999544143677, + "num_tokens": 514403879.0, + "step": 13487 + }, + { + "epoch": 1.7158122376288003, + "ewc_loss": 0.04353175684809685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1765878045698628e-05, + "grad_norm": 28.571475982666016, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8731738924980164, + "num_tokens": 514446628.0, + "step": 13488 + }, + { + "epoch": 1.7159394479073908, + "ewc_loss": 0.04335916414856911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1679581550415605e-05, + "grad_norm": 28.75664710998535, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8676935434341431, + "num_tokens": 514486132.0, + "step": 13489 + }, + { + "epoch": 1.7160666581859814, + "ewc_loss": 0.04361468553543091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1807341909152456e-05, + "grad_norm": 28.73369598388672, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8552758693695068, + "num_tokens": 514526384.0, + "step": 13490 + }, + { + "epoch": 1.716193868464572, + "ewc_loss": 0.04336469620466232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.16823482332984e-05, + "grad_norm": 28.715578079223633, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8643648624420166, + "num_tokens": 514565828.0, + "step": 13491 + }, + { + "epoch": 1.7163210787431624, + "ewc_loss": 0.04348883777856827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1744419427704997e-05, + "grad_norm": 28.634374618530273, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8618122935295105, + "num_tokens": 514605260.0, + "step": 13492 + }, + { + "epoch": 1.716448289021753, + "ewc_loss": 0.04345053434371948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1725267288275063e-05, + "grad_norm": 28.78483772277832, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8679893016815186, + "num_tokens": 514643433.0, + "step": 13493 + }, + { + "epoch": 1.7165754993003435, + "ewc_loss": 0.043459702283144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1729851141572e-05, + "grad_norm": 28.649707794189453, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8681727647781372, + "num_tokens": 514683264.0, + "step": 13494 + }, + { + "epoch": 1.716702709578934, + "ewc_loss": 0.043469544500112534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173477150790859e-05, + "grad_norm": 28.811086654663086, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8736873865127563, + "num_tokens": 514719775.0, + "step": 13495 + }, + { + "epoch": 1.7168299198575245, + "ewc_loss": 0.043420687317848206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1710344299208373e-05, + "grad_norm": 28.63892936706543, + "learning_rate": 1e-06, + "loss": 0.4589, + 
"mean_token_accuracy": 0.8595327138900757, + "num_tokens": 514766287.0, + "step": 13496 + }, + { + "epoch": 1.716957130136115, + "ewc_loss": 0.043395910412073135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1697955162380822e-05, + "grad_norm": 28.765743255615234, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8755927681922913, + "num_tokens": 514806725.0, + "step": 13497 + }, + { + "epoch": 1.7170843404147056, + "ewc_loss": 0.04348651319742203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174325709347613e-05, + "grad_norm": 28.698976516723633, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8699603080749512, + "num_tokens": 514844008.0, + "step": 13498 + }, + { + "epoch": 1.7172115506932961, + "ewc_loss": 0.04335234314203262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1676170945283957e-05, + "grad_norm": 28.7121639251709, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8543296456336975, + "num_tokens": 514880356.0, + "step": 13499 + }, + { + "epoch": 1.7173387609718866, + "ewc_loss": 0.04338236153125763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1691181245842017e-05, + "grad_norm": 28.70850372314453, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8775256276130676, + "num_tokens": 514915309.0, + "step": 13500 + }, + { + "epoch": 1.7174659712504772, + "ewc_loss": 0.04335913434624672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1679566998500377e-05, + "grad_norm": 28.708120346069336, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8593922257423401, + "num_tokens": 514956496.0, + "step": 13501 + }, + { + "epoch": 1.7175931815290677, + "ewc_loss": 0.04337746277451515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.168873106711544e-05, + "grad_norm": 28.708759307861328, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8679336309432983, + "num_tokens": 514992247.0, + "step": 13502 + }, + { + "epoch": 
1.717720391807658, + "ewc_loss": 0.0433901771903038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1695088435080834e-05, + "grad_norm": 28.796249389648438, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8587117195129395, + "num_tokens": 515035443.0, + "step": 13503 + }, + { + "epoch": 1.7178476020862485, + "ewc_loss": 0.04331647604703903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1658237528754398e-05, + "grad_norm": 28.731155395507812, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8704311847686768, + "num_tokens": 515072279.0, + "step": 13504 + }, + { + "epoch": 1.717974812364839, + "ewc_loss": 0.043358173221349716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167908678529784e-05, + "grad_norm": 28.657804489135742, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8659642338752747, + "num_tokens": 515112547.0, + "step": 13505 + }, + { + "epoch": 1.7181020226434296, + "ewc_loss": 0.04335169866681099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167584898415953e-05, + "grad_norm": 28.703237533569336, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8535977602005005, + "num_tokens": 515146814.0, + "step": 13506 + }, + { + "epoch": 1.7182292329220201, + "ewc_loss": 0.04333370551466942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.166685226256959e-05, + "grad_norm": 28.647184371948242, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.867779016494751, + "num_tokens": 515184393.0, + "step": 13507 + }, + { + "epoch": 1.7183564432006107, + "ewc_loss": 0.04334653168916702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1673266019206494e-05, + "grad_norm": 28.767690658569336, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8593426942825317, + "num_tokens": 515225312.0, + "step": 13508 + }, + { + "epoch": 1.718483653479201, + "ewc_loss": 0.04337357357144356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.168678656744305e-05, + "grad_norm": 28.786415100097656, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.8439749479293823, + "num_tokens": 515261330.0, + "step": 13509 + }, + { + "epoch": 1.7186108637577915, + "ewc_loss": 0.04335378110408783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.167689126508776e-05, + "grad_norm": 28.671873092651367, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8717403411865234, + "num_tokens": 515299325.0, + "step": 13510 + }, + { + "epoch": 1.718738074036382, + "ewc_loss": 0.043333426117897034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.166671220038552e-05, + "grad_norm": 28.680810928344727, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8671408891677856, + "num_tokens": 515335213.0, + "step": 13511 + }, + { + "epoch": 1.7188652843149725, + "ewc_loss": 0.04333715885877609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.166858030250296e-05, + "grad_norm": 28.768388748168945, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8784052729606628, + "num_tokens": 515365699.0, + "step": 13512 + }, + { + "epoch": 1.718992494593563, + "ewc_loss": 0.04333558306097984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.166779086110182e-05, + "grad_norm": 28.666446685791016, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.87514728307724, + "num_tokens": 515396025.0, + "step": 13513 + }, + { + "epoch": 1.7191197048721536, + "ewc_loss": 0.04337168484926224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.168584251194261e-05, + "grad_norm": 28.597919464111328, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8664695024490356, + "num_tokens": 515429724.0, + "step": 13514 + }, + { + "epoch": 1.7192469151507441, + "ewc_loss": 0.04345712065696716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.172855965909548e-05, + "grad_norm": 28.637605667114258, + "learning_rate": 1e-06, + "loss": 0.4485, + 
"mean_token_accuracy": 0.8620423674583435, + "num_tokens": 515465894.0, + "step": 13515 + }, + { + "epoch": 1.7193741254293347, + "ewc_loss": 0.043452680110931396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1726340492023155e-05, + "grad_norm": 28.638826370239258, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8852923512458801, + "num_tokens": 515498894.0, + "step": 13516 + }, + { + "epoch": 1.7195013357079252, + "ewc_loss": 0.0434238575398922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171192863897886e-05, + "grad_norm": 28.582054138183594, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8657292723655701, + "num_tokens": 515538783.0, + "step": 13517 + }, + { + "epoch": 1.7196285459865157, + "ewc_loss": 0.043474555015563965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1737278075306676e-05, + "grad_norm": 28.716548919677734, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8648035526275635, + "num_tokens": 515573107.0, + "step": 13518 + }, + { + "epoch": 1.7197557562651062, + "ewc_loss": 0.04352231323719025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1761155949207023e-05, + "grad_norm": 28.633377075195312, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8683066368103027, + "num_tokens": 515614901.0, + "step": 13519 + }, + { + "epoch": 1.7198829665436968, + "ewc_loss": 0.0435238741338253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1761936295661144e-05, + "grad_norm": 28.780933380126953, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8699562549591064, + "num_tokens": 515654197.0, + "step": 13520 + }, + { + "epoch": 1.7200101768222873, + "ewc_loss": 0.043664176017045975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1832087440998293e-05, + "grad_norm": 28.622318267822266, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8846248388290405, + "num_tokens": 515697003.0, + "step": 13521 + }, + { + "epoch": 
1.7201373871008778, + "ewc_loss": 0.043507158756256104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1753579858341254e-05, + "grad_norm": 28.678457260131836, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8621078133583069, + "num_tokens": 515737801.0, + "step": 13522 + }, + { + "epoch": 1.7202645973794684, + "ewc_loss": 0.043576594442129135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.178829709009733e-05, + "grad_norm": 28.598623275756836, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8585994839668274, + "num_tokens": 515774711.0, + "step": 13523 + }, + { + "epoch": 1.7203918076580589, + "ewc_loss": 0.0435645654797554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1782283511129208e-05, + "grad_norm": 28.801685333251953, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8581079244613647, + "num_tokens": 515811728.0, + "step": 13524 + }, + { + "epoch": 1.7205190179366494, + "ewc_loss": 0.04364727810025215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1823638235218823e-05, + "grad_norm": 28.585966110229492, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.872323751449585, + "num_tokens": 515844425.0, + "step": 13525 + }, + { + "epoch": 1.72064622821524, + "ewc_loss": 0.0434982068836689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1749103325419128e-05, + "grad_norm": 28.690404891967773, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8687921762466431, + "num_tokens": 515882439.0, + "step": 13526 + }, + { + "epoch": 1.7207734384938305, + "ewc_loss": 0.043635644018650055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1817821107106283e-05, + "grad_norm": 28.663747787475586, + "learning_rate": 1e-06, + "loss": 0.5096, + "mean_token_accuracy": 0.8440176844596863, + "num_tokens": 515922168.0, + "step": 13527 + }, + { + "epoch": 1.7209006487724208, + "ewc_loss": 0.043512582778930664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.175629197154194e-05, + "grad_norm": 28.696372985839844, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.873375415802002, + "num_tokens": 515961840.0, + "step": 13528 + }, + { + "epoch": 1.7210278590510113, + "ewc_loss": 0.04367737099528313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1838684915564954e-05, + "grad_norm": 28.681346893310547, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8660292029380798, + "num_tokens": 515997586.0, + "step": 13529 + }, + { + "epoch": 1.7211550693296018, + "ewc_loss": 0.043534670025110245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1767335056210868e-05, + "grad_norm": 28.753856658935547, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8720448017120361, + "num_tokens": 516036660.0, + "step": 13530 + }, + { + "epoch": 1.7212822796081924, + "ewc_loss": 0.043603263795375824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1801632101414725e-05, + "grad_norm": 28.68705177307129, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8588724136352539, + "num_tokens": 516070846.0, + "step": 13531 + }, + { + "epoch": 1.7214094898867829, + "ewc_loss": 0.04351979121565819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1759895389550366e-05, + "grad_norm": 28.707548141479492, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8875415325164795, + "num_tokens": 516107027.0, + "step": 13532 + }, + { + "epoch": 1.7215367001653734, + "ewc_loss": 0.043533965945243835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176698217226658e-05, + "grad_norm": 28.680185317993164, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8676694631576538, + "num_tokens": 516143971.0, + "step": 13533 + }, + { + "epoch": 1.7216639104439637, + "ewc_loss": 0.04355144128203392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1775720597361214e-05, + "grad_norm": 28.690792083740234, + "learning_rate": 1e-06, + "loss": 0.4281, + 
"mean_token_accuracy": 0.8684325218200684, + "num_tokens": 516180617.0, + "step": 13534 + }, + { + "epoch": 1.7217911207225542, + "ewc_loss": 0.043538883328437805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1769441445940174e-05, + "grad_norm": 28.701120376586914, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8675646781921387, + "num_tokens": 516220371.0, + "step": 13535 + }, + { + "epoch": 1.7219183310011448, + "ewc_loss": 0.043605927377939224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.180296360165812e-05, + "grad_norm": 28.646717071533203, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8691627979278564, + "num_tokens": 516258642.0, + "step": 13536 + }, + { + "epoch": 1.7220455412797353, + "ewc_loss": 0.0434945672750473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1747284336015582e-05, + "grad_norm": 28.642253875732422, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8601064682006836, + "num_tokens": 516298971.0, + "step": 13537 + }, + { + "epoch": 1.7221727515583258, + "ewc_loss": 0.043599359691143036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.179968032578472e-05, + "grad_norm": 28.712684631347656, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8786448836326599, + "num_tokens": 516334457.0, + "step": 13538 + }, + { + "epoch": 1.7222999618369164, + "ewc_loss": 0.04355140030384064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1775700588477775e-05, + "grad_norm": 28.62765121459961, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8924219608306885, + "num_tokens": 516374585.0, + "step": 13539 + }, + { + "epoch": 1.7224271721155069, + "ewc_loss": 0.04362066462635994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1810332327731885e-05, + "grad_norm": 28.714645385742188, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8528659343719482, + "num_tokens": 516414180.0, + "step": 13540 + }, + { + "epoch": 
1.7225543823940974, + "ewc_loss": 0.04361293092370033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1806465156259947e-05, + "grad_norm": 28.660837173461914, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8753538131713867, + "num_tokens": 516450189.0, + "step": 13541 + }, + { + "epoch": 1.722681592672688, + "ewc_loss": 0.04355359822511673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1776799258077517e-05, + "grad_norm": 28.674108505249023, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8649044632911682, + "num_tokens": 516487549.0, + "step": 13542 + }, + { + "epoch": 1.7228088029512785, + "ewc_loss": 0.043631844222545624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.181592208216898e-05, + "grad_norm": 28.696027755737305, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8539257049560547, + "num_tokens": 516526649.0, + "step": 13543 + }, + { + "epoch": 1.722936013229869, + "ewc_loss": 0.043544966727495193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1772482796222903e-05, + "grad_norm": 28.67344856262207, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.872252345085144, + "num_tokens": 516563829.0, + "step": 13544 + }, + { + "epoch": 1.7230632235084595, + "ewc_loss": 0.04358217492699623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.179108741984237e-05, + "grad_norm": 28.667367935180664, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8504478931427002, + "num_tokens": 516605179.0, + "step": 13545 + }, + { + "epoch": 1.72319043378705, + "ewc_loss": 0.04354647547006607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1773237676825374e-05, + "grad_norm": 28.667747497558594, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.864891767501831, + "num_tokens": 516641576.0, + "step": 13546 + }, + { + "epoch": 1.7233176440656406, + "ewc_loss": 0.04355500638484955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.177750320697669e-05, + "grad_norm": 28.702529907226562, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.864267110824585, + "num_tokens": 516678687.0, + "step": 13547 + }, + { + "epoch": 1.7234448543442311, + "ewc_loss": 0.04362146556377411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1810732505400665e-05, + "grad_norm": 28.75908088684082, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8623126149177551, + "num_tokens": 516711484.0, + "step": 13548 + }, + { + "epoch": 1.7235720646228216, + "ewc_loss": 0.04351798817515373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.175899498979561e-05, + "grad_norm": 28.671857833862305, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8623641729354858, + "num_tokens": 516750598.0, + "step": 13549 + }, + { + "epoch": 1.7236992749014122, + "ewc_loss": 0.04352056235074997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1760281015303917e-05, + "grad_norm": 28.79281997680664, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8734938502311707, + "num_tokens": 516793438.0, + "step": 13550 + }, + { + "epoch": 1.7238264851800027, + "ewc_loss": 0.04355642944574356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1778214431833476e-05, + "grad_norm": 28.647090911865234, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8812155723571777, + "num_tokens": 516828692.0, + "step": 13551 + }, + { + "epoch": 1.723953695458593, + "ewc_loss": 0.043537385761737823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1768692022305913e-05, + "grad_norm": 28.778240203857422, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8677978515625, + "num_tokens": 516867592.0, + "step": 13552 + }, + { + "epoch": 1.7240809057371835, + "ewc_loss": 0.043572600930929184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1786299839732237e-05, + "grad_norm": 28.622207641601562, + "learning_rate": 1e-06, + "loss": 0.4659, + 
"mean_token_accuracy": 0.8544521927833557, + "num_tokens": 516907668.0, + "step": 13553 + }, + { + "epoch": 1.724208116015774, + "ewc_loss": 0.043571844696998596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17859214899363e-05, + "grad_norm": 28.75069808959961, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8657408356666565, + "num_tokens": 516944628.0, + "step": 13554 + }, + { + "epoch": 1.7243353262943646, + "ewc_loss": 0.04354391247034073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1771957108285278e-05, + "grad_norm": 28.664396286010742, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8611884117126465, + "num_tokens": 516989027.0, + "step": 13555 + }, + { + "epoch": 1.7244625365729551, + "ewc_loss": 0.04355435073375702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.177717578888405e-05, + "grad_norm": 28.677017211914062, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8613352179527283, + "num_tokens": 517024464.0, + "step": 13556 + }, + { + "epoch": 1.7245897468515456, + "ewc_loss": 0.04356648027896881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1783240299555473e-05, + "grad_norm": 28.676156997680664, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8699225187301636, + "num_tokens": 517067714.0, + "step": 13557 + }, + { + "epoch": 1.724716957130136, + "ewc_loss": 0.043682534247636795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.184126788051799e-05, + "grad_norm": 28.726333618164062, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8703123927116394, + "num_tokens": 517107564.0, + "step": 13558 + }, + { + "epoch": 1.7248441674087265, + "ewc_loss": 0.04351062327623367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.175531153625343e-05, + "grad_norm": 28.602937698364258, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8719924688339233, + "num_tokens": 517148240.0, + "step": 13559 + }, + { + "epoch": 
1.724971377687317, + "ewc_loss": 0.043520279228687286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176013913413044e-05, + "grad_norm": 28.674888610839844, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8594841361045837, + "num_tokens": 517187841.0, + "step": 13560 + }, + { + "epoch": 1.7250985879659075, + "ewc_loss": 0.04355447739362717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1777239453513175e-05, + "grad_norm": 28.777456283569336, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8509101867675781, + "num_tokens": 517225732.0, + "step": 13561 + }, + { + "epoch": 1.725225798244498, + "ewc_loss": 0.043609775602817535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1804888092447072e-05, + "grad_norm": 28.792865753173828, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8500231504440308, + "num_tokens": 517260423.0, + "step": 13562 + }, + { + "epoch": 1.7253530085230886, + "ewc_loss": 0.043535370379686356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176768430217635e-05, + "grad_norm": 28.657941818237305, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8539940714836121, + "num_tokens": 517300862.0, + "step": 13563 + }, + { + "epoch": 1.7254802188016791, + "ewc_loss": 0.04345942288637161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1729711079387926e-05, + "grad_norm": 28.662446975708008, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8709709644317627, + "num_tokens": 517341010.0, + "step": 13564 + }, + { + "epoch": 1.7256074290802697, + "ewc_loss": 0.04357355460524559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1786776414955966e-05, + "grad_norm": 28.744604110717773, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8505401015281677, + "num_tokens": 517376853.0, + "step": 13565 + }, + { + "epoch": 1.7257346393588602, + "ewc_loss": 0.043468352407217026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.173417669837363e-05, + "grad_norm": 28.689428329467773, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8604334592819214, + "num_tokens": 517421854.0, + "step": 13566 + }, + { + "epoch": 1.7258618496374507, + "ewc_loss": 0.04353061690926552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1765308702015318e-05, + "grad_norm": 28.665302276611328, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8615168333053589, + "num_tokens": 517461711.0, + "step": 13567 + }, + { + "epoch": 1.7259890599160412, + "ewc_loss": 0.04347899928689003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1739499061368406e-05, + "grad_norm": 28.6420955657959, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.873079776763916, + "num_tokens": 517494755.0, + "step": 13568 + }, + { + "epoch": 1.7261162701946318, + "ewc_loss": 0.04355937987565994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.177968963223975e-05, + "grad_norm": 28.850858688354492, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8567017316818237, + "num_tokens": 517539061.0, + "step": 13569 + }, + { + "epoch": 1.7262434804732223, + "ewc_loss": 0.04353960230946541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1769801605842076e-05, + "grad_norm": 28.687522888183594, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8620383739471436, + "num_tokens": 517579768.0, + "step": 13570 + }, + { + "epoch": 1.7263706907518128, + "ewc_loss": 0.04353856295347214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1769281374872662e-05, + "grad_norm": 28.78476333618164, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8781300187110901, + "num_tokens": 517612006.0, + "step": 13571 + }, + { + "epoch": 1.7264979010304033, + "ewc_loss": 0.043503060936927795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1751529857283458e-05, + "grad_norm": 28.748371124267578, + "learning_rate": 1e-06, + "loss": 0.4274, + 
"mean_token_accuracy": 0.8657708168029785, + "num_tokens": 517647547.0, + "step": 13572 + }, + { + "epoch": 1.7266251113089939, + "ewc_loss": 0.04354778304696083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1773890694021247e-05, + "grad_norm": 28.929903030395508, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.85649573802948, + "num_tokens": 517689067.0, + "step": 13573 + }, + { + "epoch": 1.7267523215875844, + "ewc_loss": 0.04347142204642296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1735711925430223e-05, + "grad_norm": 28.592844009399414, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8722608089447021, + "num_tokens": 517729967.0, + "step": 13574 + }, + { + "epoch": 1.726879531866175, + "ewc_loss": 0.04344587400555611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.172293716284912e-05, + "grad_norm": 28.725017547607422, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.865009605884552, + "num_tokens": 517770427.0, + "step": 13575 + }, + { + "epoch": 1.7270067421447655, + "ewc_loss": 0.04354611411690712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1773057596874423e-05, + "grad_norm": 28.72139549255371, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8640356063842773, + "num_tokens": 517804202.0, + "step": 13576 + }, + { + "epoch": 1.7271339524233558, + "ewc_loss": 0.043497614562511444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174880683014635e-05, + "grad_norm": 28.69453239440918, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8582562208175659, + "num_tokens": 517834825.0, + "step": 13577 + }, + { + "epoch": 1.7272611627019463, + "ewc_loss": 0.04354363679885864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.177181886509061e-05, + "grad_norm": 28.786191940307617, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8577021956443787, + "num_tokens": 517874600.0, + "step": 13578 + }, + { + "epoch": 
1.7273883729805368, + "ewc_loss": 0.043503906577825546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1751953681814484e-05, + "grad_norm": 28.686927795410156, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8563605546951294, + "num_tokens": 517913488.0, + "step": 13579 + }, + { + "epoch": 1.7275155832591274, + "ewc_loss": 0.043558698147535324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.177934948122129e-05, + "grad_norm": 28.824596405029297, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8802537322044373, + "num_tokens": 517951094.0, + "step": 13580 + }, + { + "epoch": 1.7276427935377179, + "ewc_loss": 0.04353851079940796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1769255909021012e-05, + "grad_norm": 28.72733497619629, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8550239205360413, + "num_tokens": 517993811.0, + "step": 13581 + }, + { + "epoch": 1.7277700038163084, + "ewc_loss": 0.0435497872531414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17748929571826e-05, + "grad_norm": 28.793622970581055, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8691072463989258, + "num_tokens": 518032860.0, + "step": 13582 + }, + { + "epoch": 1.7278972140948987, + "ewc_loss": 0.043537430465221405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176871566916816e-05, + "grad_norm": 28.822938919067383, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.860946536064148, + "num_tokens": 518070256.0, + "step": 13583 + }, + { + "epoch": 1.7280244243734892, + "ewc_loss": 0.04354545474052429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1772726540802978e-05, + "grad_norm": 28.790985107421875, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8719841241836548, + "num_tokens": 518102828.0, + "step": 13584 + }, + { + "epoch": 1.7281516346520798, + "ewc_loss": 0.04350604861974716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1753025066573173e-05, + "grad_norm": 28.636335372924805, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8789396286010742, + "num_tokens": 518146859.0, + "step": 13585 + }, + { + "epoch": 1.7282788449306703, + "ewc_loss": 0.043514546006917953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1757272406830452e-05, + "grad_norm": 28.781169891357422, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.873160183429718, + "num_tokens": 518186173.0, + "step": 13586 + }, + { + "epoch": 1.7284060552092608, + "ewc_loss": 0.04359355941414833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1796779037686065e-05, + "grad_norm": 28.70064353942871, + "learning_rate": 1e-06, + "loss": 0.4823, + "mean_token_accuracy": 0.8498173356056213, + "num_tokens": 518221903.0, + "step": 13587 + }, + { + "epoch": 1.7285332654878514, + "ewc_loss": 0.04352480173110962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1762400137959048e-05, + "grad_norm": 28.782730102539062, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8565884232521057, + "num_tokens": 518258048.0, + "step": 13588 + }, + { + "epoch": 1.7286604757664419, + "ewc_loss": 0.04364364594221115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1821822883794084e-05, + "grad_norm": 28.777957916259766, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.857063889503479, + "num_tokens": 518299127.0, + "step": 13589 + }, + { + "epoch": 1.7287876860450324, + "ewc_loss": 0.04357264190912247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.178632166760508e-05, + "grad_norm": 28.739818572998047, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8761110305786133, + "num_tokens": 518336520.0, + "step": 13590 + }, + { + "epoch": 1.728914896323623, + "ewc_loss": 0.04347444698214531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173722350562457e-05, + "grad_norm": 28.809406280517578, + "learning_rate": 1e-06, + "loss": 0.4433, + 
"mean_token_accuracy": 0.8612711429595947, + "num_tokens": 518368484.0, + "step": 13591 + }, + { + "epoch": 1.7290421066022135, + "ewc_loss": 0.0435786247253418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.178931208618451e-05, + "grad_norm": 28.782621383666992, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8629852533340454, + "num_tokens": 518408439.0, + "step": 13592 + }, + { + "epoch": 1.729169316880804, + "ewc_loss": 0.043452996760606766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1726498744101264e-05, + "grad_norm": 28.819852828979492, + "learning_rate": 1e-06, + "loss": 0.4931, + "mean_token_accuracy": 0.8461626768112183, + "num_tokens": 518447414.0, + "step": 13593 + }, + { + "epoch": 1.7292965271593945, + "ewc_loss": 0.043565548956394196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1782774638268165e-05, + "grad_norm": 28.86014175415039, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.86147141456604, + "num_tokens": 518486455.0, + "step": 13594 + }, + { + "epoch": 1.729423737437985, + "ewc_loss": 0.04343057796359062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171528831240721e-05, + "grad_norm": 28.753129959106445, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8822952508926392, + "num_tokens": 518527023.0, + "step": 13595 + }, + { + "epoch": 1.7295509477165756, + "ewc_loss": 0.043499529361724854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174976543756202e-05, + "grad_norm": 28.851421356201172, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8598341941833496, + "num_tokens": 518563285.0, + "step": 13596 + }, + { + "epoch": 1.729678157995166, + "ewc_loss": 0.04346660524606705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1733301764470525e-05, + "grad_norm": 28.672266006469727, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8671271800994873, + "num_tokens": 518607610.0, + "step": 13597 + }, + { + "epoch": 
1.7298053682737566, + "ewc_loss": 0.04345414787530899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1727073544752784e-05, + "grad_norm": 28.749757766723633, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8741233348846436, + "num_tokens": 518649434.0, + "step": 13598 + }, + { + "epoch": 1.7299325785523472, + "ewc_loss": 0.04350941255688667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.175470581278205e-05, + "grad_norm": 28.70023536682129, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8734430074691772, + "num_tokens": 518684508.0, + "step": 13599 + }, + { + "epoch": 1.7300597888309377, + "ewc_loss": 0.043429795652627945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1714897229685448e-05, + "grad_norm": 28.759342193603516, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8717451095581055, + "num_tokens": 518717883.0, + "step": 13600 + }, + { + "epoch": 1.730186999109528, + "ewc_loss": 0.04345368221402168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.172684071410913e-05, + "grad_norm": 28.828628540039062, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8716030716896057, + "num_tokens": 518756158.0, + "step": 13601 + }, + { + "epoch": 1.7303142093881185, + "ewc_loss": 0.04347160458564758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17358028749004e-05, + "grad_norm": 28.71285057067871, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8660378456115723, + "num_tokens": 518793003.0, + "step": 13602 + }, + { + "epoch": 1.730441419666709, + "ewc_loss": 0.04349469766020775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1747348000644706e-05, + "grad_norm": 28.90030860900879, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8601295351982117, + "num_tokens": 518838485.0, + "step": 13603 + }, + { + "epoch": 1.7305686299452996, + "ewc_loss": 0.043592847883701324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1796424334752373e-05, + "grad_norm": 28.80497169494629, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8596489429473877, + "num_tokens": 518885936.0, + "step": 13604 + }, + { + "epoch": 1.7306958402238901, + "ewc_loss": 0.04345185309648514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1725925762439147e-05, + "grad_norm": 28.76018524169922, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.853521466255188, + "num_tokens": 518925684.0, + "step": 13605 + }, + { + "epoch": 1.7308230505024806, + "ewc_loss": 0.04347413033246994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173706525354646e-05, + "grad_norm": 28.80686378479004, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8687641620635986, + "num_tokens": 518966438.0, + "step": 13606 + }, + { + "epoch": 1.730950260781071, + "ewc_loss": 0.04351906478404999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1759531591669656e-05, + "grad_norm": 28.702924728393555, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8680540323257446, + "num_tokens": 519013576.0, + "step": 13607 + }, + { + "epoch": 1.7310774710596615, + "ewc_loss": 0.04354403167963028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.177201531594619e-05, + "grad_norm": 28.7972354888916, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8628679513931274, + "num_tokens": 519045342.0, + "step": 13608 + }, + { + "epoch": 1.731204681338252, + "ewc_loss": 0.043503496795892715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1751748136011884e-05, + "grad_norm": 28.75922393798828, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8515641093254089, + "num_tokens": 519088566.0, + "step": 13609 + }, + { + "epoch": 1.7313318916168425, + "ewc_loss": 0.04353276267647743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176638190576341e-05, + "grad_norm": 28.881988525390625, + "learning_rate": 1e-06, + "loss": 0.4275, + 
"mean_token_accuracy": 0.8687343597412109, + "num_tokens": 519126516.0, + "step": 13610 + }, + { + "epoch": 1.731459101895433, + "ewc_loss": 0.043442774564027786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17213873838773e-05, + "grad_norm": 28.747623443603516, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8743492364883423, + "num_tokens": 519165431.0, + "step": 13611 + }, + { + "epoch": 1.7315863121740236, + "ewc_loss": 0.04337251931428909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1686259060516022e-05, + "grad_norm": 28.696481704711914, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8813841342926025, + "num_tokens": 519200991.0, + "step": 13612 + }, + { + "epoch": 1.7317135224526141, + "ewc_loss": 0.04352175071835518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1760875824838877e-05, + "grad_norm": 28.811981201171875, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8730312585830688, + "num_tokens": 519245988.0, + "step": 13613 + }, + { + "epoch": 1.7318407327312046, + "ewc_loss": 0.04343315213918686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.171657615690492e-05, + "grad_norm": 28.789634704589844, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8708436489105225, + "num_tokens": 519282595.0, + "step": 13614 + }, + { + "epoch": 1.7319679430097952, + "ewc_loss": 0.043494679033756256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.174733890569769e-05, + "grad_norm": 28.706066131591797, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8595463037490845, + "num_tokens": 519319061.0, + "step": 13615 + }, + { + "epoch": 1.7320951532883857, + "ewc_loss": 0.0434315949678421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1715797629440203e-05, + "grad_norm": 28.78898811340332, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8630753755569458, + "num_tokens": 519358617.0, + "step": 13616 + }, + { + "epoch": 
1.7322223635669762, + "ewc_loss": 0.043588265776634216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1794132408103906e-05, + "grad_norm": 28.799345016479492, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8648682236671448, + "num_tokens": 519392461.0, + "step": 13617 + }, + { + "epoch": 1.7323495738455668, + "ewc_loss": 0.04347306117415428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173653047066182e-05, + "grad_norm": 28.725400924682617, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8736956119537354, + "num_tokens": 519432893.0, + "step": 13618 + }, + { + "epoch": 1.7324767841241573, + "ewc_loss": 0.04357011243700981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1785055650980212e-05, + "grad_norm": 28.811193466186523, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8667311668395996, + "num_tokens": 519473445.0, + "step": 13619 + }, + { + "epoch": 1.7326039944027478, + "ewc_loss": 0.043630678206682205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1815340005559847e-05, + "grad_norm": 28.70549201965332, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8665032386779785, + "num_tokens": 519518883.0, + "step": 13620 + }, + { + "epoch": 1.7327312046813383, + "ewc_loss": 0.043453484773635864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.172674248868134e-05, + "grad_norm": 28.667306900024414, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.870846688747406, + "num_tokens": 519550123.0, + "step": 13621 + }, + { + "epoch": 1.7328584149599289, + "ewc_loss": 0.04359891265630722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1799456590088084e-05, + "grad_norm": 28.843782424926758, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8561994433403015, + "num_tokens": 519592538.0, + "step": 13622 + }, + { + "epoch": 1.7329856252385194, + "ewc_loss": 0.04359803348779678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1799016394652426e-05, + "grad_norm": 28.62442398071289, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8657034039497375, + "num_tokens": 519633466.0, + "step": 13623 + }, + { + "epoch": 1.73311283551711, + "ewc_loss": 0.04353594407439232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176797170250211e-05, + "grad_norm": 28.79817008972168, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8622808456420898, + "num_tokens": 519672569.0, + "step": 13624 + }, + { + "epoch": 1.7332400457957005, + "ewc_loss": 0.043637506663799286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.18187524296809e-05, + "grad_norm": 28.734224319458008, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8716200590133667, + "num_tokens": 519707383.0, + "step": 13625 + }, + { + "epoch": 1.7333672560742908, + "ewc_loss": 0.04350657016038895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.175328518205788e-05, + "grad_norm": 28.7243595123291, + "learning_rate": 1e-06, + "loss": 0.4926, + "mean_token_accuracy": 0.847322404384613, + "num_tokens": 519748187.0, + "step": 13626 + }, + { + "epoch": 1.7334944663528813, + "ewc_loss": 0.0435861200094223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1793059204355814e-05, + "grad_norm": 28.70348358154297, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8630369901657104, + "num_tokens": 519790215.0, + "step": 13627 + }, + { + "epoch": 1.7336216766314718, + "ewc_loss": 0.043545715510845184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1772857508040033e-05, + "grad_norm": 28.724153518676758, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8579447269439697, + "num_tokens": 519827278.0, + "step": 13628 + }, + { + "epoch": 1.7337488869100623, + "ewc_loss": 0.04353330284357071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1766651116195135e-05, + "grad_norm": 28.635910034179688, + "learning_rate": 1e-06, + "loss": 0.418, + 
"mean_token_accuracy": 0.8715513944625854, + "num_tokens": 519860216.0, + "step": 13629 + }, + { + "epoch": 1.7338760971886529, + "ewc_loss": 0.04349694401025772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1748472136096098e-05, + "grad_norm": 28.721206665039062, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8654615879058838, + "num_tokens": 519894442.0, + "step": 13630 + }, + { + "epoch": 1.7340033074672434, + "ewc_loss": 0.043564461171627045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1782230760436505e-05, + "grad_norm": 28.700376510620117, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8659181594848633, + "num_tokens": 519932524.0, + "step": 13631 + }, + { + "epoch": 1.7341305177458337, + "ewc_loss": 0.043574873358011246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1787436708109453e-05, + "grad_norm": 28.7053165435791, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8653887510299683, + "num_tokens": 519974366.0, + "step": 13632 + }, + { + "epoch": 1.7342577280244242, + "ewc_loss": 0.043594177812337875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1797088265884668e-05, + "grad_norm": 28.831449508666992, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8759497404098511, + "num_tokens": 520013983.0, + "step": 13633 + }, + { + "epoch": 1.7343849383030148, + "ewc_loss": 0.04359592869877815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1797965018777177e-05, + "grad_norm": 28.835832595825195, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8692120313644409, + "num_tokens": 520047671.0, + "step": 13634 + }, + { + "epoch": 1.7345121485816053, + "ewc_loss": 0.04360842704772949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1804213247378357e-05, + "grad_norm": 28.901037216186523, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8622344732284546, + "num_tokens": 520086574.0, + "step": 13635 + }, + { + "epoch": 
1.7346393588601958, + "ewc_loss": 0.04359034448862076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1795172870042734e-05, + "grad_norm": 28.763805389404297, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8513484597206116, + "num_tokens": 520130613.0, + "step": 13636 + }, + { + "epoch": 1.7347665691387864, + "ewc_loss": 0.04355848208069801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1779240341857076e-05, + "grad_norm": 28.813873291015625, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8831735253334045, + "num_tokens": 520163763.0, + "step": 13637 + }, + { + "epoch": 1.7348937794173769, + "ewc_loss": 0.04363992437720299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1819962057634257e-05, + "grad_norm": 28.860830307006836, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8631840348243713, + "num_tokens": 520198663.0, + "step": 13638 + }, + { + "epoch": 1.7350209896959674, + "ewc_loss": 0.04349704459309578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1748523067799397e-05, + "grad_norm": 28.828296661376953, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8685183525085449, + "num_tokens": 520231737.0, + "step": 13639 + }, + { + "epoch": 1.735148199974558, + "ewc_loss": 0.043553948402404785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1776973881060258e-05, + "grad_norm": 28.784271240234375, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8709489703178406, + "num_tokens": 520275664.0, + "step": 13640 + }, + { + "epoch": 1.7352754102531485, + "ewc_loss": 0.04351503774523735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.175751797039993e-05, + "grad_norm": 28.760108947753906, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8545382022857666, + "num_tokens": 520316100.0, + "step": 13641 + }, + { + "epoch": 1.735402620531739, + "ewc_loss": 0.04359428584575653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1797142835566774e-05, + "grad_norm": 28.98806381225586, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8632266521453857, + "num_tokens": 520348167.0, + "step": 13642 + }, + { + "epoch": 1.7355298308103295, + "ewc_loss": 0.043523356318473816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176167799916584e-05, + "grad_norm": 28.62784194946289, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8520506620407104, + "num_tokens": 520385234.0, + "step": 13643 + }, + { + "epoch": 1.73565704108892, + "ewc_loss": 0.043567344546318054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1783671400044113e-05, + "grad_norm": 28.965618133544922, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8500112295150757, + "num_tokens": 520423051.0, + "step": 13644 + }, + { + "epoch": 1.7357842513675106, + "ewc_loss": 0.043636031448841095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1818015738972463e-05, + "grad_norm": 28.813085556030273, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8563051819801331, + "num_tokens": 520467202.0, + "step": 13645 + }, + { + "epoch": 1.735911461646101, + "ewc_loss": 0.04346904158592224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.17345204873709e-05, + "grad_norm": 28.865625381469727, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8637796640396118, + "num_tokens": 520501434.0, + "step": 13646 + }, + { + "epoch": 1.7360386719246916, + "ewc_loss": 0.04360178858041763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1800893591716886e-05, + "grad_norm": 28.80437660217285, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8617544770240784, + "num_tokens": 520542757.0, + "step": 13647 + }, + { + "epoch": 1.7361658822032822, + "ewc_loss": 0.043425917625427246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1712958186981268e-05, + "grad_norm": 28.766254425048828, + "learning_rate": 1e-06, + "loss": 0.4512, + 
"mean_token_accuracy": 0.856269359588623, + "num_tokens": 520585216.0, + "step": 13648 + }, + { + "epoch": 1.7362930924818727, + "ewc_loss": 0.043614838272333145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1807418306707405e-05, + "grad_norm": 28.930004119873047, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8661765456199646, + "num_tokens": 520620066.0, + "step": 13649 + }, + { + "epoch": 1.736420302760463, + "ewc_loss": 0.043559085577726364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1779542294098064e-05, + "grad_norm": 28.78461456298828, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8732645511627197, + "num_tokens": 520660917.0, + "step": 13650 + }, + { + "epoch": 1.7365475130390535, + "ewc_loss": 0.043490342795848846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1745170670328662e-05, + "grad_norm": 28.90618896484375, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.857651948928833, + "num_tokens": 520704255.0, + "step": 13651 + }, + { + "epoch": 1.736674723317644, + "ewc_loss": 0.0435488298535347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.177441456296947e-05, + "grad_norm": 28.726642608642578, + "learning_rate": 1e-06, + "loss": 0.4847, + "mean_token_accuracy": 0.8525207042694092, + "num_tokens": 520743036.0, + "step": 13652 + }, + { + "epoch": 1.7368019335962346, + "ewc_loss": 0.043503500521183014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1751749955001287e-05, + "grad_norm": 28.88421630859375, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8597883582115173, + "num_tokens": 520778325.0, + "step": 13653 + }, + { + "epoch": 1.736929143874825, + "ewc_loss": 0.0435832254588604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1791613107779995e-05, + "grad_norm": 28.894250869750977, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8777323365211487, + "num_tokens": 520815226.0, + "step": 13654 + }, + { + "epoch": 
1.7370563541534156, + "ewc_loss": 0.043567195534706116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.178359864046797e-05, + "grad_norm": 28.861848831176758, + "learning_rate": 1e-06, + "loss": 0.5055, + "mean_token_accuracy": 0.8473877310752869, + "num_tokens": 520856606.0, + "step": 13655 + }, + { + "epoch": 1.737183564432006, + "ewc_loss": 0.04348449409008026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1742247554357164e-05, + "grad_norm": 28.764995574951172, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8560714721679688, + "num_tokens": 520893493.0, + "step": 13656 + }, + { + "epoch": 1.7373107747105965, + "ewc_loss": 0.04353935644030571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1769677914562635e-05, + "grad_norm": 28.826154708862305, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8738878965377808, + "num_tokens": 520930391.0, + "step": 13657 + }, + { + "epoch": 1.737437984989187, + "ewc_loss": 0.04355065897107124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1775329514639452e-05, + "grad_norm": 28.886322021484375, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8656286597251892, + "num_tokens": 520970512.0, + "step": 13658 + }, + { + "epoch": 1.7375651952677775, + "ewc_loss": 0.04351738840341568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1758694856544025e-05, + "grad_norm": 28.728870391845703, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8582882881164551, + "num_tokens": 521011787.0, + "step": 13659 + }, + { + "epoch": 1.737692405546368, + "ewc_loss": 0.043452393263578415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1726196791860275e-05, + "grad_norm": 28.8714656829834, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8659045696258545, + "num_tokens": 521053494.0, + "step": 13660 + }, + { + "epoch": 1.7378196158249586, + "ewc_loss": 0.04356744512915611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1783722331747413e-05, + "grad_norm": 28.786081314086914, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8541824221611023, + "num_tokens": 521089291.0, + "step": 13661 + }, + { + "epoch": 1.7379468261035491, + "ewc_loss": 0.043456416577100754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1728208594140597e-05, + "grad_norm": 28.89657974243164, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8768389225006104, + "num_tokens": 521128910.0, + "step": 13662 + }, + { + "epoch": 1.7380740363821396, + "ewc_loss": 0.043557360768318176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1778680093120784e-05, + "grad_norm": 28.873607635498047, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8625196218490601, + "num_tokens": 521168175.0, + "step": 13663 + }, + { + "epoch": 1.7382012466607302, + "ewc_loss": 0.043449483811855316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1724741600337438e-05, + "grad_norm": 28.89163589477539, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8615211248397827, + "num_tokens": 521208020.0, + "step": 13664 + }, + { + "epoch": 1.7383284569393207, + "ewc_loss": 0.043479204177856445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1739602743764408e-05, + "grad_norm": 28.802213668823242, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8779270648956299, + "num_tokens": 521249828.0, + "step": 13665 + }, + { + "epoch": 1.7384556672179112, + "ewc_loss": 0.043486274778842926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1743137040175498e-05, + "grad_norm": 28.861316680908203, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8692275285720825, + "num_tokens": 521289940.0, + "step": 13666 + }, + { + "epoch": 1.7385828774965018, + "ewc_loss": 0.04348352923989296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1741765522165224e-05, + "grad_norm": 28.72121810913086, + "learning_rate": 1e-06, + "loss": 0.3776, + 
"mean_token_accuracy": 0.8830053806304932, + "num_tokens": 521327426.0, + "step": 13667 + }, + { + "epoch": 1.7387100877750923, + "ewc_loss": 0.0435267798602581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1763389668194577e-05, + "grad_norm": 28.977893829345703, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8557002544403076, + "num_tokens": 521372112.0, + "step": 13668 + }, + { + "epoch": 1.7388372980536828, + "ewc_loss": 0.04356066882610321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1780333554488607e-05, + "grad_norm": 28.78550148010254, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8644805550575256, + "num_tokens": 521410842.0, + "step": 13669 + }, + { + "epoch": 1.7389645083322733, + "ewc_loss": 0.043450064957141876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1725032638642006e-05, + "grad_norm": 28.874662399291992, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8755319714546204, + "num_tokens": 521446854.0, + "step": 13670 + }, + { + "epoch": 1.7390917186108639, + "ewc_loss": 0.04353345185518265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.176672569476068e-05, + "grad_norm": 28.911104202270508, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8558807373046875, + "num_tokens": 521486284.0, + "step": 13671 + }, + { + "epoch": 1.7392189288894544, + "ewc_loss": 0.04349983483552933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1749918232671916e-05, + "grad_norm": 28.800790786743164, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8532819747924805, + "num_tokens": 521522716.0, + "step": 13672 + }, + { + "epoch": 1.739346139168045, + "ewc_loss": 0.043472230434417725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.173611574107781e-05, + "grad_norm": 28.8780460357666, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8591599464416504, + "num_tokens": 521551549.0, + "step": 13673 + }, + { + "epoch": 
1.7394733494466355, + "ewc_loss": 0.04351063445210457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.175531699322164e-05, + "grad_norm": 28.58664894104004, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8678137063980103, + "num_tokens": 521589552.0, + "step": 13674 + }, + { + "epoch": 1.7396005597252258, + "ewc_loss": 0.04350493848323822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1752468455815688e-05, + "grad_norm": 28.883573532104492, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8772780895233154, + "num_tokens": 521627906.0, + "step": 13675 + }, + { + "epoch": 1.7397277700038163, + "ewc_loss": 0.043654266744852066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1827132513863035e-05, + "grad_norm": 28.719131469726562, + "learning_rate": 1e-06, + "loss": 0.4908, + "mean_token_accuracy": 0.849758505821228, + "num_tokens": 521666615.0, + "step": 13676 + }, + { + "epoch": 1.7398549802824068, + "ewc_loss": 0.04344220459461212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1721101802540943e-05, + "grad_norm": 28.90583610534668, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8618295192718506, + "num_tokens": 521707742.0, + "step": 13677 + }, + { + "epoch": 1.7399821905609973, + "ewc_loss": 0.04353934898972511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1769674276583828e-05, + "grad_norm": 28.650402069091797, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8554564714431763, + "num_tokens": 521746848.0, + "step": 13678 + }, + { + "epoch": 1.7401094008395879, + "ewc_loss": 0.04350091889500618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1750460291514173e-05, + "grad_norm": 28.98402214050293, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8671091198921204, + "num_tokens": 521782917.0, + "step": 13679 + }, + { + "epoch": 1.7402366111181784, + "ewc_loss": 0.04363321140408516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1816606022184715e-05, + "grad_norm": 28.687562942504883, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.862525999546051, + "num_tokens": 521825609.0, + "step": 13680 + }, + { + "epoch": 1.7403638213967687, + "ewc_loss": 0.04348978027701378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1744890545960516e-05, + "grad_norm": 28.789424896240234, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8786089420318604, + "num_tokens": 521862277.0, + "step": 13681 + }, + { + "epoch": 1.7404910316753592, + "ewc_loss": 0.043666332960128784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1833166101714596e-05, + "grad_norm": 28.890913009643555, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8832066655158997, + "num_tokens": 521894951.0, + "step": 13682 + }, + { + "epoch": 1.7406182419539498, + "ewc_loss": 0.04354085400700569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1770427338196896e-05, + "grad_norm": 28.786619186401367, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.865575909614563, + "num_tokens": 521932158.0, + "step": 13683 + }, + { + "epoch": 1.7407454522325403, + "ewc_loss": 0.04353228583931923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1766143618151546e-05, + "grad_norm": 28.783842086791992, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8714726567268372, + "num_tokens": 521971050.0, + "step": 13684 + }, + { + "epoch": 1.7408726625111308, + "ewc_loss": 0.04357880353927612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1789401216665283e-05, + "grad_norm": 28.750661849975586, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8763724565505981, + "num_tokens": 522007617.0, + "step": 13685 + }, + { + "epoch": 1.7409998727897213, + "ewc_loss": 0.04362731799483299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.181365925935097e-05, + "grad_norm": 28.921842575073242, + "learning_rate": 1e-06, + "loss": 0.4683, + 
"mean_token_accuracy": 0.8534521460533142, + "num_tokens": 522053484.0, + "step": 13686 + }, + { + "epoch": 1.7411270830683119, + "ewc_loss": 0.043557628989219666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1778814698336646e-05, + "grad_norm": 28.58578872680664, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8615012764930725, + "num_tokens": 522088047.0, + "step": 13687 + }, + { + "epoch": 1.7412542933469024, + "ewc_loss": 0.04356703907251358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1783518604934216e-05, + "grad_norm": 28.971593856811523, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8640580773353577, + "num_tokens": 522129865.0, + "step": 13688 + }, + { + "epoch": 1.741381503625493, + "ewc_loss": 0.04352937266230583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1764686607639305e-05, + "grad_norm": 28.575355529785156, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8605517148971558, + "num_tokens": 522170154.0, + "step": 13689 + }, + { + "epoch": 1.7415087139040835, + "ewc_loss": 0.0435190312564373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1759515220765024e-05, + "grad_norm": 29.015289306640625, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8890787363052368, + "num_tokens": 522204586.0, + "step": 13690 + }, + { + "epoch": 1.741635924182674, + "ewc_loss": 0.04371843487024307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.185921766795218e-05, + "grad_norm": 28.690839767456055, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8698223829269409, + "num_tokens": 522241942.0, + "step": 13691 + }, + { + "epoch": 1.7417631344612645, + "ewc_loss": 0.04343826323747635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1719131837016903e-05, + "grad_norm": 28.826492309570312, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8616548776626587, + "num_tokens": 522275800.0, + "step": 13692 + }, + { + "epoch": 
1.741890344739855, + "ewc_loss": 0.043725017458200455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1862508219783194e-05, + "grad_norm": 28.77372169494629, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8594879508018494, + "num_tokens": 522317024.0, + "step": 13693 + }, + { + "epoch": 1.7420175550184456, + "ewc_loss": 0.043533362448215485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1766682039014995e-05, + "grad_norm": 28.79448699951172, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8759154677391052, + "num_tokens": 522358510.0, + "step": 13694 + }, + { + "epoch": 1.742144765297036, + "ewc_loss": 0.04357142746448517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1785714125144295e-05, + "grad_norm": 28.829437255859375, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.858899712562561, + "num_tokens": 522395960.0, + "step": 13695 + }, + { + "epoch": 1.7422719755756266, + "ewc_loss": 0.043519627302885056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1759813535027206e-05, + "grad_norm": 28.67156982421875, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.872969925403595, + "num_tokens": 522428807.0, + "step": 13696 + }, + { + "epoch": 1.7423991858542172, + "ewc_loss": 0.04363620653748512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1818103050463833e-05, + "grad_norm": 28.92159080505371, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8663733005523682, + "num_tokens": 522464472.0, + "step": 13697 + }, + { + "epoch": 1.7425263961328077, + "ewc_loss": 0.04352802038192749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1764009943581186e-05, + "grad_norm": 28.665760040283203, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8789052963256836, + "num_tokens": 522509522.0, + "step": 13698 + }, + { + "epoch": 1.742653606411398, + "ewc_loss": 0.043528299778699875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.176415000576526e-05, + "grad_norm": 28.703832626342773, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8508802652359009, + "num_tokens": 522545581.0, + "step": 13699 + }, + { + "epoch": 1.7427808166899885, + "ewc_loss": 0.043608855456113815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1804427888127975e-05, + "grad_norm": 28.797279357910156, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8688359260559082, + "num_tokens": 522582072.0, + "step": 13700 + }, + { + "epoch": 1.742908026968579, + "ewc_loss": 0.04359272122383118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.179636067012325e-05, + "grad_norm": 28.777711868286133, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8697773814201355, + "num_tokens": 522618183.0, + "step": 13701 + }, + { + "epoch": 1.7430352372471696, + "ewc_loss": 0.04359400272369385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1797000954393297e-05, + "grad_norm": 28.726341247558594, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8541104793548584, + "num_tokens": 522660578.0, + "step": 13702 + }, + { + "epoch": 1.74316244752576, + "ewc_loss": 0.04352497309446335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1762485630461015e-05, + "grad_norm": 28.766897201538086, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.877730667591095, + "num_tokens": 522701934.0, + "step": 13703 + }, + { + "epoch": 1.7432896578043506, + "ewc_loss": 0.04366041347384453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1830206605955027e-05, + "grad_norm": 28.738080978393555, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8835365176200867, + "num_tokens": 522737404.0, + "step": 13704 + }, + { + "epoch": 1.743416868082941, + "ewc_loss": 0.04357397183775902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1786985598737374e-05, + "grad_norm": 28.760318756103516, + "learning_rate": 1e-06, + "loss": 0.4421, + 
"mean_token_accuracy": 0.8614635467529297, + "num_tokens": 522774188.0, + "step": 13705 + }, + { + "epoch": 1.7435440783615315, + "ewc_loss": 0.043655551970005035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.182777643611189e-05, + "grad_norm": 28.817697525024414, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8741657733917236, + "num_tokens": 522810467.0, + "step": 13706 + }, + { + "epoch": 1.743671288640122, + "ewc_loss": 0.04368284344673157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.184142249461729e-05, + "grad_norm": 28.887897491455078, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.855774998664856, + "num_tokens": 522845246.0, + "step": 13707 + }, + { + "epoch": 1.7437984989187125, + "ewc_loss": 0.04360094666481018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1800473405164666e-05, + "grad_norm": 28.758617401123047, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8760013580322266, + "num_tokens": 522877150.0, + "step": 13708 + }, + { + "epoch": 1.743925709197303, + "ewc_loss": 0.04359514266252518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.179757211706601e-05, + "grad_norm": 28.770545959472656, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8806408643722534, + "num_tokens": 522913161.0, + "step": 13709 + }, + { + "epoch": 1.7440529194758936, + "ewc_loss": 0.04359142482280731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1795713109895587e-05, + "grad_norm": 28.76671600341797, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8528159856796265, + "num_tokens": 522953464.0, + "step": 13710 + }, + { + "epoch": 1.744180129754484, + "ewc_loss": 0.04360799491405487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1803996787639335e-05, + "grad_norm": 28.799427032470703, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8697813749313354, + "num_tokens": 522996874.0, + "step": 13711 + }, + { + "epoch": 
1.7443073400330746, + "ewc_loss": 0.04361755773425102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1808778910781257e-05, + "grad_norm": 28.76381492614746, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8690524101257324, + "num_tokens": 523031349.0, + "step": 13712 + }, + { + "epoch": 1.7444345503116652, + "ewc_loss": 0.043622937053442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.18114691961091e-05, + "grad_norm": 28.688295364379883, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8591360449790955, + "num_tokens": 523068910.0, + "step": 13713 + }, + { + "epoch": 1.7445617605902557, + "ewc_loss": 0.04367436468601227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1837182430317625e-05, + "grad_norm": 28.65048599243164, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8793155550956726, + "num_tokens": 523106179.0, + "step": 13714 + }, + { + "epoch": 1.7446889708688462, + "ewc_loss": 0.04367499426007271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.183749711548444e-05, + "grad_norm": 28.686031341552734, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8683377504348755, + "num_tokens": 523140827.0, + "step": 13715 + }, + { + "epoch": 1.7448161811474368, + "ewc_loss": 0.043750833719968796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187541758758016e-05, + "grad_norm": 28.711591720581055, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8796377182006836, + "num_tokens": 523182880.0, + "step": 13716 + }, + { + "epoch": 1.7449433914260273, + "ewc_loss": 0.04367698356509209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.183849210268818e-05, + "grad_norm": 28.828479766845703, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8687861561775208, + "num_tokens": 523213059.0, + "step": 13717 + }, + { + "epoch": 1.7450706017046178, + "ewc_loss": 0.04374397173523903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1871985154575668e-05, + "grad_norm": 28.835275650024414, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8643238544464111, + "num_tokens": 523254398.0, + "step": 13718 + }, + { + "epoch": 1.7451978119832083, + "ewc_loss": 0.043668437749147415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.183421929657925e-05, + "grad_norm": 28.728845596313477, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8780506253242493, + "num_tokens": 523293325.0, + "step": 13719 + }, + { + "epoch": 1.7453250222617989, + "ewc_loss": 0.04369505122303963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1847525204066187e-05, + "grad_norm": 28.798601150512695, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8561538457870483, + "num_tokens": 523334402.0, + "step": 13720 + }, + { + "epoch": 1.7454522325403894, + "ewc_loss": 0.04373232647776604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1866162569494918e-05, + "grad_norm": 28.82172966003418, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8713197708129883, + "num_tokens": 523376238.0, + "step": 13721 + }, + { + "epoch": 1.74557944281898, + "ewc_loss": 0.043620843440294266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.181042145821266e-05, + "grad_norm": 28.75899887084961, + "learning_rate": 1e-06, + "loss": 0.3644, + "mean_token_accuracy": 0.8880898952484131, + "num_tokens": 523417074.0, + "step": 13722 + }, + { + "epoch": 1.7457066530975704, + "ewc_loss": 0.043713610619306564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1856805688003078e-05, + "grad_norm": 28.787261962890625, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8734408617019653, + "num_tokens": 523457984.0, + "step": 13723 + }, + { + "epoch": 1.7458338633761608, + "ewc_loss": 0.04367585480213165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.183792821597308e-05, + "grad_norm": 28.8072566986084, + "learning_rate": 1e-06, + "loss": 0.4528, + 
"mean_token_accuracy": 0.8611351251602173, + "num_tokens": 523490071.0, + "step": 13724 + }, + { + "epoch": 1.7459610736547513, + "ewc_loss": 0.043679364025592804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1839681721758097e-05, + "grad_norm": 28.747100830078125, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8595074415206909, + "num_tokens": 523530105.0, + "step": 13725 + }, + { + "epoch": 1.7460882839333418, + "ewc_loss": 0.043678976595401764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.183948890888132e-05, + "grad_norm": 28.638212203979492, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.880466639995575, + "num_tokens": 523576698.0, + "step": 13726 + }, + { + "epoch": 1.7462154942119323, + "ewc_loss": 0.043706294149160385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1853147700312547e-05, + "grad_norm": 28.84654998779297, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8664800524711609, + "num_tokens": 523614387.0, + "step": 13727 + }, + { + "epoch": 1.7463427044905229, + "ewc_loss": 0.04380958899855614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1904794266447425e-05, + "grad_norm": 28.73351287841797, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8621900081634521, + "num_tokens": 523651149.0, + "step": 13728 + }, + { + "epoch": 1.7464699147691134, + "ewc_loss": 0.04365590587258339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1827952878084034e-05, + "grad_norm": 28.82697296142578, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8781489133834839, + "num_tokens": 523688448.0, + "step": 13729 + }, + { + "epoch": 1.7465971250477037, + "ewc_loss": 0.043755631893873215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1877816834603436e-05, + "grad_norm": 28.875755310058594, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8705015182495117, + "num_tokens": 523725805.0, + "step": 13730 + }, + { + "epoch": 
1.7467243353262942, + "ewc_loss": 0.04361601173877716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.180800584028475e-05, + "grad_norm": 28.849096298217773, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.867098331451416, + "num_tokens": 523763564.0, + "step": 13731 + }, + { + "epoch": 1.7468515456048848, + "ewc_loss": 0.0436655655503273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1832782294950448e-05, + "grad_norm": 28.748144149780273, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8610663414001465, + "num_tokens": 523804422.0, + "step": 13732 + }, + { + "epoch": 1.7469787558834753, + "ewc_loss": 0.04363361746072769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1816807930008508e-05, + "grad_norm": 28.89504051208496, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8754606246948242, + "num_tokens": 523845889.0, + "step": 13733 + }, + { + "epoch": 1.7471059661620658, + "ewc_loss": 0.0436447374522686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1822368580615148e-05, + "grad_norm": 28.71240234375, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8779795169830322, + "num_tokens": 523883623.0, + "step": 13734 + }, + { + "epoch": 1.7472331764406563, + "ewc_loss": 0.04359506070613861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.179753028030973e-05, + "grad_norm": 28.915637969970703, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8736890554428101, + "num_tokens": 523917273.0, + "step": 13735 + }, + { + "epoch": 1.7473603867192469, + "ewc_loss": 0.0437304824590683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.186524034186732e-05, + "grad_norm": 28.759735107421875, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8636513352394104, + "num_tokens": 523956149.0, + "step": 13736 + }, + { + "epoch": 1.7474875969978374, + "ewc_loss": 0.04354991763830185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.177495844080113e-05, + "grad_norm": 28.88291358947754, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8635377287864685, + "num_tokens": 523997229.0, + "step": 13737 + }, + { + "epoch": 1.747614807276428, + "ewc_loss": 0.04361489415168762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.180744741053786e-05, + "grad_norm": 28.567140579223633, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8693689703941345, + "num_tokens": 524032921.0, + "step": 13738 + }, + { + "epoch": 1.7477420175550185, + "ewc_loss": 0.043642137199640274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1821068003191613e-05, + "grad_norm": 28.90520477294922, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8690999746322632, + "num_tokens": 524065016.0, + "step": 13739 + }, + { + "epoch": 1.747869227833609, + "ewc_loss": 0.04383076727390289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.191538442275487e-05, + "grad_norm": 28.812503814697266, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8555839657783508, + "num_tokens": 524107001.0, + "step": 13740 + }, + { + "epoch": 1.7479964381121995, + "ewc_loss": 0.04361414536833763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.180707269872073e-05, + "grad_norm": 28.854854583740234, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8604736924171448, + "num_tokens": 524149283.0, + "step": 13741 + }, + { + "epoch": 1.74812364839079, + "ewc_loss": 0.04367638751864433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1838193788425997e-05, + "grad_norm": 28.78774070739746, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8670529723167419, + "num_tokens": 524187240.0, + "step": 13742 + }, + { + "epoch": 1.7482508586693806, + "ewc_loss": 0.04362179711461067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1810898033436388e-05, + "grad_norm": 28.859241485595703, + "learning_rate": 1e-06, + "loss": 0.3986, + 
"mean_token_accuracy": 0.8757467269897461, + "num_tokens": 524216793.0, + "step": 13743 + }, + { + "epoch": 1.748378068947971, + "ewc_loss": 0.043751634657382965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187581776524894e-05, + "grad_norm": 28.82661247253418, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8728569746017456, + "num_tokens": 524257629.0, + "step": 13744 + }, + { + "epoch": 1.7485052792265616, + "ewc_loss": 0.04360652714967728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1803263734909706e-05, + "grad_norm": 28.827547073364258, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8786200284957886, + "num_tokens": 524291090.0, + "step": 13745 + }, + { + "epoch": 1.7486324895051522, + "ewc_loss": 0.04369042441248894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1845211449544877e-05, + "grad_norm": 28.741003036499023, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8797441720962524, + "num_tokens": 524324882.0, + "step": 13746 + }, + { + "epoch": 1.7487596997837427, + "ewc_loss": 0.04359268397092819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1796342480229214e-05, + "grad_norm": 28.862829208374023, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8513175249099731, + "num_tokens": 524359422.0, + "step": 13747 + }, + { + "epoch": 1.748886910062333, + "ewc_loss": 0.04368313029408455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1841564375790767e-05, + "grad_norm": 28.716022491455078, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8726766109466553, + "num_tokens": 524398383.0, + "step": 13748 + }, + { + "epoch": 1.7490141203409235, + "ewc_loss": 0.04368579760193825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.184289951401297e-05, + "grad_norm": 28.904813766479492, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8635735511779785, + "num_tokens": 524437984.0, + "step": 13749 + }, + { + "epoch": 
1.749141330619514, + "ewc_loss": 0.04362761229276657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1813806597492658e-05, + "grad_norm": 28.710002899169922, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8727369904518127, + "num_tokens": 524471324.0, + "step": 13750 + }, + { + "epoch": 1.7492685408981046, + "ewc_loss": 0.04368162155151367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.18408113141777e-05, + "grad_norm": 29.05657196044922, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8580468893051147, + "num_tokens": 524511471.0, + "step": 13751 + }, + { + "epoch": 1.749395751176695, + "ewc_loss": 0.04378941282629967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.189470615121536e-05, + "grad_norm": 28.817073822021484, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8702924847602844, + "num_tokens": 524541743.0, + "step": 13752 + }, + { + "epoch": 1.7495229614552856, + "ewc_loss": 0.04352426528930664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1762132746516727e-05, + "grad_norm": 28.909677505493164, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8673639297485352, + "num_tokens": 524581470.0, + "step": 13753 + }, + { + "epoch": 1.749650171733876, + "ewc_loss": 0.04381135106086731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.190567465731874e-05, + "grad_norm": 28.89964485168457, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8699343204498291, + "num_tokens": 524612722.0, + "step": 13754 + }, + { + "epoch": 1.7497773820124665, + "ewc_loss": 0.04363301023840904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.181650597776752e-05, + "grad_norm": 28.971153259277344, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8763085603713989, + "num_tokens": 524647640.0, + "step": 13755 + }, + { + "epoch": 1.749904592291057, + "ewc_loss": 0.043710190802812576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1855095837963745e-05, + "grad_norm": 28.982942581176758, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8605951070785522, + "num_tokens": 524682366.0, + "step": 13756 + }, + { + "epoch": 1.7500318025696475, + "ewc_loss": 0.0435527041554451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1776351786684245e-05, + "grad_norm": 28.88062858581543, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8663565516471863, + "num_tokens": 524724242.0, + "step": 13757 + }, + { + "epoch": 1.750159012848238, + "ewc_loss": 0.04366382956504822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1831914637004957e-05, + "grad_norm": 28.891305923461914, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8542047739028931, + "num_tokens": 524757373.0, + "step": 13758 + }, + { + "epoch": 1.7502862231268286, + "ewc_loss": 0.04366310313344002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1831550839124247e-05, + "grad_norm": 28.966861724853516, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8571622967720032, + "num_tokens": 524797704.0, + "step": 13759 + }, + { + "epoch": 1.750413433405419, + "ewc_loss": 0.04365173727273941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.182586831622757e-05, + "grad_norm": 28.8512020111084, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8698900938034058, + "num_tokens": 524836710.0, + "step": 13760 + }, + { + "epoch": 1.7505406436840096, + "ewc_loss": 0.04366427659988403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1832138372701593e-05, + "grad_norm": 28.956396102905273, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8645551204681396, + "num_tokens": 524870224.0, + "step": 13761 + }, + { + "epoch": 1.7506678539626002, + "ewc_loss": 0.04371744394302368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1858722902834415e-05, + "grad_norm": 28.837621688842773, + "learning_rate": 1e-06, + "loss": 0.427, + 
"mean_token_accuracy": 0.8697832226753235, + "num_tokens": 524905345.0, + "step": 13762 + }, + { + "epoch": 1.7507950642411907, + "ewc_loss": 0.04359593614935875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1797968656755984e-05, + "grad_norm": 28.82306480407715, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8510783910751343, + "num_tokens": 524943893.0, + "step": 13763 + }, + { + "epoch": 1.7509222745197812, + "ewc_loss": 0.04369105398654938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.184552613471169e-05, + "grad_norm": 28.8894100189209, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8653517961502075, + "num_tokens": 524982924.0, + "step": 13764 + }, + { + "epoch": 1.7510494847983717, + "ewc_loss": 0.04367387667298317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.183693868573755e-05, + "grad_norm": 28.91414451599121, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8659224510192871, + "num_tokens": 525016749.0, + "step": 13765 + }, + { + "epoch": 1.7511766950769623, + "ewc_loss": 0.04369095340371132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1845477021997795e-05, + "grad_norm": 28.9035701751709, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8689604997634888, + "num_tokens": 525049178.0, + "step": 13766 + }, + { + "epoch": 1.7513039053555528, + "ewc_loss": 0.04368429258465767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1842146452399902e-05, + "grad_norm": 28.780881881713867, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8529751300811768, + "num_tokens": 525083456.0, + "step": 13767 + }, + { + "epoch": 1.7514311156341433, + "ewc_loss": 0.04359504580497742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1797523004352115e-05, + "grad_norm": 28.863386154174805, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8692950010299683, + "num_tokens": 525120159.0, + "step": 13768 + }, + { + "epoch": 
1.7515583259127339, + "ewc_loss": 0.04369741678237915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1848707547178492e-05, + "grad_norm": 28.74728775024414, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8720765709877014, + "num_tokens": 525159174.0, + "step": 13769 + }, + { + "epoch": 1.7516855361913244, + "ewc_loss": 0.04373873770236969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.186936944781337e-05, + "grad_norm": 29.017765045166016, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8643155097961426, + "num_tokens": 525198373.0, + "step": 13770 + }, + { + "epoch": 1.751812746469915, + "ewc_loss": 0.04365157708525658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1825788280693814e-05, + "grad_norm": 28.906564712524414, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8730391263961792, + "num_tokens": 525240618.0, + "step": 13771 + }, + { + "epoch": 1.7519399567485054, + "ewc_loss": 0.04365212842822075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1826064767083153e-05, + "grad_norm": 28.891809463500977, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8814460039138794, + "num_tokens": 525274899.0, + "step": 13772 + }, + { + "epoch": 1.7520671670270958, + "ewc_loss": 0.04366171732544899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1830857804161496e-05, + "grad_norm": 28.864017486572266, + "learning_rate": 1e-06, + "loss": 0.5081, + "mean_token_accuracy": 0.8466846942901611, + "num_tokens": 525310144.0, + "step": 13773 + }, + { + "epoch": 1.7521943773056863, + "ewc_loss": 0.04366180673241615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1830903278896585e-05, + "grad_norm": 28.84999656677246, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8764632940292358, + "num_tokens": 525348565.0, + "step": 13774 + }, + { + "epoch": 1.7523215875842768, + "ewc_loss": 0.04366263747215271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1831318008480594e-05, + "grad_norm": 28.84748649597168, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8727031946182251, + "num_tokens": 525383709.0, + "step": 13775 + }, + { + "epoch": 1.7524487978628673, + "ewc_loss": 0.043663397431373596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1831698177265935e-05, + "grad_norm": 28.83831024169922, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8613404631614685, + "num_tokens": 525423983.0, + "step": 13776 + }, + { + "epoch": 1.7525760081414579, + "ewc_loss": 0.04364698752760887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1823494535055943e-05, + "grad_norm": 28.84023666381836, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8734074234962463, + "num_tokens": 525457357.0, + "step": 13777 + }, + { + "epoch": 1.7527032184200484, + "ewc_loss": 0.04366758093237877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1833790015080012e-05, + "grad_norm": 28.814035415649414, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8659200668334961, + "num_tokens": 525497021.0, + "step": 13778 + }, + { + "epoch": 1.7528304286986387, + "ewc_loss": 0.04377556964755058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1887784896534868e-05, + "grad_norm": 28.8657283782959, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8826701641082764, + "num_tokens": 525533592.0, + "step": 13779 + }, + { + "epoch": 1.7529576389772292, + "ewc_loss": 0.043632254004478455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1816127627971582e-05, + "grad_norm": 28.815916061401367, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.858895480632782, + "num_tokens": 525572747.0, + "step": 13780 + }, + { + "epoch": 1.7530848492558198, + "ewc_loss": 0.04372464865446091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1862324501853436e-05, + "grad_norm": 28.890731811523438, + "learning_rate": 1e-06, + "loss": 0.4154, + 
"mean_token_accuracy": 0.87351393699646, + "num_tokens": 525613578.0, + "step": 13781 + }, + { + "epoch": 1.7532120595344103, + "ewc_loss": 0.04373564571142197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1867823306820355e-05, + "grad_norm": 28.735143661499023, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8737191557884216, + "num_tokens": 525655047.0, + "step": 13782 + }, + { + "epoch": 1.7533392698130008, + "ewc_loss": 0.043792497366666794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1896248654229566e-05, + "grad_norm": 28.847543716430664, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8641893863677979, + "num_tokens": 525689561.0, + "step": 13783 + }, + { + "epoch": 1.7534664800915913, + "ewc_loss": 0.0437190905213356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1859545086044818e-05, + "grad_norm": 28.697298049926758, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8631731867790222, + "num_tokens": 525726946.0, + "step": 13784 + }, + { + "epoch": 1.7535936903701819, + "ewc_loss": 0.04377821087837219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1889105482841842e-05, + "grad_norm": 28.91201400756836, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8598164916038513, + "num_tokens": 525762089.0, + "step": 13785 + }, + { + "epoch": 1.7537209006487724, + "ewc_loss": 0.04383479803800583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1917399863013998e-05, + "grad_norm": 28.74641227722168, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.873223602771759, + "num_tokens": 525796081.0, + "step": 13786 + }, + { + "epoch": 1.753848110927363, + "ewc_loss": 0.04372056946158409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.186028541473206e-05, + "grad_norm": 28.72608184814453, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8568710088729858, + "num_tokens": 525829945.0, + "step": 13787 + }, + { + "epoch": 
1.7539753212059535, + "ewc_loss": 0.04383990541100502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1919951905147173e-05, + "grad_norm": 28.765491485595703, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8756610155105591, + "num_tokens": 525871289.0, + "step": 13788 + }, + { + "epoch": 1.754102531484544, + "ewc_loss": 0.04377056658267975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1885283786104992e-05, + "grad_norm": 28.719560623168945, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.868539035320282, + "num_tokens": 525908455.0, + "step": 13789 + }, + { + "epoch": 1.7542297417631345, + "ewc_loss": 0.043830353766679764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1915177057962865e-05, + "grad_norm": 28.792491912841797, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8877229690551758, + "num_tokens": 525943734.0, + "step": 13790 + }, + { + "epoch": 1.754356952041725, + "ewc_loss": 0.04387236014008522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193618092860561e-05, + "grad_norm": 28.693992614746094, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8897395133972168, + "num_tokens": 525988070.0, + "step": 13791 + }, + { + "epoch": 1.7544841623203156, + "ewc_loss": 0.043826743960380554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1913372620474547e-05, + "grad_norm": 28.778657913208008, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8822397589683533, + "num_tokens": 526027940.0, + "step": 13792 + }, + { + "epoch": 1.754611372598906, + "ewc_loss": 0.04390278831124306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1951394955976866e-05, + "grad_norm": 28.715885162353516, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8781019449234009, + "num_tokens": 526061357.0, + "step": 13793 + }, + { + "epoch": 1.7547385828774966, + "ewc_loss": 0.043827567249536514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.191378371207975e-05, + "grad_norm": 28.83576774597168, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.869365930557251, + "num_tokens": 526102157.0, + "step": 13794 + }, + { + "epoch": 1.7548657931560872, + "ewc_loss": 0.043882086873054504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1941043087281287e-05, + "grad_norm": 28.70962142944336, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.859050452709198, + "num_tokens": 526137241.0, + "step": 13795 + }, + { + "epoch": 1.7549930034346777, + "ewc_loss": 0.04376474767923355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.188237340305932e-05, + "grad_norm": 28.760889053344727, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8595136404037476, + "num_tokens": 526169079.0, + "step": 13796 + }, + { + "epoch": 1.755120213713268, + "ewc_loss": 0.04385678097605705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1928390196990222e-05, + "grad_norm": 28.773414611816406, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8726955652236938, + "num_tokens": 526212212.0, + "step": 13797 + }, + { + "epoch": 1.7552474239918585, + "ewc_loss": 0.043757639825344086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187881909776479e-05, + "grad_norm": 28.68421745300293, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.875372052192688, + "num_tokens": 526250380.0, + "step": 13798 + }, + { + "epoch": 1.755374634270449, + "ewc_loss": 0.043787840753793716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1893920347793028e-05, + "grad_norm": 28.802532196044922, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8743752837181091, + "num_tokens": 526292525.0, + "step": 13799 + }, + { + "epoch": 1.7555018445490396, + "ewc_loss": 0.04383261874318123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.191630846937187e-05, + "grad_norm": 28.65165901184082, + "learning_rate": 1e-06, + "loss": 0.4216, + 
"mean_token_accuracy": 0.8690780401229858, + "num_tokens": 526330427.0, + "step": 13800 + }, + { + "epoch": 1.75562905482763, + "ewc_loss": 0.04378360137343407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1891801225137897e-05, + "grad_norm": 28.832609176635742, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8695757985115051, + "num_tokens": 526362954.0, + "step": 13801 + }, + { + "epoch": 1.7557562651062206, + "ewc_loss": 0.043854255229234695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192712781834416e-05, + "grad_norm": 28.757741928100586, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8628108501434326, + "num_tokens": 526394310.0, + "step": 13802 + }, + { + "epoch": 1.755883475384811, + "ewc_loss": 0.043823376297950745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1911688236286864e-05, + "grad_norm": 28.741315841674805, + "learning_rate": 1e-06, + "loss": 0.4871, + "mean_token_accuracy": 0.847426176071167, + "num_tokens": 526438507.0, + "step": 13803 + }, + { + "epoch": 1.7560106856634015, + "ewc_loss": 0.04391332343220711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.195666093030013e-05, + "grad_norm": 28.82712745666504, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8642818927764893, + "num_tokens": 526475124.0, + "step": 13804 + }, + { + "epoch": 1.756137895941992, + "ewc_loss": 0.04388219863176346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1941099475952797e-05, + "grad_norm": 28.7961368560791, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8757156729698181, + "num_tokens": 526514936.0, + "step": 13805 + }, + { + "epoch": 1.7562651062205825, + "ewc_loss": 0.04378191754221916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1890959033044055e-05, + "grad_norm": 28.74725341796875, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.862385630607605, + "num_tokens": 526548649.0, + "step": 13806 + }, + { + "epoch": 
1.756392316499173, + "ewc_loss": 0.0437769889831543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.188849430240225e-05, + "grad_norm": 28.68540382385254, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8519268035888672, + "num_tokens": 526589028.0, + "step": 13807 + }, + { + "epoch": 1.7565195267777636, + "ewc_loss": 0.04389873892068863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1949368601781316e-05, + "grad_norm": 28.93202018737793, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8681200742721558, + "num_tokens": 526626327.0, + "step": 13808 + }, + { + "epoch": 1.756646737056354, + "ewc_loss": 0.04393886402249336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1969432054902427e-05, + "grad_norm": 28.81899642944336, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8641817569732666, + "num_tokens": 526672264.0, + "step": 13809 + }, + { + "epoch": 1.7567739473349446, + "ewc_loss": 0.04377847537398338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.18892382690683e-05, + "grad_norm": 28.811241149902344, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8617355823516846, + "num_tokens": 526709979.0, + "step": 13810 + }, + { + "epoch": 1.7569011576135352, + "ewc_loss": 0.04385831579566002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1929157810518518e-05, + "grad_norm": 28.77105140686035, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.879677414894104, + "num_tokens": 526745002.0, + "step": 13811 + }, + { + "epoch": 1.7570283678921257, + "ewc_loss": 0.04382549598813057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.191274870710913e-05, + "grad_norm": 28.777164459228516, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8828583359718323, + "num_tokens": 526785684.0, + "step": 13812 + }, + { + "epoch": 1.7571555781707162, + "ewc_loss": 0.04381684958934784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1908424969296902e-05, + "grad_norm": 28.741310119628906, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8854885101318359, + "num_tokens": 526820246.0, + "step": 13813 + }, + { + "epoch": 1.7572827884493067, + "ewc_loss": 0.04384101554751396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1920508515904658e-05, + "grad_norm": 28.79874610900879, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8715775609016418, + "num_tokens": 526854815.0, + "step": 13814 + }, + { + "epoch": 1.7574099987278973, + "ewc_loss": 0.04382839426398277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1914196622674353e-05, + "grad_norm": 28.7138729095459, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8731076717376709, + "num_tokens": 526896336.0, + "step": 13815 + }, + { + "epoch": 1.7575372090064878, + "ewc_loss": 0.04388047754764557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.194023909396492e-05, + "grad_norm": 28.84012794494629, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8793286085128784, + "num_tokens": 526939586.0, + "step": 13816 + }, + { + "epoch": 1.7576644192850783, + "ewc_loss": 0.04391149431467056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.195574779761955e-05, + "grad_norm": 28.75944709777832, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8726012110710144, + "num_tokens": 526975027.0, + "step": 13817 + }, + { + "epoch": 1.7577916295636689, + "ewc_loss": 0.04378129169344902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1890646166866645e-05, + "grad_norm": 28.75602149963379, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8699080944061279, + "num_tokens": 527017820.0, + "step": 13818 + }, + { + "epoch": 1.7579188398422594, + "ewc_loss": 0.0438297763466835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19148878386477e-05, + "grad_norm": 28.80701446533203, + "learning_rate": 1e-06, + "loss": 0.3993, + 
"mean_token_accuracy": 0.8789849281311035, + "num_tokens": 527051647.0, + "step": 13819 + }, + { + "epoch": 1.75804605012085, + "ewc_loss": 0.043873902410268784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193695036112331e-05, + "grad_norm": 28.936342239379883, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8734410405158997, + "num_tokens": 527089590.0, + "step": 13820 + }, + { + "epoch": 1.7581732603994404, + "ewc_loss": 0.04385164752602577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192582360294182e-05, + "grad_norm": 28.842798233032227, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8650151491165161, + "num_tokens": 527126696.0, + "step": 13821 + }, + { + "epoch": 1.7583004706780307, + "ewc_loss": 0.04378388449549675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1891943106311373e-05, + "grad_norm": 28.804218292236328, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8593150973320007, + "num_tokens": 527164823.0, + "step": 13822 + }, + { + "epoch": 1.7584276809566213, + "ewc_loss": 0.04379964992403984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1899824787396938e-05, + "grad_norm": 28.863914489746094, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8502368330955505, + "num_tokens": 527208032.0, + "step": 13823 + }, + { + "epoch": 1.7585548912352118, + "ewc_loss": 0.04382362216711044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1911811927566305e-05, + "grad_norm": 28.850433349609375, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8696736693382263, + "num_tokens": 527242773.0, + "step": 13824 + }, + { + "epoch": 1.7586821015138023, + "ewc_loss": 0.043804414570331573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1902207663515583e-05, + "grad_norm": 28.83349609375, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8713327646255493, + "num_tokens": 527277968.0, + "step": 13825 + }, + { + "epoch": 
1.7588093117923929, + "ewc_loss": 0.04386196658015251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1930984075879678e-05, + "grad_norm": 28.941394805908203, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.881555438041687, + "num_tokens": 527311077.0, + "step": 13826 + }, + { + "epoch": 1.7589365220709834, + "ewc_loss": 0.04381340369582176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1906702386331744e-05, + "grad_norm": 28.854698181152344, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8741812705993652, + "num_tokens": 527340543.0, + "step": 13827 + }, + { + "epoch": 1.7590637323495737, + "ewc_loss": 0.04379372298717499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.189686165365856e-05, + "grad_norm": 28.823469161987305, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8663285970687866, + "num_tokens": 527379596.0, + "step": 13828 + }, + { + "epoch": 1.7591909426281642, + "ewc_loss": 0.04387597367167473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193798718508333e-05, + "grad_norm": 28.882734298706055, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.867621660232544, + "num_tokens": 527417873.0, + "step": 13829 + }, + { + "epoch": 1.7593181529067548, + "ewc_loss": 0.043799709528684616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1899853891227394e-05, + "grad_norm": 28.881378173828125, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8698575496673584, + "num_tokens": 527458607.0, + "step": 13830 + }, + { + "epoch": 1.7594453631853453, + "ewc_loss": 0.04384278133511543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1921390725765377e-05, + "grad_norm": 28.898067474365234, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8609669804573059, + "num_tokens": 527493299.0, + "step": 13831 + }, + { + "epoch": 1.7595725734639358, + "ewc_loss": 0.04373934492468834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.186967321904376e-05, + "grad_norm": 28.826650619506836, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8550124764442444, + "num_tokens": 527536065.0, + "step": 13832 + }, + { + "epoch": 1.7596997837425263, + "ewc_loss": 0.043826255947351456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.191312705690507e-05, + "grad_norm": 28.85564613342285, + "learning_rate": 1e-06, + "loss": 0.4756, + "mean_token_accuracy": 0.852388858795166, + "num_tokens": 527576477.0, + "step": 13833 + }, + { + "epoch": 1.7598269940211169, + "ewc_loss": 0.04379243776202202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.189621955039911e-05, + "grad_norm": 28.9182186126709, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8651468753814697, + "num_tokens": 527611777.0, + "step": 13834 + }, + { + "epoch": 1.7599542042997074, + "ewc_loss": 0.043817926198244095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1908963390160352e-05, + "grad_norm": 28.920679092407227, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8850095272064209, + "num_tokens": 527650236.0, + "step": 13835 + }, + { + "epoch": 1.760081414578298, + "ewc_loss": 0.04373579099774361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1867896066396497e-05, + "grad_norm": 28.782072067260742, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8743139505386353, + "num_tokens": 527685486.0, + "step": 13836 + }, + { + "epoch": 1.7602086248568884, + "ewc_loss": 0.04382937029004097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1914685930823907e-05, + "grad_norm": 28.933561325073242, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8552056550979614, + "num_tokens": 527725386.0, + "step": 13837 + }, + { + "epoch": 1.760335835135479, + "ewc_loss": 0.04381284862756729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1906424080953002e-05, + "grad_norm": 28.91509437561035, + "learning_rate": 1e-06, + "loss": 0.465, + 
"mean_token_accuracy": 0.8552120923995972, + "num_tokens": 527756213.0, + "step": 13838 + }, + { + "epoch": 1.7604630454140695, + "ewc_loss": 0.043685123324394226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.184256118198391e-05, + "grad_norm": 28.792207717895508, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8715964555740356, + "num_tokens": 527793884.0, + "step": 13839 + }, + { + "epoch": 1.76059025569266, + "ewc_loss": 0.04380134493112564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.190067243645899e-05, + "grad_norm": 28.928003311157227, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8655322790145874, + "num_tokens": 527826428.0, + "step": 13840 + }, + { + "epoch": 1.7607174659712506, + "ewc_loss": 0.043773628771305084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1886813556193374e-05, + "grad_norm": 28.78908348083496, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.859728991985321, + "num_tokens": 527866691.0, + "step": 13841 + }, + { + "epoch": 1.760844676249841, + "ewc_loss": 0.043789248913526535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.18946242966922e-05, + "grad_norm": 28.892728805541992, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.876609206199646, + "num_tokens": 527909482.0, + "step": 13842 + }, + { + "epoch": 1.7609718865284316, + "ewc_loss": 0.04381907358765602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.190953637182247e-05, + "grad_norm": 28.912261962890625, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.856559157371521, + "num_tokens": 527947818.0, + "step": 13843 + }, + { + "epoch": 1.7610990968070221, + "ewc_loss": 0.043831873685121536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1915937395533547e-05, + "grad_norm": 28.861003875732422, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8715812563896179, + "num_tokens": 527986760.0, + "step": 13844 + }, + { + "epoch": 
1.7612263070856127, + "ewc_loss": 0.04374709352850914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1873547666473314e-05, + "grad_norm": 28.83877944946289, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8607919812202454, + "num_tokens": 528024742.0, + "step": 13845 + }, + { + "epoch": 1.761353517364203, + "ewc_loss": 0.04376614838838577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1883073713979684e-05, + "grad_norm": 28.82052230834961, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8538240790367126, + "num_tokens": 528060781.0, + "step": 13846 + }, + { + "epoch": 1.7614807276427935, + "ewc_loss": 0.04375655949115753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187828067690134e-05, + "grad_norm": 28.886327743530273, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8698771595954895, + "num_tokens": 528099051.0, + "step": 13847 + }, + { + "epoch": 1.761607937921384, + "ewc_loss": 0.04376831650733948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1884157831664197e-05, + "grad_norm": 28.80776023864746, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8609476685523987, + "num_tokens": 528138325.0, + "step": 13848 + }, + { + "epoch": 1.7617351481999746, + "ewc_loss": 0.0437900647521019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1895031750318594e-05, + "grad_norm": 28.88119888305664, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8587374687194824, + "num_tokens": 528177997.0, + "step": 13849 + }, + { + "epoch": 1.761862358478565, + "ewc_loss": 0.04382697865366936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1913489035796374e-05, + "grad_norm": 28.77492332458496, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8642278909683228, + "num_tokens": 528216360.0, + "step": 13850 + }, + { + "epoch": 1.7619895687571556, + "ewc_loss": 0.04379039630293846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1895197278354317e-05, + "grad_norm": 28.873260498046875, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8837041854858398, + "num_tokens": 528251100.0, + "step": 13851 + }, + { + "epoch": 1.762116779035746, + "ewc_loss": 0.04387207701802254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1936039047432132e-05, + "grad_norm": 28.9606990814209, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8718591928482056, + "num_tokens": 528286298.0, + "step": 13852 + }, + { + "epoch": 1.7622439893143365, + "ewc_loss": 0.04376724734902382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1883623048779555e-05, + "grad_norm": 28.73914337158203, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8837677836418152, + "num_tokens": 528322970.0, + "step": 13853 + }, + { + "epoch": 1.762371199592927, + "ewc_loss": 0.04384605213999748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1923025997239165e-05, + "grad_norm": 28.912199020385742, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8849709033966064, + "num_tokens": 528355191.0, + "step": 13854 + }, + { + "epoch": 1.7624984098715175, + "ewc_loss": 0.04380955919623375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1904779714532197e-05, + "grad_norm": 28.860671997070312, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8681682348251343, + "num_tokens": 528388975.0, + "step": 13855 + }, + { + "epoch": 1.762625620150108, + "ewc_loss": 0.043772924691438675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.188646249123849e-05, + "grad_norm": 28.89887237548828, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8729562759399414, + "num_tokens": 528432139.0, + "step": 13856 + }, + { + "epoch": 1.7627528304286986, + "ewc_loss": 0.04380848631262779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.190424311265815e-05, + "grad_norm": 28.86705207824707, + "learning_rate": 1e-06, + "loss": 0.3979, + 
"mean_token_accuracy": 0.8771535158157349, + "num_tokens": 528468315.0, + "step": 13857 + }, + { + "epoch": 1.762880040707289, + "ewc_loss": 0.043772246688604355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.188612415920943e-05, + "grad_norm": 28.834203720092773, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8568743467330933, + "num_tokens": 528510368.0, + "step": 13858 + }, + { + "epoch": 1.7630072509858796, + "ewc_loss": 0.04376346990466118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1881734937778674e-05, + "grad_norm": 28.833650588989258, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8707376718521118, + "num_tokens": 528545637.0, + "step": 13859 + }, + { + "epoch": 1.7631344612644702, + "ewc_loss": 0.0437653511762619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1882675355300307e-05, + "grad_norm": 28.840576171875, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8729224801063538, + "num_tokens": 528586267.0, + "step": 13860 + }, + { + "epoch": 1.7632616715430607, + "ewc_loss": 0.04377356544137001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1886782633373514e-05, + "grad_norm": 28.908527374267578, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8727400898933411, + "num_tokens": 528629640.0, + "step": 13861 + }, + { + "epoch": 1.7633888818216512, + "ewc_loss": 0.04378516972064972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1892585209570825e-05, + "grad_norm": 28.774919509887695, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8544694185256958, + "num_tokens": 528665373.0, + "step": 13862 + }, + { + "epoch": 1.7635160921002417, + "ewc_loss": 0.04376969113945961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1884845409658737e-05, + "grad_norm": 28.82284164428711, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8820945024490356, + "num_tokens": 528703426.0, + "step": 13863 + }, + { + "epoch": 
1.7636433023788323, + "ewc_loss": 0.043787628412246704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1893814846407622e-05, + "grad_norm": 28.80852699279785, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8500726222991943, + "num_tokens": 528744886.0, + "step": 13864 + }, + { + "epoch": 1.7637705126574228, + "ewc_loss": 0.04384545981884003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1922729501966387e-05, + "grad_norm": 28.887651443481445, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8807245492935181, + "num_tokens": 528779773.0, + "step": 13865 + }, + { + "epoch": 1.7638977229360133, + "ewc_loss": 0.043847452849149704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192372630815953e-05, + "grad_norm": 28.851842880249023, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8714039921760559, + "num_tokens": 528821878.0, + "step": 13866 + }, + { + "epoch": 1.7640249332146039, + "ewc_loss": 0.04384588822722435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1922944142716005e-05, + "grad_norm": 28.821369171142578, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8689535856246948, + "num_tokens": 528858056.0, + "step": 13867 + }, + { + "epoch": 1.7641521434931944, + "ewc_loss": 0.04383467882871628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.191733983636368e-05, + "grad_norm": 28.9021053314209, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8652713298797607, + "num_tokens": 528903116.0, + "step": 13868 + }, + { + "epoch": 1.764279353771785, + "ewc_loss": 0.0438704788684845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1935238692094572e-05, + "grad_norm": 28.845054626464844, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8614344000816345, + "num_tokens": 528941561.0, + "step": 13869 + }, + { + "epoch": 1.7644065640503754, + "ewc_loss": 0.04386577382683754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1932886738795787e-05, + "grad_norm": 28.97625160217285, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8849740028381348, + "num_tokens": 528976215.0, + "step": 13870 + }, + { + "epoch": 1.7645337743289657, + "ewc_loss": 0.04389010742306709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1945053958916105e-05, + "grad_norm": 28.819189071655273, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8747455477714539, + "num_tokens": 529019280.0, + "step": 13871 + }, + { + "epoch": 1.7646609846075563, + "ewc_loss": 0.043793123215436935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1896561520406976e-05, + "grad_norm": 28.881357192993164, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.872211217880249, + "num_tokens": 529060733.0, + "step": 13872 + }, + { + "epoch": 1.7647881948861468, + "ewc_loss": 0.04383813962340355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1919069695286453e-05, + "grad_norm": 28.95728874206543, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8704386949539185, + "num_tokens": 529094948.0, + "step": 13873 + }, + { + "epoch": 1.7649154051647373, + "ewc_loss": 0.043756525963544846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1878262487007305e-05, + "grad_norm": 28.84073257446289, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.879776120185852, + "num_tokens": 529131374.0, + "step": 13874 + }, + { + "epoch": 1.7650426154433279, + "ewc_loss": 0.043816693127155304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.190834675275255e-05, + "grad_norm": 28.90120506286621, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8648459315299988, + "num_tokens": 529169133.0, + "step": 13875 + }, + { + "epoch": 1.7651698257219184, + "ewc_loss": 0.04383721202611923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.191860585298855e-05, + "grad_norm": 28.81023597717285, + "learning_rate": 1e-06, + "loss": 0.4435, + 
"mean_token_accuracy": 0.8665659427642822, + "num_tokens": 529206467.0, + "step": 13876 + }, + { + "epoch": 1.7652970360005087, + "ewc_loss": 0.043815113604068756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.190755731135141e-05, + "grad_norm": 28.826568603515625, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.858917236328125, + "num_tokens": 529244891.0, + "step": 13877 + }, + { + "epoch": 1.7654242462790992, + "ewc_loss": 0.04388979449868202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19448975258274e-05, + "grad_norm": 28.894428253173828, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8708004951477051, + "num_tokens": 529277384.0, + "step": 13878 + }, + { + "epoch": 1.7655514565576897, + "ewc_loss": 0.04380374774336815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1901874788454734e-05, + "grad_norm": 28.844423294067383, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.871924877166748, + "num_tokens": 529322530.0, + "step": 13879 + }, + { + "epoch": 1.7656786668362803, + "ewc_loss": 0.043852657079696655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1926329281996004e-05, + "grad_norm": 28.804950714111328, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8628843426704407, + "num_tokens": 529356345.0, + "step": 13880 + }, + { + "epoch": 1.7658058771148708, + "ewc_loss": 0.04381635785102844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1908179405727424e-05, + "grad_norm": 28.841989517211914, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8703241348266602, + "num_tokens": 529391877.0, + "step": 13881 + }, + { + "epoch": 1.7659330873934613, + "ewc_loss": 0.043810151517391205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1905076209804974e-05, + "grad_norm": 28.78462028503418, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8760241270065308, + "num_tokens": 529425460.0, + "step": 13882 + }, + { + "epoch": 
1.7660602976720519, + "ewc_loss": 0.04392107576131821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1960537196719088e-05, + "grad_norm": 28.913148880004883, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8590736389160156, + "num_tokens": 529459660.0, + "step": 13883 + }, + { + "epoch": 1.7661875079506424, + "ewc_loss": 0.04388605058193207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1943025785731152e-05, + "grad_norm": 28.83306121826172, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8603537082672119, + "num_tokens": 529501839.0, + "step": 13884 + }, + { + "epoch": 1.766314718229233, + "ewc_loss": 0.04385265335440636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19263274630066e-05, + "grad_norm": 28.834077835083008, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8665092587471008, + "num_tokens": 529540828.0, + "step": 13885 + }, + { + "epoch": 1.7664419285078234, + "ewc_loss": 0.043867964297533035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1933981770416722e-05, + "grad_norm": 28.97027587890625, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8681932687759399, + "num_tokens": 529581578.0, + "step": 13886 + }, + { + "epoch": 1.766569138786414, + "ewc_loss": 0.043835390359163284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1917694539297372e-05, + "grad_norm": 28.82534408569336, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8683483004570007, + "num_tokens": 529624363.0, + "step": 13887 + }, + { + "epoch": 1.7666963490650045, + "ewc_loss": 0.043867021799087524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1933510652161203e-05, + "grad_norm": 28.91624641418457, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8751445412635803, + "num_tokens": 529665286.0, + "step": 13888 + }, + { + "epoch": 1.766823559343595, + "ewc_loss": 0.04392813891172409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1964069674140774e-05, + "grad_norm": 28.844688415527344, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8636407256126404, + "num_tokens": 529704150.0, + "step": 13889 + }, + { + "epoch": 1.7669507696221856, + "ewc_loss": 0.04384779930114746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1923899112152867e-05, + "grad_norm": 29.021940231323242, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.870573103427887, + "num_tokens": 529742110.0, + "step": 13890 + }, + { + "epoch": 1.767077979900776, + "ewc_loss": 0.043893784284591675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.194689295720309e-05, + "grad_norm": 28.89472770690918, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8675251007080078, + "num_tokens": 529776158.0, + "step": 13891 + }, + { + "epoch": 1.7672051901793666, + "ewc_loss": 0.04385003075003624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1925015971646644e-05, + "grad_norm": 28.976909637451172, + "learning_rate": 1e-06, + "loss": 0.4904, + "mean_token_accuracy": 0.8465261459350586, + "num_tokens": 529814002.0, + "step": 13892 + }, + { + "epoch": 1.7673324004579571, + "ewc_loss": 0.04380413517355919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.190206760133151e-05, + "grad_norm": 28.779693603515625, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8712847828865051, + "num_tokens": 529846696.0, + "step": 13893 + }, + { + "epoch": 1.7674596107365477, + "ewc_loss": 0.04381886497139931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1909432689426467e-05, + "grad_norm": 28.910451889038086, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8651459217071533, + "num_tokens": 529881557.0, + "step": 13894 + }, + { + "epoch": 1.767586821015138, + "ewc_loss": 0.043912433087825775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1956217096885666e-05, + "grad_norm": 28.938501358032227, + "learning_rate": 1e-06, + "loss": 0.4753, + 
"mean_token_accuracy": 0.8557031154632568, + "num_tokens": 529924337.0, + "step": 13895 + }, + { + "epoch": 1.7677140312937285, + "ewc_loss": 0.04381123185157776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1905616449657828e-05, + "grad_norm": 28.93745231628418, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8669849634170532, + "num_tokens": 529967195.0, + "step": 13896 + }, + { + "epoch": 1.767841241572319, + "ewc_loss": 0.04376531019806862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1882655346416868e-05, + "grad_norm": 28.824081420898438, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.8500535488128662, + "num_tokens": 530003454.0, + "step": 13897 + }, + { + "epoch": 1.7679684518509096, + "ewc_loss": 0.043808307498693466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1904153982177377e-05, + "grad_norm": 28.902610778808594, + "learning_rate": 1e-06, + "loss": 0.5058, + "mean_token_accuracy": 0.8422739505767822, + "num_tokens": 530042577.0, + "step": 13898 + }, + { + "epoch": 1.7680956621295, + "ewc_loss": 0.043890614062547684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1945306798443198e-05, + "grad_norm": 28.92863655090332, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8535099625587463, + "num_tokens": 530084903.0, + "step": 13899 + }, + { + "epoch": 1.7682228724080906, + "ewc_loss": 0.04372158646583557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.186079291277565e-05, + "grad_norm": 28.903156280517578, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.864071249961853, + "num_tokens": 530117598.0, + "step": 13900 + }, + { + "epoch": 1.768350082686681, + "ewc_loss": 0.04388958215713501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.194479020545259e-05, + "grad_norm": 28.943344116210938, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8705487847328186, + "num_tokens": 530151891.0, + "step": 13901 + }, + { + "epoch": 
1.7684772929652715, + "ewc_loss": 0.04371068999171257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.185534503951203e-05, + "grad_norm": 28.790096282958984, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8583279252052307, + "num_tokens": 530191263.0, + "step": 13902 + }, + { + "epoch": 1.768604503243862, + "ewc_loss": 0.043862614780664444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193130785599351e-05, + "grad_norm": 28.932966232299805, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8633689880371094, + "num_tokens": 530231156.0, + "step": 13903 + }, + { + "epoch": 1.7687317135224525, + "ewc_loss": 0.043843772262334824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1921885490883142e-05, + "grad_norm": 28.79751205444336, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8808021545410156, + "num_tokens": 530270089.0, + "step": 13904 + }, + { + "epoch": 1.768858923801043, + "ewc_loss": 0.043896373361349106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.194818625866901e-05, + "grad_norm": 28.973432540893555, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8706397414207458, + "num_tokens": 530309808.0, + "step": 13905 + }, + { + "epoch": 1.7689861340796336, + "ewc_loss": 0.043837614357471466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1918807760812342e-05, + "grad_norm": 28.965728759765625, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8560647368431091, + "num_tokens": 530357723.0, + "step": 13906 + }, + { + "epoch": 1.769113344358224, + "ewc_loss": 0.04384484887123108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192242391174659e-05, + "grad_norm": 28.877355575561523, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8550904989242554, + "num_tokens": 530389586.0, + "step": 13907 + }, + { + "epoch": 1.7692405546368146, + "ewc_loss": 0.04371905326843262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1859526896150783e-05, + "grad_norm": 28.836761474609375, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8568332195281982, + "num_tokens": 530430356.0, + "step": 13908 + }, + { + "epoch": 1.7693677649154052, + "ewc_loss": 0.043900541961193085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1950270820525475e-05, + "grad_norm": 28.9299259185791, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8718627691268921, + "num_tokens": 530472859.0, + "step": 13909 + }, + { + "epoch": 1.7694949751939957, + "ewc_loss": 0.0438382662832737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1919133359915577e-05, + "grad_norm": 29.017871856689453, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.853839635848999, + "num_tokens": 530515422.0, + "step": 13910 + }, + { + "epoch": 1.7696221854725862, + "ewc_loss": 0.043837178498506546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1918589482083917e-05, + "grad_norm": 28.889846801757812, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8652265667915344, + "num_tokens": 530553842.0, + "step": 13911 + }, + { + "epoch": 1.7697493957511767, + "ewc_loss": 0.043767672032117844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.188383587053977e-05, + "grad_norm": 28.848934173583984, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8661091327667236, + "num_tokens": 530592593.0, + "step": 13912 + }, + { + "epoch": 1.7698766060297673, + "ewc_loss": 0.04378229007124901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1891144569963217e-05, + "grad_norm": 28.89472770690918, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.861565887928009, + "num_tokens": 530632309.0, + "step": 13913 + }, + { + "epoch": 1.7700038163083578, + "ewc_loss": 0.043860048055648804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1930023649474606e-05, + "grad_norm": 28.885128021240234, + "learning_rate": 1e-06, + "loss": 0.4419, + 
"mean_token_accuracy": 0.8617005348205566, + "num_tokens": 530667415.0, + "step": 13914 + }, + { + "epoch": 1.7701310265869483, + "ewc_loss": 0.0437900684773922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1895033569307998e-05, + "grad_norm": 28.94467544555664, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8639319539070129, + "num_tokens": 530705736.0, + "step": 13915 + }, + { + "epoch": 1.7702582368655388, + "ewc_loss": 0.04379766434431076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.18988316191826e-05, + "grad_norm": 28.99744415283203, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8755728006362915, + "num_tokens": 530744931.0, + "step": 13916 + }, + { + "epoch": 1.7703854471441294, + "ewc_loss": 0.04376767948269844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1883839508518577e-05, + "grad_norm": 28.8701114654541, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8783930540084839, + "num_tokens": 530784700.0, + "step": 13917 + }, + { + "epoch": 1.77051265742272, + "ewc_loss": 0.043768543750047684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.188427242799662e-05, + "grad_norm": 29.014892578125, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8828780055046082, + "num_tokens": 530821097.0, + "step": 13918 + }, + { + "epoch": 1.7706398677013102, + "ewc_loss": 0.04379589110612869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1897945771343075e-05, + "grad_norm": 28.827951431274414, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8607547283172607, + "num_tokens": 530860444.0, + "step": 13919 + }, + { + "epoch": 1.7707670779799007, + "ewc_loss": 0.043676555156707764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.183827746193856e-05, + "grad_norm": 28.9210205078125, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8644252419471741, + "num_tokens": 530907304.0, + "step": 13920 + }, + { + "epoch": 
1.7708942882584913, + "ewc_loss": 0.043879635632038116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1939817088423297e-05, + "grad_norm": 28.993379592895508, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8589726686477661, + "num_tokens": 530945692.0, + "step": 13921 + }, + { + "epoch": 1.7710214985370818, + "ewc_loss": 0.043736618012189865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1868308976991102e-05, + "grad_norm": 28.898963928222656, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8767917156219482, + "num_tokens": 530986695.0, + "step": 13922 + }, + { + "epoch": 1.7711487088156723, + "ewc_loss": 0.043816473335027695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1908235794398934e-05, + "grad_norm": 28.972761154174805, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8779365420341492, + "num_tokens": 531022656.0, + "step": 13923 + }, + { + "epoch": 1.7712759190942629, + "ewc_loss": 0.04375375062227249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.18768745980924e-05, + "grad_norm": 28.93478775024414, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8755806684494019, + "num_tokens": 531056896.0, + "step": 13924 + }, + { + "epoch": 1.7714031293728534, + "ewc_loss": 0.04376007616519928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1880037820665166e-05, + "grad_norm": 28.926651000976562, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8399480581283569, + "num_tokens": 531097286.0, + "step": 13925 + }, + { + "epoch": 1.7715303396514437, + "ewc_loss": 0.04374094679951668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187047357438132e-05, + "grad_norm": 28.90959930419922, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8751431107521057, + "num_tokens": 531136430.0, + "step": 13926 + }, + { + "epoch": 1.7716575499300342, + "ewc_loss": 0.04378785938024521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1893929442740045e-05, + "grad_norm": 28.894472122192383, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8595881462097168, + "num_tokens": 531176698.0, + "step": 13927 + }, + { + "epoch": 1.7717847602086247, + "ewc_loss": 0.043737057596445084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.186852907470893e-05, + "grad_norm": 28.924957275390625, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.865153431892395, + "num_tokens": 531214343.0, + "step": 13928 + }, + { + "epoch": 1.7719119704872153, + "ewc_loss": 0.043721701949834824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1860851120436564e-05, + "grad_norm": 28.795717239379883, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8791057467460632, + "num_tokens": 531252325.0, + "step": 13929 + }, + { + "epoch": 1.7720391807658058, + "ewc_loss": 0.04378270357847214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.189135193475522e-05, + "grad_norm": 28.91512680053711, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8550345301628113, + "num_tokens": 531290378.0, + "step": 13930 + }, + { + "epoch": 1.7721663910443963, + "ewc_loss": 0.04384564608335495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1922822270425968e-05, + "grad_norm": 28.84132194519043, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8579089641571045, + "num_tokens": 531330406.0, + "step": 13931 + }, + { + "epoch": 1.7722936013229869, + "ewc_loss": 0.04389405623078346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1947027562418953e-05, + "grad_norm": 28.89919662475586, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8661760687828064, + "num_tokens": 531367425.0, + "step": 13932 + }, + { + "epoch": 1.7724208116015774, + "ewc_loss": 0.04380638897418976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1903195374761708e-05, + "grad_norm": 28.825651168823242, + "learning_rate": 1e-06, + "loss": 0.3718, + 
"mean_token_accuracy": 0.8875026702880859, + "num_tokens": 531406465.0, + "step": 13933 + }, + { + "epoch": 1.772548021880168, + "ewc_loss": 0.04383973404765129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1919866412645206e-05, + "grad_norm": 28.97223663330078, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8558844327926636, + "num_tokens": 531445356.0, + "step": 13934 + }, + { + "epoch": 1.7726752321587584, + "ewc_loss": 0.04387880861759186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1939404177828692e-05, + "grad_norm": 28.88874626159668, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8642799854278564, + "num_tokens": 531484737.0, + "step": 13935 + }, + { + "epoch": 1.772802442437349, + "ewc_loss": 0.043781377375125885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1890688003622927e-05, + "grad_norm": 28.988496780395508, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8632895946502686, + "num_tokens": 531523453.0, + "step": 13936 + }, + { + "epoch": 1.7729296527159395, + "ewc_loss": 0.04385482892394066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192741521866992e-05, + "grad_norm": 28.95475196838379, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8736482858657837, + "num_tokens": 531559627.0, + "step": 13937 + }, + { + "epoch": 1.77305686299453, + "ewc_loss": 0.043749358505010605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187467907788232e-05, + "grad_norm": 28.818754196166992, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8718632459640503, + "num_tokens": 531597677.0, + "step": 13938 + }, + { + "epoch": 1.7731840732731206, + "ewc_loss": 0.04382352903485298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1911764633841813e-05, + "grad_norm": 28.990421295166016, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.875349760055542, + "num_tokens": 531642753.0, + "step": 13939 + }, + { + "epoch": 
1.773311283551711, + "ewc_loss": 0.0438130758702755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1906538677285425e-05, + "grad_norm": 28.905336380004883, + "learning_rate": 1e-06, + "loss": 0.459, + "mean_token_accuracy": 0.8595575094223022, + "num_tokens": 531688249.0, + "step": 13940 + }, + { + "epoch": 1.7734384938303016, + "ewc_loss": 0.04381439834833145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1907198970438913e-05, + "grad_norm": 28.956539154052734, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8804333806037903, + "num_tokens": 531727994.0, + "step": 13941 + }, + { + "epoch": 1.7735657041088921, + "ewc_loss": 0.04380717873573303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1903590095462278e-05, + "grad_norm": 28.856409072875977, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8706552386283875, + "num_tokens": 531764428.0, + "step": 13942 + }, + { + "epoch": 1.7736929143874827, + "ewc_loss": 0.043799277395009995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1899639250477776e-05, + "grad_norm": 28.989421844482422, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8624590635299683, + "num_tokens": 531806604.0, + "step": 13943 + }, + { + "epoch": 1.773820124666073, + "ewc_loss": 0.0438607782125473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193038926634472e-05, + "grad_norm": 28.851099014282227, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8846875429153442, + "num_tokens": 531845504.0, + "step": 13944 + }, + { + "epoch": 1.7739473349446635, + "ewc_loss": 0.04376068338751793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1880341591895558e-05, + "grad_norm": 28.943584442138672, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8758561611175537, + "num_tokens": 531880962.0, + "step": 13945 + }, + { + "epoch": 1.774074545223254, + "ewc_loss": 0.04379238188266754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1896190446568653e-05, + "grad_norm": 28.87371826171875, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8652832508087158, + "num_tokens": 531918182.0, + "step": 13946 + }, + { + "epoch": 1.7742017555018446, + "ewc_loss": 0.04376710206270218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1883550289203413e-05, + "grad_norm": 29.02825164794922, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8685797452926636, + "num_tokens": 531958340.0, + "step": 13947 + }, + { + "epoch": 1.774328965780435, + "ewc_loss": 0.04382918402552605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1914591343374923e-05, + "grad_norm": 28.878826141357422, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8549097180366516, + "num_tokens": 531996951.0, + "step": 13948 + }, + { + "epoch": 1.7744561760590256, + "ewc_loss": 0.04375123605132103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187561767641455e-05, + "grad_norm": 28.909603118896484, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.871337890625, + "num_tokens": 532036963.0, + "step": 13949 + }, + { + "epoch": 1.774583386337616, + "ewc_loss": 0.043855197727680206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192759893659968e-05, + "grad_norm": 29.052745819091797, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8615847826004028, + "num_tokens": 532070937.0, + "step": 13950 + }, + { + "epoch": 1.7747105966162064, + "ewc_loss": 0.04374798387289047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187399149988778e-05, + "grad_norm": 28.843647003173828, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8535217642784119, + "num_tokens": 532107866.0, + "step": 13951 + }, + { + "epoch": 1.774837806894797, + "ewc_loss": 0.043728917837142944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1864458176423796e-05, + "grad_norm": 28.885530471801758, + "learning_rate": 1e-06, + "loss": 0.4561, + 
"mean_token_accuracy": 0.8603852987289429, + "num_tokens": 532151485.0, + "step": 13952 + }, + { + "epoch": 1.7749650171733875, + "ewc_loss": 0.043842289596796036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19211451621959e-05, + "grad_norm": 28.94089698791504, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8732098340988159, + "num_tokens": 532191209.0, + "step": 13953 + }, + { + "epoch": 1.775092227451978, + "ewc_loss": 0.043791525065898895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.189576298405882e-05, + "grad_norm": 29.002546310424805, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8619458675384521, + "num_tokens": 532232219.0, + "step": 13954 + }, + { + "epoch": 1.7752194377305686, + "ewc_loss": 0.04378778487443924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1893893062951975e-05, + "grad_norm": 29.02950668334961, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8542430400848389, + "num_tokens": 532265330.0, + "step": 13955 + }, + { + "epoch": 1.775346648009159, + "ewc_loss": 0.043736837804317474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1868418116355315e-05, + "grad_norm": 28.907752990722656, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8652974367141724, + "num_tokens": 532295964.0, + "step": 13956 + }, + { + "epoch": 1.7754738582877496, + "ewc_loss": 0.04380141571164131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1900706997257657e-05, + "grad_norm": 28.94829559326172, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.870286226272583, + "num_tokens": 532333804.0, + "step": 13957 + }, + { + "epoch": 1.7756010685663401, + "ewc_loss": 0.04388919845223427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.194459921156522e-05, + "grad_norm": 29.062236785888672, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8721750378608704, + "num_tokens": 532373088.0, + "step": 13958 + }, + { + "epoch": 
1.7757282788449307, + "ewc_loss": 0.0437663234770298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1883161025471054e-05, + "grad_norm": 28.837139129638672, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8654710650444031, + "num_tokens": 532409473.0, + "step": 13959 + }, + { + "epoch": 1.7758554891235212, + "ewc_loss": 0.043725691735744476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1862846551812254e-05, + "grad_norm": 28.934261322021484, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8780274391174316, + "num_tokens": 532447883.0, + "step": 13960 + }, + { + "epoch": 1.7759826994021117, + "ewc_loss": 0.04377002641558647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1885012756683864e-05, + "grad_norm": 28.766891479492188, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8714122772216797, + "num_tokens": 532485783.0, + "step": 13961 + }, + { + "epoch": 1.7761099096807023, + "ewc_loss": 0.04380985349416733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1904927052673884e-05, + "grad_norm": 29.06551742553711, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.873358964920044, + "num_tokens": 532511082.0, + "step": 13962 + }, + { + "epoch": 1.7762371199592928, + "ewc_loss": 0.04389861971139908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1949310394120403e-05, + "grad_norm": 28.821857452392578, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8836023807525635, + "num_tokens": 532555516.0, + "step": 13963 + }, + { + "epoch": 1.7763643302378833, + "ewc_loss": 0.0436701774597168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1835088773514144e-05, + "grad_norm": 28.869585037231445, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8514426946640015, + "num_tokens": 532599200.0, + "step": 13964 + }, + { + "epoch": 1.7764915405164738, + "ewc_loss": 0.043861694633960724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1930847651674412e-05, + "grad_norm": 28.859630584716797, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8778741359710693, + "num_tokens": 532636335.0, + "step": 13965 + }, + { + "epoch": 1.7766187507950644, + "ewc_loss": 0.04388396441936493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1941981685813516e-05, + "grad_norm": 28.975296020507812, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8791218996047974, + "num_tokens": 532678337.0, + "step": 13966 + }, + { + "epoch": 1.776745961073655, + "ewc_loss": 0.04386912286281586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1934562028036453e-05, + "grad_norm": 28.796213150024414, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8631652593612671, + "num_tokens": 532715051.0, + "step": 13967 + }, + { + "epoch": 1.7768731713522452, + "ewc_loss": 0.043884482234716415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1942241801298223e-05, + "grad_norm": 28.9885311126709, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8777830004692078, + "num_tokens": 532750836.0, + "step": 13968 + }, + { + "epoch": 1.7770003816308357, + "ewc_loss": 0.04393704608082771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1968522560200654e-05, + "grad_norm": 29.050905227661133, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8737776279449463, + "num_tokens": 532790331.0, + "step": 13969 + }, + { + "epoch": 1.7771275919094263, + "ewc_loss": 0.043841760605573654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192087958974298e-05, + "grad_norm": 28.988706588745117, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8630942702293396, + "num_tokens": 532830169.0, + "step": 13970 + }, + { + "epoch": 1.7772548021880168, + "ewc_loss": 0.043917059898376465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1958529032417573e-05, + "grad_norm": 29.06174659729004, + "learning_rate": 1e-06, + "loss": 0.4453, + 
"mean_token_accuracy": 0.8611879348754883, + "num_tokens": 532865957.0, + "step": 13971 + }, + { + "epoch": 1.7773820124666073, + "ewc_loss": 0.04381147027015686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1905734683969058e-05, + "grad_norm": 28.98740577697754, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8637908697128296, + "num_tokens": 532906112.0, + "step": 13972 + }, + { + "epoch": 1.7775092227451978, + "ewc_loss": 0.043758757412433624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1879379346501082e-05, + "grad_norm": 28.889678955078125, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8732814192771912, + "num_tokens": 532939676.0, + "step": 13973 + }, + { + "epoch": 1.7776364330237884, + "ewc_loss": 0.043828584253787994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1914291210123338e-05, + "grad_norm": 29.006196975708008, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.8490551114082336, + "num_tokens": 532977668.0, + "step": 13974 + }, + { + "epoch": 1.7777636433023787, + "ewc_loss": 0.04374520480632782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.187260179198347e-05, + "grad_norm": 28.878162384033203, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8657652139663696, + "num_tokens": 533013862.0, + "step": 13975 + }, + { + "epoch": 1.7778908535809692, + "ewc_loss": 0.04383917152881622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.191958628827706e-05, + "grad_norm": 28.867929458618164, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8555732369422913, + "num_tokens": 533051222.0, + "step": 13976 + }, + { + "epoch": 1.7780180638595597, + "ewc_loss": 0.043852075934410095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1926038243691437e-05, + "grad_norm": 28.955842971801758, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.86418616771698, + "num_tokens": 533090182.0, + "step": 13977 + }, + { + "epoch": 
1.7781452741381503, + "ewc_loss": 0.04390450194478035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1952251699985936e-05, + "grad_norm": 28.95246124267578, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8803200721740723, + "num_tokens": 533129600.0, + "step": 13978 + }, + { + "epoch": 1.7782724844167408, + "ewc_loss": 0.043804701417684555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1902351363678463e-05, + "grad_norm": 28.946880340576172, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8669238090515137, + "num_tokens": 533172351.0, + "step": 13979 + }, + { + "epoch": 1.7783996946953313, + "ewc_loss": 0.043883003294467926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.194150147261098e-05, + "grad_norm": 28.959102630615234, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.864196240901947, + "num_tokens": 533211583.0, + "step": 13980 + }, + { + "epoch": 1.7785269049739219, + "ewc_loss": 0.04382319375872612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1911597286816686e-05, + "grad_norm": 28.887056350708008, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8727840185165405, + "num_tokens": 533249960.0, + "step": 13981 + }, + { + "epoch": 1.7786541152525124, + "ewc_loss": 0.043795373290777206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.189768747484777e-05, + "grad_norm": 28.935630798339844, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8745126724243164, + "num_tokens": 533287328.0, + "step": 13982 + }, + { + "epoch": 1.778781325531103, + "ewc_loss": 0.04385233670473099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1926169210928492e-05, + "grad_norm": 28.854412078857422, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.86806321144104, + "num_tokens": 533327772.0, + "step": 13983 + }, + { + "epoch": 1.7789085358096934, + "ewc_loss": 0.04375544190406799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.187772042816505e-05, + "grad_norm": 28.94046974182129, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8608419895172119, + "num_tokens": 533372004.0, + "step": 13984 + }, + { + "epoch": 1.779035746088284, + "ewc_loss": 0.04386312887072563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193156433349941e-05, + "grad_norm": 28.999576568603516, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8642849922180176, + "num_tokens": 533409502.0, + "step": 13985 + }, + { + "epoch": 1.7791629563668745, + "ewc_loss": 0.04376725107431412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1883624867768958e-05, + "grad_norm": 28.912565231323242, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8557709455490112, + "num_tokens": 533451160.0, + "step": 13986 + }, + { + "epoch": 1.779290166645465, + "ewc_loss": 0.043860018253326416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1930009097559378e-05, + "grad_norm": 28.874534606933594, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8749238848686218, + "num_tokens": 533480413.0, + "step": 13987 + }, + { + "epoch": 1.7794173769240555, + "ewc_loss": 0.04378234222531319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.189117185480427e-05, + "grad_norm": 28.8430118560791, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.855708122253418, + "num_tokens": 533518826.0, + "step": 13988 + }, + { + "epoch": 1.779544587202646, + "ewc_loss": 0.043881818652153015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1940908482065424e-05, + "grad_norm": 28.964773178100586, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8564438223838806, + "num_tokens": 533563760.0, + "step": 13989 + }, + { + "epoch": 1.7796717974812366, + "ewc_loss": 0.043844182044267654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1922091036685742e-05, + "grad_norm": 28.798931121826172, + "learning_rate": 1e-06, + "loss": 0.4283, + 
"mean_token_accuracy": 0.8668150305747986, + "num_tokens": 533601499.0, + "step": 13990 + }, + { + "epoch": 1.7797990077598271, + "ewc_loss": 0.043839000165462494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1919500795775093e-05, + "grad_norm": 29.027021408081055, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8708888292312622, + "num_tokens": 533640757.0, + "step": 13991 + }, + { + "epoch": 1.7799262180384177, + "ewc_loss": 0.04396473243832588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.198236688855104e-05, + "grad_norm": 28.95955467224121, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8690599799156189, + "num_tokens": 533678047.0, + "step": 13992 + }, + { + "epoch": 1.780053428317008, + "ewc_loss": 0.04380499944090843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1902500520809554e-05, + "grad_norm": 28.887826919555664, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8686698079109192, + "num_tokens": 533720903.0, + "step": 13993 + }, + { + "epoch": 1.7801806385955985, + "ewc_loss": 0.04386613890528679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193306863773614e-05, + "grad_norm": 28.84018325805664, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8686878681182861, + "num_tokens": 533759720.0, + "step": 13994 + }, + { + "epoch": 1.780307848874189, + "ewc_loss": 0.04386945068836212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1934725737082772e-05, + "grad_norm": 28.957576751708984, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8547460436820984, + "num_tokens": 533796122.0, + "step": 13995 + }, + { + "epoch": 1.7804350591527796, + "ewc_loss": 0.04393940046429634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1969699446344748e-05, + "grad_norm": 28.887569427490234, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.872454047203064, + "num_tokens": 533836326.0, + "step": 13996 + }, + { + "epoch": 
1.78056226943137, + "ewc_loss": 0.043933432549238205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1966716303722933e-05, + "grad_norm": 29.015378952026367, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8779221177101135, + "num_tokens": 533867874.0, + "step": 13997 + }, + { + "epoch": 1.7806894797099606, + "ewc_loss": 0.043982747942209244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19913745240774e-05, + "grad_norm": 28.946413040161133, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8793401718139648, + "num_tokens": 533902877.0, + "step": 13998 + }, + { + "epoch": 1.780816689988551, + "ewc_loss": 0.043852560222148895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192628016928211e-05, + "grad_norm": 28.8460750579834, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.870591402053833, + "num_tokens": 533937440.0, + "step": 13999 + }, + { + "epoch": 1.7809439002671414, + "ewc_loss": 0.04396471753716469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1982359612593427e-05, + "grad_norm": 28.99834442138672, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8772864937782288, + "num_tokens": 533972308.0, + "step": 14000 + }, + { + "epoch": 1.781071110545732, + "ewc_loss": 0.04391837120056152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1959185687592253e-05, + "grad_norm": 28.91628074645996, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8546324968338013, + "num_tokens": 534011913.0, + "step": 14001 + }, + { + "epoch": 1.7811983208243225, + "ewc_loss": 0.04386765882372856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1933828975306824e-05, + "grad_norm": 28.9300594329834, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8787838220596313, + "num_tokens": 534057235.0, + "step": 14002 + }, + { + "epoch": 1.781325531102913, + "ewc_loss": 0.043891116976737976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.1945557818980888e-05, + "grad_norm": 28.95026969909668, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8688088655471802, + "num_tokens": 534092850.0, + "step": 14003 + }, + { + "epoch": 1.7814527413815036, + "ewc_loss": 0.043870341032743454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.193517138948664e-05, + "grad_norm": 28.871885299682617, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8724865913391113, + "num_tokens": 534135310.0, + "step": 14004 + }, + { + "epoch": 1.781579951660094, + "ewc_loss": 0.04382753372192383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1913767341175117e-05, + "grad_norm": 28.926149368286133, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8660634756088257, + "num_tokens": 534176008.0, + "step": 14005 + }, + { + "epoch": 1.7817071619386846, + "ewc_loss": 0.043905939906835556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1952970200800337e-05, + "grad_norm": 28.889041900634766, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8671677708625793, + "num_tokens": 534208217.0, + "step": 14006 + }, + { + "epoch": 1.7818343722172751, + "ewc_loss": 0.043927982449531555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.196399145759642e-05, + "grad_norm": 28.901504516601562, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8718294501304626, + "num_tokens": 534247564.0, + "step": 14007 + }, + { + "epoch": 1.7819615824958657, + "ewc_loss": 0.043958842754364014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19794219447067e-05, + "grad_norm": 28.82782745361328, + "learning_rate": 1e-06, + "loss": 0.4825, + "mean_token_accuracy": 0.8491126298904419, + "num_tokens": 534281144.0, + "step": 14008 + }, + { + "epoch": 1.7820887927744562, + "ewc_loss": 0.04385381191968918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.192690590163693e-05, + "grad_norm": 28.87838363647461, + "learning_rate": 1e-06, + "loss": 0.4625, + 
"mean_token_accuracy": 0.858254075050354, + "num_tokens": 534317893.0, + "step": 14009 + }, + { + "epoch": 1.7822160030530467, + "ewc_loss": 0.044077273458242416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2038637325749733e-05, + "grad_norm": 28.898426055908203, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8651441335678101, + "num_tokens": 534356116.0, + "step": 14010 + }, + { + "epoch": 1.7823432133316373, + "ewc_loss": 0.04390770569443703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1953852410661057e-05, + "grad_norm": 28.82097053527832, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8420504331588745, + "num_tokens": 534390499.0, + "step": 14011 + }, + { + "epoch": 1.7824704236102278, + "ewc_loss": 0.04399179667234421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.199589835072402e-05, + "grad_norm": 28.920991897583008, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8628898859024048, + "num_tokens": 534424242.0, + "step": 14012 + }, + { + "epoch": 1.7825976338888183, + "ewc_loss": 0.043999239802360535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1999620003043674e-05, + "grad_norm": 28.87379264831543, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8586648106575012, + "num_tokens": 534467465.0, + "step": 14013 + }, + { + "epoch": 1.7827248441674088, + "ewc_loss": 0.04401220753788948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200610288127791e-05, + "grad_norm": 28.948484420776367, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8612922430038452, + "num_tokens": 534502959.0, + "step": 14014 + }, + { + "epoch": 1.7828520544459994, + "ewc_loss": 0.044050563126802444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.20252823055489e-05, + "grad_norm": 28.956464767456055, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8685846328735352, + "num_tokens": 534549888.0, + "step": 14015 + }, + { + "epoch": 
1.78297926472459, + "ewc_loss": 0.04398970678448677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1994854250806384e-05, + "grad_norm": 28.819726943969727, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.868385910987854, + "num_tokens": 534591686.0, + "step": 14016 + }, + { + "epoch": 1.7831064750031802, + "ewc_loss": 0.04396374896168709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.198187394242268e-05, + "grad_norm": 28.850479125976562, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8838004469871521, + "num_tokens": 534628584.0, + "step": 14017 + }, + { + "epoch": 1.7832336852817707, + "ewc_loss": 0.04405456408858299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2027281374903396e-05, + "grad_norm": 28.94375991821289, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8568634986877441, + "num_tokens": 534665684.0, + "step": 14018 + }, + { + "epoch": 1.7833608955603613, + "ewc_loss": 0.04412740841507912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2063704818719998e-05, + "grad_norm": 28.980459213256836, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8696168661117554, + "num_tokens": 534701604.0, + "step": 14019 + }, + { + "epoch": 1.7834881058389518, + "ewc_loss": 0.043958500027656555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1979249140713364e-05, + "grad_norm": 28.865921020507812, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8507862091064453, + "num_tokens": 534741765.0, + "step": 14020 + }, + { + "epoch": 1.7836153161175423, + "ewc_loss": 0.043995290994644165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1997646399540827e-05, + "grad_norm": 28.887781143188477, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8752458095550537, + "num_tokens": 534780331.0, + "step": 14021 + }, + { + "epoch": 1.7837425263961328, + "ewc_loss": 0.04404867812991142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.202433825004846e-05, + "grad_norm": 28.848731994628906, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8547568321228027, + "num_tokens": 534816254.0, + "step": 14022 + }, + { + "epoch": 1.7838697366747234, + "ewc_loss": 0.04406188055872917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.203094118158333e-05, + "grad_norm": 28.933504104614258, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8653392791748047, + "num_tokens": 534851367.0, + "step": 14023 + }, + { + "epoch": 1.7839969469533137, + "ewc_loss": 0.04411080479621887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2055402951082215e-05, + "grad_norm": 28.921539306640625, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8641552925109863, + "num_tokens": 534889077.0, + "step": 14024 + }, + { + "epoch": 1.7841241572319042, + "ewc_loss": 0.04401513934135437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200756898673717e-05, + "grad_norm": 28.875408172607422, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8561726808547974, + "num_tokens": 534924249.0, + "step": 14025 + }, + { + "epoch": 1.7842513675104947, + "ewc_loss": 0.04398288577795029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1991443645674735e-05, + "grad_norm": 28.863100051879883, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8511596918106079, + "num_tokens": 534966370.0, + "step": 14026 + }, + { + "epoch": 1.7843785777890853, + "ewc_loss": 0.04412217438220978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2061087292968296e-05, + "grad_norm": 28.927230834960938, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8730945587158203, + "num_tokens": 535005869.0, + "step": 14027 + }, + { + "epoch": 1.7845057880676758, + "ewc_loss": 0.044033754616975784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2016876755515113e-05, + "grad_norm": 28.892982482910156, + "learning_rate": 1e-06, + "loss": 0.4509, + 
"mean_token_accuracy": 0.8635139465332031, + "num_tokens": 535042057.0, + "step": 14028 + }, + { + "epoch": 1.7846329983462663, + "ewc_loss": 0.044027023017406464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2013511625118554e-05, + "grad_norm": 28.901599884033203, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8609050512313843, + "num_tokens": 535074410.0, + "step": 14029 + }, + { + "epoch": 1.7847602086248568, + "ewc_loss": 0.044074587523937225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2037293092580512e-05, + "grad_norm": 28.93438148498535, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8576672077178955, + "num_tokens": 535120356.0, + "step": 14030 + }, + { + "epoch": 1.7848874189034474, + "ewc_loss": 0.04401955008506775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200977542088367e-05, + "grad_norm": 28.94077491760254, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8566043376922607, + "num_tokens": 535156939.0, + "step": 14031 + }, + { + "epoch": 1.785014629182038, + "ewc_loss": 0.04411297291517258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2056487068766728e-05, + "grad_norm": 29.11564826965332, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8691027164459229, + "num_tokens": 535195553.0, + "step": 14032 + }, + { + "epoch": 1.7851418394606284, + "ewc_loss": 0.044034793972969055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2017396986484528e-05, + "grad_norm": 28.95775032043457, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8714426159858704, + "num_tokens": 535235692.0, + "step": 14033 + }, + { + "epoch": 1.785269049739219, + "ewc_loss": 0.043894845992326736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1947422283119522e-05, + "grad_norm": 28.921966552734375, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.865748941898346, + "num_tokens": 535269265.0, + "step": 14034 + }, + { + "epoch": 
1.7853962600178095, + "ewc_loss": 0.04408174753189087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2040874682716094e-05, + "grad_norm": 29.007488250732422, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8819437026977539, + "num_tokens": 535303825.0, + "step": 14035 + }, + { + "epoch": 1.7855234702964, + "ewc_loss": 0.043954480439424515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.197724097641185e-05, + "grad_norm": 28.868499755859375, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8611555099487305, + "num_tokens": 535338802.0, + "step": 14036 + }, + { + "epoch": 1.7856506805749905, + "ewc_loss": 0.044025711715221405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2012854969943874e-05, + "grad_norm": 28.938541412353516, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8737583160400391, + "num_tokens": 535376592.0, + "step": 14037 + }, + { + "epoch": 1.785777890853581, + "ewc_loss": 0.043993134051561356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1996567738824524e-05, + "grad_norm": 28.857303619384766, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8636519312858582, + "num_tokens": 535416528.0, + "step": 14038 + }, + { + "epoch": 1.7859051011321716, + "ewc_loss": 0.04409598559141159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.204799238825217e-05, + "grad_norm": 28.961793899536133, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8630265593528748, + "num_tokens": 535450892.0, + "step": 14039 + }, + { + "epoch": 1.7860323114107621, + "ewc_loss": 0.044128235429525375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2064117729314603e-05, + "grad_norm": 29.01665496826172, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8694027662277222, + "num_tokens": 535488198.0, + "step": 14040 + }, + { + "epoch": 1.7861595216893527, + "ewc_loss": 0.04402846843004227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.201423376391176e-05, + "grad_norm": 28.822185516357422, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8652000427246094, + "num_tokens": 535526020.0, + "step": 14041 + }, + { + "epoch": 1.786286731967943, + "ewc_loss": 0.043982651084661484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1991325411363505e-05, + "grad_norm": 28.94675064086914, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8714155554771423, + "num_tokens": 535560400.0, + "step": 14042 + }, + { + "epoch": 1.7864139422465335, + "ewc_loss": 0.0441771037876606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2088552213972434e-05, + "grad_norm": 28.889402389526367, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8605620265007019, + "num_tokens": 535594699.0, + "step": 14043 + }, + { + "epoch": 1.786541152525124, + "ewc_loss": 0.044040247797966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2020123651600443e-05, + "grad_norm": 28.970619201660156, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.87217116355896, + "num_tokens": 535627019.0, + "step": 14044 + }, + { + "epoch": 1.7866683628037145, + "ewc_loss": 0.04405492916703224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2027465092833154e-05, + "grad_norm": 28.832799911499023, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8764457106590271, + "num_tokens": 535667479.0, + "step": 14045 + }, + { + "epoch": 1.786795573082305, + "ewc_loss": 0.044024188071489334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.201209463237319e-05, + "grad_norm": 28.9494686126709, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8611916899681091, + "num_tokens": 535705211.0, + "step": 14046 + }, + { + "epoch": 1.7869227833608956, + "ewc_loss": 0.04406879469752312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2034397261450067e-05, + "grad_norm": 28.85114097595215, + "learning_rate": 1e-06, + "loss": 0.4848, + 
"mean_token_accuracy": 0.8476871252059937, + "num_tokens": 535744907.0, + "step": 14047 + }, + { + "epoch": 1.787049993639486, + "ewc_loss": 0.044047240167856216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2023619749234058e-05, + "grad_norm": 28.973970413208008, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8662588596343994, + "num_tokens": 535782526.0, + "step": 14048 + }, + { + "epoch": 1.7871772039180764, + "ewc_loss": 0.04410593584179878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2052967324270867e-05, + "grad_norm": 28.85994529724121, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.881951093673706, + "num_tokens": 535824153.0, + "step": 14049 + }, + { + "epoch": 1.787304414196667, + "ewc_loss": 0.04400769993662834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2003849153406918e-05, + "grad_norm": 28.929819107055664, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.878278374671936, + "num_tokens": 535859019.0, + "step": 14050 + }, + { + "epoch": 1.7874316244752575, + "ewc_loss": 0.04407365247607231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.20368256123038e-05, + "grad_norm": 28.930898666381836, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.878481924533844, + "num_tokens": 535896866.0, + "step": 14051 + }, + { + "epoch": 1.787558834753848, + "ewc_loss": 0.04401954263448715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2009771782904863e-05, + "grad_norm": 28.860193252563477, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8675738573074341, + "num_tokens": 535937128.0, + "step": 14052 + }, + { + "epoch": 1.7876860450324386, + "ewc_loss": 0.04398877173662186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1994386770529673e-05, + "grad_norm": 28.914188385009766, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8824612498283386, + "num_tokens": 535972727.0, + "step": 14053 + }, + { + "epoch": 
1.787813255311029, + "ewc_loss": 0.04407498612999916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2037493181414902e-05, + "grad_norm": 29.05039405822754, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8716264963150024, + "num_tokens": 536011491.0, + "step": 14054 + }, + { + "epoch": 1.7879404655896196, + "ewc_loss": 0.044054318219423294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2027159502613358e-05, + "grad_norm": 28.93247413635254, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8704454898834229, + "num_tokens": 536048074.0, + "step": 14055 + }, + { + "epoch": 1.7880676758682101, + "ewc_loss": 0.04399292171001434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1996460418449715e-05, + "grad_norm": 28.900989532470703, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8736815452575684, + "num_tokens": 536089554.0, + "step": 14056 + }, + { + "epoch": 1.7881948861468007, + "ewc_loss": 0.04410157725214958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.205078817496542e-05, + "grad_norm": 28.976659774780273, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8764431476593018, + "num_tokens": 536127672.0, + "step": 14057 + }, + { + "epoch": 1.7883220964253912, + "ewc_loss": 0.044035233557224274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2017617084202357e-05, + "grad_norm": 28.97055435180664, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8684074878692627, + "num_tokens": 536157517.0, + "step": 14058 + }, + { + "epoch": 1.7884493067039817, + "ewc_loss": 0.044037383049726486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2018692106939852e-05, + "grad_norm": 28.931962966918945, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8792926073074341, + "num_tokens": 536192855.0, + "step": 14059 + }, + { + "epoch": 1.7885765169825723, + "ewc_loss": 0.044039178639650345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.20195888687158e-05, + "grad_norm": 28.92110252380371, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8692491054534912, + "num_tokens": 536231727.0, + "step": 14060 + }, + { + "epoch": 1.7887037272611628, + "ewc_loss": 0.044016074389219284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200803646701388e-05, + "grad_norm": 28.940744400024414, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8805849552154541, + "num_tokens": 536270738.0, + "step": 14061 + }, + { + "epoch": 1.7888309375397533, + "ewc_loss": 0.044083744287490845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2041871488909237e-05, + "grad_norm": 29.006229400634766, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8861351609230042, + "num_tokens": 536313006.0, + "step": 14062 + }, + { + "epoch": 1.7889581478183438, + "ewc_loss": 0.04401155561208725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2005777282174677e-05, + "grad_norm": 28.910306930541992, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8606619834899902, + "num_tokens": 536350697.0, + "step": 14063 + }, + { + "epoch": 1.7890853580969344, + "ewc_loss": 0.04403286799788475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2016434741090052e-05, + "grad_norm": 28.938751220703125, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8761295080184937, + "num_tokens": 536394252.0, + "step": 14064 + }, + { + "epoch": 1.789212568375525, + "ewc_loss": 0.04393453150987625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1967265638522804e-05, + "grad_norm": 29.00574493408203, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8660604357719421, + "num_tokens": 536426591.0, + "step": 14065 + }, + { + "epoch": 1.7893397786541152, + "ewc_loss": 0.04407454654574394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2037273083697073e-05, + "grad_norm": 28.993515014648438, + "learning_rate": 1e-06, + "loss": 0.4666, + 
"mean_token_accuracy": 0.8558695316314697, + "num_tokens": 536468567.0, + "step": 14066 + }, + { + "epoch": 1.7894669889327057, + "ewc_loss": 0.04402494803071022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2012474801158533e-05, + "grad_norm": 28.9742488861084, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8583167791366577, + "num_tokens": 536506352.0, + "step": 14067 + }, + { + "epoch": 1.7895941992112963, + "ewc_loss": 0.04399143531918526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1995718270773068e-05, + "grad_norm": 28.97201919555664, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8602981567382812, + "num_tokens": 536538122.0, + "step": 14068 + }, + { + "epoch": 1.7897214094898868, + "ewc_loss": 0.04400936886668205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2004684069543146e-05, + "grad_norm": 28.953731536865234, + "learning_rate": 1e-06, + "loss": 0.4672, + "mean_token_accuracy": 0.857723593711853, + "num_tokens": 536574617.0, + "step": 14069 + }, + { + "epoch": 1.7898486197684773, + "ewc_loss": 0.044003237038850784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200161907239817e-05, + "grad_norm": 29.02910614013672, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.879780650138855, + "num_tokens": 536611895.0, + "step": 14070 + }, + { + "epoch": 1.7899758300470678, + "ewc_loss": 0.04401000961661339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.200500421167817e-05, + "grad_norm": 28.893281936645508, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8599368333816528, + "num_tokens": 536651336.0, + "step": 14071 + }, + { + "epoch": 1.7901030403256584, + "ewc_loss": 0.04396959766745567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1984798877383582e-05, + "grad_norm": 28.978364944458008, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8631173372268677, + "num_tokens": 536692566.0, + "step": 14072 + }, + { + "epoch": 
1.7902302506042487, + "ewc_loss": 0.044079411774873734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2039705072529614e-05, + "grad_norm": 29.003190994262695, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8735685348510742, + "num_tokens": 536730855.0, + "step": 14073 + }, + { + "epoch": 1.7903574608828392, + "ewc_loss": 0.0439772829413414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1988642401993275e-05, + "grad_norm": 28.928586959838867, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8654863834381104, + "num_tokens": 536766478.0, + "step": 14074 + }, + { + "epoch": 1.7904846711614297, + "ewc_loss": 0.044024184346199036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2012092813383788e-05, + "grad_norm": 28.88965606689453, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8625807166099548, + "num_tokens": 536799900.0, + "step": 14075 + }, + { + "epoch": 1.7906118814400203, + "ewc_loss": 0.04402874782681465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2014373826095834e-05, + "grad_norm": 28.97332191467285, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8735643625259399, + "num_tokens": 536843597.0, + "step": 14076 + }, + { + "epoch": 1.7907390917186108, + "ewc_loss": 0.04407436028122902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2037180315237492e-05, + "grad_norm": 28.96998405456543, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8787215352058411, + "num_tokens": 536879189.0, + "step": 14077 + }, + { + "epoch": 1.7908663019972013, + "ewc_loss": 0.044031642377376556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2015821741661057e-05, + "grad_norm": 28.940471649169922, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8713577389717102, + "num_tokens": 536918099.0, + "step": 14078 + }, + { + "epoch": 1.7909935122757918, + "ewc_loss": 0.04400429129600525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.20021465793252e-05, + "grad_norm": 28.913427352905273, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.857787549495697, + "num_tokens": 536954410.0, + "step": 14079 + }, + { + "epoch": 1.7911207225543824, + "ewc_loss": 0.0440962053835392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2048103346605785e-05, + "grad_norm": 29.17374038696289, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8899166584014893, + "num_tokens": 536986688.0, + "step": 14080 + }, + { + "epoch": 1.791247932832973, + "ewc_loss": 0.04402673617005348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2013367924955674e-05, + "grad_norm": 28.89818000793457, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8677959442138672, + "num_tokens": 537024181.0, + "step": 14081 + }, + { + "epoch": 1.7913751431115634, + "ewc_loss": 0.04398689791560173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1993448171997443e-05, + "grad_norm": 29.156429290771484, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8712194561958313, + "num_tokens": 537064174.0, + "step": 14082 + }, + { + "epoch": 1.791502353390154, + "ewc_loss": 0.04408067837357521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.204033989983145e-05, + "grad_norm": 28.988876342773438, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8599042296409607, + "num_tokens": 537100279.0, + "step": 14083 + }, + { + "epoch": 1.7916295636687445, + "ewc_loss": 0.043972063809633255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1986032152199186e-05, + "grad_norm": 28.926971435546875, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8426852226257324, + "num_tokens": 537135339.0, + "step": 14084 + }, + { + "epoch": 1.791756773947335, + "ewc_loss": 0.04399418085813522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.199708978878334e-05, + "grad_norm": 29.150205612182617, + "learning_rate": 1e-06, + "loss": 0.438, + 
"mean_token_accuracy": 0.8619869947433472, + "num_tokens": 537169608.0, + "step": 14085 + }, + { + "epoch": 1.7918839842259255, + "ewc_loss": 0.043992310762405396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.199615482822992e-05, + "grad_norm": 28.868104934692383, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8757030367851257, + "num_tokens": 537205560.0, + "step": 14086 + }, + { + "epoch": 1.792011194504516, + "ewc_loss": 0.043903037905693054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1951518647256307e-05, + "grad_norm": 29.03094482421875, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8564563989639282, + "num_tokens": 537238697.0, + "step": 14087 + }, + { + "epoch": 1.7921384047831066, + "ewc_loss": 0.044107936322689056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2053967768442817e-05, + "grad_norm": 29.213199615478516, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8628363609313965, + "num_tokens": 537282365.0, + "step": 14088 + }, + { + "epoch": 1.7922656150616971, + "ewc_loss": 0.043979767709970474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1989884771755897e-05, + "grad_norm": 28.939037322998047, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.875569224357605, + "num_tokens": 537320805.0, + "step": 14089 + }, + { + "epoch": 1.7923928253402877, + "ewc_loss": 0.04388369247317314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1941847080597654e-05, + "grad_norm": 28.958667755126953, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8700789213180542, + "num_tokens": 537354607.0, + "step": 14090 + }, + { + "epoch": 1.792520035618878, + "ewc_loss": 0.044087883085012436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2043941498850472e-05, + "grad_norm": 29.003009796142578, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8780480623245239, + "num_tokens": 537386672.0, + "step": 14091 + }, + { + "epoch": 
1.7926472458974685, + "ewc_loss": 0.043913353234529495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1956677301204763e-05, + "grad_norm": 28.895021438598633, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8697488307952881, + "num_tokens": 537430882.0, + "step": 14092 + }, + { + "epoch": 1.792774456176059, + "ewc_loss": 0.04401518031954765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2007590814610012e-05, + "grad_norm": 29.04016876220703, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8699823617935181, + "num_tokens": 537465114.0, + "step": 14093 + }, + { + "epoch": 1.7929016664546495, + "ewc_loss": 0.04407702386379242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2038511815480888e-05, + "grad_norm": 29.10451316833496, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8631125092506409, + "num_tokens": 537502820.0, + "step": 14094 + }, + { + "epoch": 1.79302887673324, + "ewc_loss": 0.044056039303541183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2028019884601235e-05, + "grad_norm": 29.025102615356445, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8760530948638916, + "num_tokens": 537537422.0, + "step": 14095 + }, + { + "epoch": 1.7931560870118306, + "ewc_loss": 0.043952085077762604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1976042262394913e-05, + "grad_norm": 28.94357681274414, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8724353313446045, + "num_tokens": 537566828.0, + "step": 14096 + }, + { + "epoch": 1.793283297290421, + "ewc_loss": 0.044044654816389084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202232826675754e-05, + "grad_norm": 29.136707305908203, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8653962016105652, + "num_tokens": 537606102.0, + "step": 14097 + }, + { + "epoch": 1.7934105075690114, + "ewc_loss": 0.044079869985580444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.203993426519446e-05, + "grad_norm": 29.03367042541504, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8749841451644897, + "num_tokens": 537638527.0, + "step": 14098 + }, + { + "epoch": 1.793537717847602, + "ewc_loss": 0.0439915657043457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1995781935402192e-05, + "grad_norm": 28.872264862060547, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8667681217193604, + "num_tokens": 537683375.0, + "step": 14099 + }, + { + "epoch": 1.7936649281261925, + "ewc_loss": 0.04409889504313469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2049447579775006e-05, + "grad_norm": 29.134172439575195, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8621408939361572, + "num_tokens": 537731354.0, + "step": 14100 + }, + { + "epoch": 1.793792138404783, + "ewc_loss": 0.044103290885686874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.205164491897449e-05, + "grad_norm": 29.274749755859375, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8626583218574524, + "num_tokens": 537770055.0, + "step": 14101 + }, + { + "epoch": 1.7939193486833735, + "ewc_loss": 0.043993741273880005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1996871510054916e-05, + "grad_norm": 29.012361526489258, + "learning_rate": 1e-06, + "loss": 0.4933, + "mean_token_accuracy": 0.8471711874008179, + "num_tokens": 537807977.0, + "step": 14102 + }, + { + "epoch": 1.794046558961964, + "ewc_loss": 0.04390878230333328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1954390831524506e-05, + "grad_norm": 28.986560821533203, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8622856140136719, + "num_tokens": 537847809.0, + "step": 14103 + }, + { + "epoch": 1.7941737692405546, + "ewc_loss": 0.04403096064925194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.201547977165319e-05, + "grad_norm": 29.342119216918945, + "learning_rate": 1e-06, + "loss": 0.4908, + 
"mean_token_accuracy": 0.8490805625915527, + "num_tokens": 537885506.0, + "step": 14104 + }, + { + "epoch": 1.7943009795191451, + "ewc_loss": 0.04394375905394554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.19718804146396e-05, + "grad_norm": 28.80622673034668, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8630231618881226, + "num_tokens": 537918936.0, + "step": 14105 + }, + { + "epoch": 1.7944281897977357, + "ewc_loss": 0.04385196045041084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1925980036030523e-05, + "grad_norm": 28.979604721069336, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8720283508300781, + "num_tokens": 537954205.0, + "step": 14106 + }, + { + "epoch": 1.7945554000763262, + "ewc_loss": 0.04412095993757248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.206047975050751e-05, + "grad_norm": 29.271371841430664, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8697555661201477, + "num_tokens": 537989190.0, + "step": 14107 + }, + { + "epoch": 1.7946826103549167, + "ewc_loss": 0.043948251754045486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1974125047563575e-05, + "grad_norm": 28.938913345336914, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8590380549430847, + "num_tokens": 538028852.0, + "step": 14108 + }, + { + "epoch": 1.7948098206335072, + "ewc_loss": 0.04386603459715843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1933017706032842e-05, + "grad_norm": 28.793792724609375, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8729695081710815, + "num_tokens": 538061245.0, + "step": 14109 + }, + { + "epoch": 1.7949370309120978, + "ewc_loss": 0.04405995458364487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202997711719945e-05, + "grad_norm": 29.083837509155273, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8797764778137207, + "num_tokens": 538103238.0, + "step": 14110 + }, + { + "epoch": 
1.7950642411906883, + "ewc_loss": 0.04404475912451744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202237919846084e-05, + "grad_norm": 28.900150299072266, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8534014225006104, + "num_tokens": 538142692.0, + "step": 14111 + }, + { + "epoch": 1.7951914514692788, + "ewc_loss": 0.04389985278248787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1949927031528205e-05, + "grad_norm": 28.8369140625, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8659703731536865, + "num_tokens": 538183155.0, + "step": 14112 + }, + { + "epoch": 1.7953186617478694, + "ewc_loss": 0.0440860278904438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2043013814254664e-05, + "grad_norm": 28.927820205688477, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.865825891494751, + "num_tokens": 538222063.0, + "step": 14113 + }, + { + "epoch": 1.7954458720264599, + "ewc_loss": 0.04404416307806969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2022080884198658e-05, + "grad_norm": 28.887903213500977, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8694294691085815, + "num_tokens": 538259847.0, + "step": 14114 + }, + { + "epoch": 1.7955730823050502, + "ewc_loss": 0.04409746080636978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.204873089795001e-05, + "grad_norm": 28.89818000793457, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8738155961036682, + "num_tokens": 538298557.0, + "step": 14115 + }, + { + "epoch": 1.7957002925836407, + "ewc_loss": 0.04405726492404938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202863288403023e-05, + "grad_norm": 28.858070373535156, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8651378154754639, + "num_tokens": 538337926.0, + "step": 14116 + }, + { + "epoch": 1.7958275028622313, + "ewc_loss": 0.044139374047517776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.206968747486826e-05, + "grad_norm": 29.13642692565918, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8715882301330566, + "num_tokens": 538382755.0, + "step": 14117 + }, + { + "epoch": 1.7959547131408218, + "ewc_loss": 0.04412325099110603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2061625713831745e-05, + "grad_norm": 28.948062896728516, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8497878313064575, + "num_tokens": 538417639.0, + "step": 14118 + }, + { + "epoch": 1.7960819234194123, + "ewc_loss": 0.04401909187436104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2009546228218824e-05, + "grad_norm": 29.04751205444336, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8615819215774536, + "num_tokens": 538458444.0, + "step": 14119 + }, + { + "epoch": 1.7962091336980028, + "ewc_loss": 0.04415266215801239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207633042417001e-05, + "grad_norm": 28.86212158203125, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8685842752456665, + "num_tokens": 538496580.0, + "step": 14120 + }, + { + "epoch": 1.7963363439765934, + "ewc_loss": 0.043989699333906174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1994848793838173e-05, + "grad_norm": 28.944135665893555, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8763547539710999, + "num_tokens": 538538474.0, + "step": 14121 + }, + { + "epoch": 1.7964635542551837, + "ewc_loss": 0.044112954288721085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.205647797381971e-05, + "grad_norm": 28.91176414489746, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8694433569908142, + "num_tokens": 538576828.0, + "step": 14122 + }, + { + "epoch": 1.7965907645337742, + "ewc_loss": 0.04404912889003754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2024563804734498e-05, + "grad_norm": 28.991384506225586, + "learning_rate": 1e-06, + "loss": 0.4141, + 
"mean_token_accuracy": 0.875786304473877, + "num_tokens": 538611630.0, + "step": 14123 + }, + { + "epoch": 1.7967179748123647, + "ewc_loss": 0.04405143857002258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202571886300575e-05, + "grad_norm": 28.8847713470459, + "learning_rate": 1e-06, + "loss": 0.5176, + "mean_token_accuracy": 0.8427655696868896, + "num_tokens": 538659537.0, + "step": 14124 + }, + { + "epoch": 1.7968451850909553, + "ewc_loss": 0.04398747906088829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.199373921030201e-05, + "grad_norm": 28.922210693359375, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8755049109458923, + "num_tokens": 538700216.0, + "step": 14125 + }, + { + "epoch": 1.7969723953695458, + "ewc_loss": 0.04413430020213127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2067149984650314e-05, + "grad_norm": 28.879804611206055, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8649592995643616, + "num_tokens": 538737719.0, + "step": 14126 + }, + { + "epoch": 1.7970996056481363, + "ewc_loss": 0.04406985640525818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2034928406355903e-05, + "grad_norm": 28.96950912475586, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8748132586479187, + "num_tokens": 538774723.0, + "step": 14127 + }, + { + "epoch": 1.7972268159267268, + "ewc_loss": 0.04404182732105255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2020913093001582e-05, + "grad_norm": 28.954559326171875, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8767844438552856, + "num_tokens": 538815288.0, + "step": 14128 + }, + { + "epoch": 1.7973540262053174, + "ewc_loss": 0.04405999556183815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202999712608289e-05, + "grad_norm": 28.993858337402344, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.866578221321106, + "num_tokens": 538858239.0, + "step": 14129 + }, + { + "epoch": 
1.797481236483908, + "ewc_loss": 0.04405442625284195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202721225330606e-05, + "grad_norm": 28.93045997619629, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8706181645393372, + "num_tokens": 538896947.0, + "step": 14130 + }, + { + "epoch": 1.7976084467624984, + "ewc_loss": 0.0440000556409359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2000027456670068e-05, + "grad_norm": 28.824098587036133, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8653604984283447, + "num_tokens": 538936511.0, + "step": 14131 + }, + { + "epoch": 1.797735657041089, + "ewc_loss": 0.044103849679231644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2051925043342635e-05, + "grad_norm": 28.943462371826172, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8609222173690796, + "num_tokens": 538969198.0, + "step": 14132 + }, + { + "epoch": 1.7978628673196795, + "ewc_loss": 0.04408598691225052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2042993805371225e-05, + "grad_norm": 28.93758773803711, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8779846429824829, + "num_tokens": 539002258.0, + "step": 14133 + }, + { + "epoch": 1.79799007759827, + "ewc_loss": 0.0441216304898262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2060814444557764e-05, + "grad_norm": 28.953033447265625, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8535830974578857, + "num_tokens": 539039993.0, + "step": 14134 + }, + { + "epoch": 1.7981172878768605, + "ewc_loss": 0.044063180685043335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2031590560800396e-05, + "grad_norm": 28.901643753051758, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8634265661239624, + "num_tokens": 539080713.0, + "step": 14135 + }, + { + "epoch": 1.798244498155451, + "ewc_loss": 0.04404402896761894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2022013581590727e-05, + "grad_norm": 28.82025718688965, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8778469562530518, + "num_tokens": 539121115.0, + "step": 14136 + }, + { + "epoch": 1.7983717084340416, + "ewc_loss": 0.04415952414274216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.20797628571745e-05, + "grad_norm": 28.89566993713379, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8862303495407104, + "num_tokens": 539159889.0, + "step": 14137 + }, + { + "epoch": 1.7984989187126321, + "ewc_loss": 0.04412992298603058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2064961740397848e-05, + "grad_norm": 28.982852935791016, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.86638343334198, + "num_tokens": 539201508.0, + "step": 14138 + }, + { + "epoch": 1.7986261289912227, + "ewc_loss": 0.04416412487626076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2082062059780583e-05, + "grad_norm": 28.97803497314453, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8492605686187744, + "num_tokens": 539238640.0, + "step": 14139 + }, + { + "epoch": 1.798753339269813, + "ewc_loss": 0.04416962340474129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2084812371758744e-05, + "grad_norm": 29.090055465698242, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8638775944709778, + "num_tokens": 539274797.0, + "step": 14140 + }, + { + "epoch": 1.7988805495484035, + "ewc_loss": 0.04407692328095436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.203846088377759e-05, + "grad_norm": 28.899145126342773, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8555051684379578, + "num_tokens": 539308410.0, + "step": 14141 + }, + { + "epoch": 1.799007759826994, + "ewc_loss": 0.04409767687320709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2048838218324818e-05, + "grad_norm": 29.013471603393555, + "learning_rate": 1e-06, + "loss": 0.4538, + 
"mean_token_accuracy": 0.8594595193862915, + "num_tokens": 539345578.0, + "step": 14142 + }, + { + "epoch": 1.7991349701055845, + "ewc_loss": 0.04414571449160576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207285797339864e-05, + "grad_norm": 28.934280395507812, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8518314361572266, + "num_tokens": 539378946.0, + "step": 14143 + }, + { + "epoch": 1.799262180384175, + "ewc_loss": 0.04401308298110962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2006541257724166e-05, + "grad_norm": 28.795045852661133, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8791592121124268, + "num_tokens": 539417663.0, + "step": 14144 + }, + { + "epoch": 1.7993893906627656, + "ewc_loss": 0.044134676456451416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2067339159548283e-05, + "grad_norm": 28.90024185180664, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8820682168006897, + "num_tokens": 539459597.0, + "step": 14145 + }, + { + "epoch": 1.799516600941356, + "ewc_loss": 0.04415121674537659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2075608285376802e-05, + "grad_norm": 28.938392639160156, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8706467747688293, + "num_tokens": 539499579.0, + "step": 14146 + }, + { + "epoch": 1.7996438112199464, + "ewc_loss": 0.04418071731925011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2090358470450155e-05, + "grad_norm": 29.03251838684082, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8760920763015747, + "num_tokens": 539538505.0, + "step": 14147 + }, + { + "epoch": 1.799771021498537, + "ewc_loss": 0.044185031205415726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.209251579188276e-05, + "grad_norm": 28.960908889770508, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8604934215545654, + "num_tokens": 539574824.0, + "step": 14148 + }, + { + "epoch": 
1.7998982317771275, + "ewc_loss": 0.04416439309716225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2082196664996445e-05, + "grad_norm": 28.99929428100586, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8543683886528015, + "num_tokens": 539614181.0, + "step": 14149 + }, + { + "epoch": 1.800025442055718, + "ewc_loss": 0.04412277042865753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2061385607230477e-05, + "grad_norm": 28.91471290588379, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8554139137268066, + "num_tokens": 539656713.0, + "step": 14150 + }, + { + "epoch": 1.8001526523343085, + "ewc_loss": 0.04411013796925545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2055068257031962e-05, + "grad_norm": 28.811697006225586, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8657795190811157, + "num_tokens": 539695436.0, + "step": 14151 + }, + { + "epoch": 1.800279862612899, + "ewc_loss": 0.04416850209236145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.208425030403305e-05, + "grad_norm": 29.07088279724121, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.868854284286499, + "num_tokens": 539736549.0, + "step": 14152 + }, + { + "epoch": 1.8004070728914896, + "ewc_loss": 0.04420635476708412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2103176888776943e-05, + "grad_norm": 28.841114044189453, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8724119067192078, + "num_tokens": 539776326.0, + "step": 14153 + }, + { + "epoch": 1.8005342831700801, + "ewc_loss": 0.04406937584280968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2034688299754634e-05, + "grad_norm": 28.96514320373535, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.874257504940033, + "num_tokens": 539819665.0, + "step": 14154 + }, + { + "epoch": 1.8006614934486707, + "ewc_loss": 0.04421263188123703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2106316464487463e-05, + "grad_norm": 28.911230087280273, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8656235337257385, + "num_tokens": 539858175.0, + "step": 14155 + }, + { + "epoch": 1.8007887037272612, + "ewc_loss": 0.04409715160727501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2048576283850707e-05, + "grad_norm": 28.943401336669922, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.865606963634491, + "num_tokens": 539895152.0, + "step": 14156 + }, + { + "epoch": 1.8009159140058517, + "ewc_loss": 0.04404932260513306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202466203016229e-05, + "grad_norm": 28.900949478149414, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8722110986709595, + "num_tokens": 539941033.0, + "step": 14157 + }, + { + "epoch": 1.8010431242844422, + "ewc_loss": 0.04406248405575752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2031241314834915e-05, + "grad_norm": 29.0168514251709, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8691434860229492, + "num_tokens": 539977959.0, + "step": 14158 + }, + { + "epoch": 1.8011703345630328, + "ewc_loss": 0.0440760999917984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2038049792172387e-05, + "grad_norm": 28.932645797729492, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8655874729156494, + "num_tokens": 540014857.0, + "step": 14159 + }, + { + "epoch": 1.8012975448416233, + "ewc_loss": 0.043998852372169495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.1999425371177495e-05, + "grad_norm": 28.943618774414062, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.86503666639328, + "num_tokens": 540055775.0, + "step": 14160 + }, + { + "epoch": 1.8014247551202138, + "ewc_loss": 0.04412306845188141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2061534764361568e-05, + "grad_norm": 28.9919490814209, + "learning_rate": 1e-06, + "loss": 0.3932, + 
"mean_token_accuracy": 0.8764800429344177, + "num_tokens": 540090287.0, + "step": 14161 + }, + { + "epoch": 1.8015519653988044, + "ewc_loss": 0.04400254786014557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2001273464411497e-05, + "grad_norm": 28.93974494934082, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8757381439208984, + "num_tokens": 540129139.0, + "step": 14162 + }, + { + "epoch": 1.8016791756773949, + "ewc_loss": 0.04411889240145683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2059446564526297e-05, + "grad_norm": 29.01932144165039, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8661903738975525, + "num_tokens": 540165187.0, + "step": 14163 + }, + { + "epoch": 1.8018063859559852, + "ewc_loss": 0.04405678063631058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.202839095843956e-05, + "grad_norm": 28.96585464477539, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8795232772827148, + "num_tokens": 540201113.0, + "step": 14164 + }, + { + "epoch": 1.8019335962345757, + "ewc_loss": 0.04409494251012802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2047472157282755e-05, + "grad_norm": 29.001991271972656, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8689189553260803, + "num_tokens": 540237448.0, + "step": 14165 + }, + { + "epoch": 1.8020608065131662, + "ewc_loss": 0.04414695128798485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2073476429795846e-05, + "grad_norm": 29.032712936401367, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8616692423820496, + "num_tokens": 540279703.0, + "step": 14166 + }, + { + "epoch": 1.8021880167917568, + "ewc_loss": 0.044071637094020844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2035817892174236e-05, + "grad_norm": 28.885862350463867, + "learning_rate": 1e-06, + "loss": 0.4767, + "mean_token_accuracy": 0.8491051197052002, + "num_tokens": 540313071.0, + "step": 14167 + }, + { + "epoch": 
1.8023152270703473, + "ewc_loss": 0.0440855473279953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2042773707653396e-05, + "grad_norm": 28.988588333129883, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8718273639678955, + "num_tokens": 540353940.0, + "step": 14168 + }, + { + "epoch": 1.8024424373489378, + "ewc_loss": 0.04415583238005638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2077916582929902e-05, + "grad_norm": 28.86722183227539, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8645856380462646, + "num_tokens": 540391241.0, + "step": 14169 + }, + { + "epoch": 1.8025696476275284, + "ewc_loss": 0.04405488073825836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2027439626981504e-05, + "grad_norm": 29.10968017578125, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8785010576248169, + "num_tokens": 540420205.0, + "step": 14170 + }, + { + "epoch": 1.8026968579061187, + "ewc_loss": 0.044161807745695114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2080903363530524e-05, + "grad_norm": 28.850704193115234, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8712570667266846, + "num_tokens": 540452121.0, + "step": 14171 + }, + { + "epoch": 1.8028240681847092, + "ewc_loss": 0.044103432446718216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2051715859561227e-05, + "grad_norm": 29.042818069458008, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8677369356155396, + "num_tokens": 540487415.0, + "step": 14172 + }, + { + "epoch": 1.8029512784632997, + "ewc_loss": 0.04424353316426277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2121766960481182e-05, + "grad_norm": 28.998538970947266, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.857252836227417, + "num_tokens": 540522293.0, + "step": 14173 + }, + { + "epoch": 1.8030784887418903, + "ewc_loss": 0.04418087378144264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2090436686994508e-05, + "grad_norm": 29.03513526916504, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8785852789878845, + "num_tokens": 540560345.0, + "step": 14174 + }, + { + "epoch": 1.8032056990204808, + "ewc_loss": 0.04415243864059448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2076219465816393e-05, + "grad_norm": 29.003808975219727, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8759967088699341, + "num_tokens": 540598299.0, + "step": 14175 + }, + { + "epoch": 1.8033329092990713, + "ewc_loss": 0.04419484734535217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2097423425293528e-05, + "grad_norm": 29.088958740234375, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8792009353637695, + "num_tokens": 540634480.0, + "step": 14176 + }, + { + "epoch": 1.8034601195776618, + "ewc_loss": 0.044153932482004166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207696707046125e-05, + "grad_norm": 28.913185119628906, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8588980436325073, + "num_tokens": 540679473.0, + "step": 14177 + }, + { + "epoch": 1.8035873298562524, + "ewc_loss": 0.04414002224802971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207001125498209e-05, + "grad_norm": 28.940242767333984, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8617876768112183, + "num_tokens": 540722259.0, + "step": 14178 + }, + { + "epoch": 1.803714540134843, + "ewc_loss": 0.04419485852122307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2097428882261738e-05, + "grad_norm": 28.97677993774414, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8715987205505371, + "num_tokens": 540758058.0, + "step": 14179 + }, + { + "epoch": 1.8038417504134334, + "ewc_loss": 0.044228702783584595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2114350940682925e-05, + "grad_norm": 28.965612411499023, + "learning_rate": 1e-06, + "loss": 0.439, + 
"mean_token_accuracy": 0.8633317947387695, + "num_tokens": 540798535.0, + "step": 14180 + }, + { + "epoch": 1.803968960692024, + "ewc_loss": 0.04428217560052872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2141088265925646e-05, + "grad_norm": 29.059894561767578, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8676578998565674, + "num_tokens": 540839423.0, + "step": 14181 + }, + { + "epoch": 1.8040961709706145, + "ewc_loss": 0.0441889725625515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.20944857574068e-05, + "grad_norm": 29.053945541381836, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.873542308807373, + "num_tokens": 540887512.0, + "step": 14182 + }, + { + "epoch": 1.804223381249205, + "ewc_loss": 0.044180288910865784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2090143829700537e-05, + "grad_norm": 28.978225708007812, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8698041439056396, + "num_tokens": 540922292.0, + "step": 14183 + }, + { + "epoch": 1.8043505915277955, + "ewc_loss": 0.04409537836909294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2047688617021777e-05, + "grad_norm": 28.941600799560547, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8738638162612915, + "num_tokens": 540964105.0, + "step": 14184 + }, + { + "epoch": 1.804477801806386, + "ewc_loss": 0.04416849464178085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.208424666605424e-05, + "grad_norm": 28.958425521850586, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8657582402229309, + "num_tokens": 541001793.0, + "step": 14185 + }, + { + "epoch": 1.8046050120849766, + "ewc_loss": 0.044180817902088165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2090409402153455e-05, + "grad_norm": 29.092205047607422, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8749887347221375, + "num_tokens": 541041104.0, + "step": 14186 + }, + { + "epoch": 
1.8047322223635671, + "ewc_loss": 0.04420829191803932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2104146410129033e-05, + "grad_norm": 28.97869873046875, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8607260584831238, + "num_tokens": 541079643.0, + "step": 14187 + }, + { + "epoch": 1.8048594326421576, + "ewc_loss": 0.04410206526517868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2051031919545494e-05, + "grad_norm": 28.989713668823242, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8720892667770386, + "num_tokens": 541118554.0, + "step": 14188 + }, + { + "epoch": 1.804986642920748, + "ewc_loss": 0.04410795122385025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.205397504440043e-05, + "grad_norm": 29.09764289855957, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8561051487922668, + "num_tokens": 541156893.0, + "step": 14189 + }, + { + "epoch": 1.8051138531993385, + "ewc_loss": 0.04411453753709793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2057269234210253e-05, + "grad_norm": 28.9649715423584, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8830515146255493, + "num_tokens": 541197283.0, + "step": 14190 + }, + { + "epoch": 1.805241063477929, + "ewc_loss": 0.0440482534468174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2024127247277647e-05, + "grad_norm": 29.024272918701172, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8812453746795654, + "num_tokens": 541232343.0, + "step": 14191 + }, + { + "epoch": 1.8053682737565195, + "ewc_loss": 0.044082239270210266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2041120246285573e-05, + "grad_norm": 28.923622131347656, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8590763211250305, + "num_tokens": 541270274.0, + "step": 14192 + }, + { + "epoch": 1.80549548403511, + "ewc_loss": 0.044027019292116165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.201350980612915e-05, + "grad_norm": 28.943126678466797, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8624061346054077, + "num_tokens": 541306018.0, + "step": 14193 + }, + { + "epoch": 1.8056226943137006, + "ewc_loss": 0.04413669928908348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2068350517656654e-05, + "grad_norm": 28.989118576049805, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8680204153060913, + "num_tokens": 541344974.0, + "step": 14194 + }, + { + "epoch": 1.805749904592291, + "ewc_loss": 0.0440538115799427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2026906663086265e-05, + "grad_norm": 28.856609344482422, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8769400119781494, + "num_tokens": 541378590.0, + "step": 14195 + }, + { + "epoch": 1.8058771148708814, + "ewc_loss": 0.044129129499197006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2064565200707875e-05, + "grad_norm": 28.945140838623047, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.869978666305542, + "num_tokens": 541415945.0, + "step": 14196 + }, + { + "epoch": 1.806004325149472, + "ewc_loss": 0.04415293037891388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2076465029385872e-05, + "grad_norm": 28.978256225585938, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8648426532745361, + "num_tokens": 541459577.0, + "step": 14197 + }, + { + "epoch": 1.8061315354280625, + "ewc_loss": 0.04406222328543663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2031112166587263e-05, + "grad_norm": 28.91834831237793, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8690134286880493, + "num_tokens": 541501041.0, + "step": 14198 + }, + { + "epoch": 1.806258745706653, + "ewc_loss": 0.044100940227508545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.20504698518198e-05, + "grad_norm": 28.931562423706055, + "learning_rate": 1e-06, + "loss": 0.4549, + 
"mean_token_accuracy": 0.8602021336555481, + "num_tokens": 541538587.0, + "step": 14199 + }, + { + "epoch": 1.8063859559852435, + "ewc_loss": 0.04410301148891449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2051504856790416e-05, + "grad_norm": 28.824962615966797, + "learning_rate": 1e-06, + "loss": 0.4807, + "mean_token_accuracy": 0.8554852604866028, + "num_tokens": 541582225.0, + "step": 14200 + }, + { + "epoch": 1.806513166263834, + "ewc_loss": 0.04409916698932648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.204958400398027e-05, + "grad_norm": 28.976598739624023, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.877758264541626, + "num_tokens": 541616956.0, + "step": 14201 + }, + { + "epoch": 1.8066403765424246, + "ewc_loss": 0.04418786242604256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.209393096563872e-05, + "grad_norm": 28.91088104248047, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8559556007385254, + "num_tokens": 541650814.0, + "step": 14202 + }, + { + "epoch": 1.8067675868210151, + "ewc_loss": 0.04411940276622772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2059701223042794e-05, + "grad_norm": 28.928699493408203, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8772453665733337, + "num_tokens": 541692167.0, + "step": 14203 + }, + { + "epoch": 1.8068947970996057, + "ewc_loss": 0.044245049357414246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.212252547906246e-05, + "grad_norm": 28.96274757385254, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8587584495544434, + "num_tokens": 541732427.0, + "step": 14204 + }, + { + "epoch": 1.8070220073781962, + "ewc_loss": 0.04417092725634575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2085463569965214e-05, + "grad_norm": 29.047582626342773, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8677984476089478, + "num_tokens": 541773256.0, + "step": 14205 + }, + { + "epoch": 
1.8071492176567867, + "ewc_loss": 0.04418538138270378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.20926904148655e-05, + "grad_norm": 29.023866653442383, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8735636472702026, + "num_tokens": 541815008.0, + "step": 14206 + }, + { + "epoch": 1.8072764279353772, + "ewc_loss": 0.044084325432777405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2042162527213804e-05, + "grad_norm": 28.96436882019043, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8716382384300232, + "num_tokens": 541853462.0, + "step": 14207 + }, + { + "epoch": 1.8074036382139678, + "ewc_loss": 0.044088445603847504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2044223442208022e-05, + "grad_norm": 28.97024154663086, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8631280660629272, + "num_tokens": 541890487.0, + "step": 14208 + }, + { + "epoch": 1.8075308484925583, + "ewc_loss": 0.044158224016427994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207911165896803e-05, + "grad_norm": 28.949256896972656, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8716319799423218, + "num_tokens": 541929857.0, + "step": 14209 + }, + { + "epoch": 1.8076580587711488, + "ewc_loss": 0.04407792538404465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2038962924852967e-05, + "grad_norm": 28.972387313842773, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8615847826004028, + "num_tokens": 541969923.0, + "step": 14210 + }, + { + "epoch": 1.8077852690497394, + "ewc_loss": 0.04412340000271797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.206170029239729e-05, + "grad_norm": 29.011417388916016, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8764568567276001, + "num_tokens": 542008802.0, + "step": 14211 + }, + { + "epoch": 1.8079124793283299, + "ewc_loss": 0.04419320821762085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2096604880061932e-05, + "grad_norm": 28.933622360229492, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8811916708946228, + "num_tokens": 542049276.0, + "step": 14212 + }, + { + "epoch": 1.8080396896069202, + "ewc_loss": 0.044080063700675964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2040032490622252e-05, + "grad_norm": 29.047334671020508, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8619774580001831, + "num_tokens": 542086921.0, + "step": 14213 + }, + { + "epoch": 1.8081668998855107, + "ewc_loss": 0.04415520280599594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207760189776309e-05, + "grad_norm": 28.996061325073242, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8678337335586548, + "num_tokens": 542124291.0, + "step": 14214 + }, + { + "epoch": 1.8082941101641012, + "ewc_loss": 0.0440816730260849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.204083648393862e-05, + "grad_norm": 29.016809463500977, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8573895692825317, + "num_tokens": 542167600.0, + "step": 14215 + }, + { + "epoch": 1.8084213204426918, + "ewc_loss": 0.044186782091856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2093390725785866e-05, + "grad_norm": 29.062707901000977, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8622145652770996, + "num_tokens": 542201134.0, + "step": 14216 + }, + { + "epoch": 1.8085485307212823, + "ewc_loss": 0.04403901472687721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.201950701419264e-05, + "grad_norm": 29.088680267333984, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8547161221504211, + "num_tokens": 542242590.0, + "step": 14217 + }, + { + "epoch": 1.8086757409998728, + "ewc_loss": 0.04412026330828667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2060132323531434e-05, + "grad_norm": 28.968130111694336, + "learning_rate": 1e-06, + "loss": 0.4235, + 
"mean_token_accuracy": 0.8680465221405029, + "num_tokens": 542281945.0, + "step": 14218 + }, + { + "epoch": 1.8088029512784631, + "ewc_loss": 0.04407303407788277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2036516384105198e-05, + "grad_norm": 28.987285614013672, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8760634064674377, + "num_tokens": 542319389.0, + "step": 14219 + }, + { + "epoch": 1.8089301615570537, + "ewc_loss": 0.0440874882042408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2043743229005486e-05, + "grad_norm": 28.944887161254883, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8552039861679077, + "num_tokens": 542356912.0, + "step": 14220 + }, + { + "epoch": 1.8090573718356442, + "ewc_loss": 0.04413943365216255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2069716578698717e-05, + "grad_norm": 29.036846160888672, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.861362874507904, + "num_tokens": 542394054.0, + "step": 14221 + }, + { + "epoch": 1.8091845821142347, + "ewc_loss": 0.04410076513886452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2050382540328428e-05, + "grad_norm": 28.940658569335938, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8727313280105591, + "num_tokens": 542431063.0, + "step": 14222 + }, + { + "epoch": 1.8093117923928252, + "ewc_loss": 0.04416580870747566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2082904251874425e-05, + "grad_norm": 29.002649307250977, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.856386125087738, + "num_tokens": 542471398.0, + "step": 14223 + }, + { + "epoch": 1.8094390026714158, + "ewc_loss": 0.04409332945942879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.204666452598758e-05, + "grad_norm": 28.930065155029297, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8538406491279602, + "num_tokens": 542516288.0, + "step": 14224 + }, + { + "epoch": 
1.8095662129500063, + "ewc_loss": 0.044129591435194016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2064796212362126e-05, + "grad_norm": 28.884693145751953, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8663753271102905, + "num_tokens": 542552292.0, + "step": 14225 + }, + { + "epoch": 1.8096934232285968, + "ewc_loss": 0.04415690526366234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2078453184803948e-05, + "grad_norm": 28.99191665649414, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8697706460952759, + "num_tokens": 542592380.0, + "step": 14226 + }, + { + "epoch": 1.8098206335071874, + "ewc_loss": 0.044219668954610825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.210983438999392e-05, + "grad_norm": 29.050058364868164, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.859668493270874, + "num_tokens": 542632698.0, + "step": 14227 + }, + { + "epoch": 1.8099478437857779, + "ewc_loss": 0.04416755959391594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.208377918577753e-05, + "grad_norm": 28.939754486083984, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8563930988311768, + "num_tokens": 542668234.0, + "step": 14228 + }, + { + "epoch": 1.8100750540643684, + "ewc_loss": 0.04425680264830589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2128400814835913e-05, + "grad_norm": 29.16893196105957, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8665331602096558, + "num_tokens": 542706205.0, + "step": 14229 + }, + { + "epoch": 1.810202264342959, + "ewc_loss": 0.04420529678463936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.210264756286051e-05, + "grad_norm": 29.001728057861328, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8659008741378784, + "num_tokens": 542737068.0, + "step": 14230 + }, + { + "epoch": 1.8103294746215495, + "ewc_loss": 0.04409161955118179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2045809600967914e-05, + "grad_norm": 28.901203155517578, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8786114454269409, + "num_tokens": 542769776.0, + "step": 14231 + }, + { + "epoch": 1.81045668490014, + "ewc_loss": 0.04423723369836807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.211861647083424e-05, + "grad_norm": 29.034629821777344, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.880020260810852, + "num_tokens": 542802440.0, + "step": 14232 + }, + { + "epoch": 1.8105838951787305, + "ewc_loss": 0.044213730841875076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2106865799287334e-05, + "grad_norm": 29.11149024963379, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.880092978477478, + "num_tokens": 542841158.0, + "step": 14233 + }, + { + "epoch": 1.810711105457321, + "ewc_loss": 0.04427195340394974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2135976905701682e-05, + "grad_norm": 28.997835159301758, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.862432599067688, + "num_tokens": 542876637.0, + "step": 14234 + }, + { + "epoch": 1.8108383157359116, + "ewc_loss": 0.04411114379763603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2055572117096744e-05, + "grad_norm": 28.90803337097168, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8737986087799072, + "num_tokens": 542915087.0, + "step": 14235 + }, + { + "epoch": 1.8109655260145021, + "ewc_loss": 0.04428914189338684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2144571630633436e-05, + "grad_norm": 29.116565704345703, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8587157726287842, + "num_tokens": 542950635.0, + "step": 14236 + }, + { + "epoch": 1.8110927362930926, + "ewc_loss": 0.04428735375404358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214367668784689e-05, + "grad_norm": 28.918405532836914, + "learning_rate": 1e-06, + "loss": 0.4488, + 
"mean_token_accuracy": 0.8626346588134766, + "num_tokens": 542990078.0, + "step": 14237 + }, + { + "epoch": 1.811219946571683, + "ewc_loss": 0.044238582253456116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2119291315902956e-05, + "grad_norm": 29.065773010253906, + "learning_rate": 1e-06, + "loss": 0.5053, + "mean_token_accuracy": 0.841964066028595, + "num_tokens": 543030333.0, + "step": 14238 + }, + { + "epoch": 1.8113471568502735, + "ewc_loss": 0.044282216578722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2141108274809085e-05, + "grad_norm": 29.069580078125, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8597132563591003, + "num_tokens": 543070537.0, + "step": 14239 + }, + { + "epoch": 1.811474367128864, + "ewc_loss": 0.04423119127750397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.211559512943495e-05, + "grad_norm": 29.030981063842773, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8650567531585693, + "num_tokens": 543109137.0, + "step": 14240 + }, + { + "epoch": 1.8116015774074545, + "ewc_loss": 0.04424053058028221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2120264475233853e-05, + "grad_norm": 28.966712951660156, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8603811860084534, + "num_tokens": 543146616.0, + "step": 14241 + }, + { + "epoch": 1.811728787686045, + "ewc_loss": 0.04422643035650253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2113215891295113e-05, + "grad_norm": 28.999088287353516, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8805354833602905, + "num_tokens": 543178433.0, + "step": 14242 + }, + { + "epoch": 1.8118559979646356, + "ewc_loss": 0.044281166046857834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214058258687146e-05, + "grad_norm": 29.09264373779297, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8793739080429077, + "num_tokens": 543215086.0, + "step": 14243 + }, + { + "epoch": 
1.811983208243226, + "ewc_loss": 0.044279806315898895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2139902284834534e-05, + "grad_norm": 29.049474716186523, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8526626825332642, + "num_tokens": 543252111.0, + "step": 14244 + }, + { + "epoch": 1.8121104185218164, + "ewc_loss": 0.04425625503063202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.212812796642538e-05, + "grad_norm": 29.149372100830078, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8752496242523193, + "num_tokens": 543291938.0, + "step": 14245 + }, + { + "epoch": 1.812237628800407, + "ewc_loss": 0.04421808570623398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.210904312960338e-05, + "grad_norm": 29.041805267333984, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8711368441581726, + "num_tokens": 543335141.0, + "step": 14246 + }, + { + "epoch": 1.8123648390789975, + "ewc_loss": 0.04415620490908623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2078102119849063e-05, + "grad_norm": 29.083932876586914, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8671815395355225, + "num_tokens": 543371305.0, + "step": 14247 + }, + { + "epoch": 1.812492049357588, + "ewc_loss": 0.04419584944844246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2097925466368906e-05, + "grad_norm": 29.035823822021484, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8780336976051331, + "num_tokens": 543407431.0, + "step": 14248 + }, + { + "epoch": 1.8126192596361785, + "ewc_loss": 0.044148292392492294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207414581789635e-05, + "grad_norm": 29.023441314697266, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8669856190681458, + "num_tokens": 543450287.0, + "step": 14249 + }, + { + "epoch": 1.812746469914769, + "ewc_loss": 0.04417094588279724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.208547266491223e-05, + "grad_norm": 28.981069564819336, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8797492980957031, + "num_tokens": 543486521.0, + "step": 14250 + }, + { + "epoch": 1.8128736801933596, + "ewc_loss": 0.044118732213974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.205936652899254e-05, + "grad_norm": 29.031497955322266, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8666681051254272, + "num_tokens": 543522157.0, + "step": 14251 + }, + { + "epoch": 1.8130008904719501, + "ewc_loss": 0.04420812055468559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2104060917627066e-05, + "grad_norm": 29.03501319885254, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8705485463142395, + "num_tokens": 543560503.0, + "step": 14252 + }, + { + "epoch": 1.8131281007505406, + "ewc_loss": 0.04423069208860397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2115345927886665e-05, + "grad_norm": 29.084186553955078, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8557144403457642, + "num_tokens": 543598943.0, + "step": 14253 + }, + { + "epoch": 1.8132553110291312, + "ewc_loss": 0.04418875649571419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2094378437031992e-05, + "grad_norm": 29.020177841186523, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.869568407535553, + "num_tokens": 543640466.0, + "step": 14254 + }, + { + "epoch": 1.8133825213077217, + "ewc_loss": 0.044207070022821426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.210353522968944e-05, + "grad_norm": 29.0783634185791, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8692563772201538, + "num_tokens": 543677581.0, + "step": 14255 + }, + { + "epoch": 1.8135097315863122, + "ewc_loss": 0.04414578154683113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2072890715207905e-05, + "grad_norm": 29.09990692138672, + "learning_rate": 1e-06, + "loss": 0.4269, + 
"mean_token_accuracy": 0.8602374792098999, + "num_tokens": 543711220.0, + "step": 14256 + }, + { + "epoch": 1.8136369418649028, + "ewc_loss": 0.044216156005859375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2108077246230096e-05, + "grad_norm": 28.973108291625977, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8620699048042297, + "num_tokens": 543750109.0, + "step": 14257 + }, + { + "epoch": 1.8137641521434933, + "ewc_loss": 0.04406941309571266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.203470648964867e-05, + "grad_norm": 28.983631134033203, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8725212812423706, + "num_tokens": 543788040.0, + "step": 14258 + }, + { + "epoch": 1.8138913624220838, + "ewc_loss": 0.044285695999860764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2142847228678875e-05, + "grad_norm": 29.002609252929688, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8663439750671387, + "num_tokens": 543821037.0, + "step": 14259 + }, + { + "epoch": 1.8140185727006743, + "ewc_loss": 0.04415891692042351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207945908594411e-05, + "grad_norm": 28.788877487182617, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8801863193511963, + "num_tokens": 543856331.0, + "step": 14260 + }, + { + "epoch": 1.8141457829792649, + "ewc_loss": 0.04429014399647713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214507185271941e-05, + "grad_norm": 29.16183090209961, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8726972937583923, + "num_tokens": 543892898.0, + "step": 14261 + }, + { + "epoch": 1.8142729932578552, + "ewc_loss": 0.04434534162282944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2172671378939413e-05, + "grad_norm": 29.017980575561523, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8632798194885254, + "num_tokens": 543932858.0, + "step": 14262 + }, + { + "epoch": 
1.8144002035364457, + "ewc_loss": 0.044193118810653687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2096559405326843e-05, + "grad_norm": 29.020057678222656, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8762985467910767, + "num_tokens": 543968145.0, + "step": 14263 + }, + { + "epoch": 1.8145274138150362, + "ewc_loss": 0.04428979754447937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2144899048726074e-05, + "grad_norm": 29.01244354248047, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8673233985900879, + "num_tokens": 544000495.0, + "step": 14264 + }, + { + "epoch": 1.8146546240936268, + "ewc_loss": 0.0442059263586998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2102962248027325e-05, + "grad_norm": 28.805978775024414, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8627641201019287, + "num_tokens": 544041345.0, + "step": 14265 + }, + { + "epoch": 1.8147818343722173, + "ewc_loss": 0.04431085288524628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2155427359393798e-05, + "grad_norm": 28.987106323242188, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8718686103820801, + "num_tokens": 544080084.0, + "step": 14266 + }, + { + "epoch": 1.8149090446508078, + "ewc_loss": 0.04432298243045807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.216149187006522e-05, + "grad_norm": 29.02272605895996, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8592382669448853, + "num_tokens": 544115828.0, + "step": 14267 + }, + { + "epoch": 1.8150362549293981, + "ewc_loss": 0.044315535575151443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.215776839875616e-05, + "grad_norm": 29.063152313232422, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8634227514266968, + "num_tokens": 544153213.0, + "step": 14268 + }, + { + "epoch": 1.8151634652079887, + "ewc_loss": 0.044290460646152496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.214523010479752e-05, + "grad_norm": 29.024377822875977, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8707983493804932, + "num_tokens": 544192383.0, + "step": 14269 + }, + { + "epoch": 1.8152906754865792, + "ewc_loss": 0.04424296319484711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2121481379144825e-05, + "grad_norm": 28.953475952148438, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8698822855949402, + "num_tokens": 544232034.0, + "step": 14270 + }, + { + "epoch": 1.8154178857651697, + "ewc_loss": 0.04434031993150711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2170159354573116e-05, + "grad_norm": 28.994504928588867, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8519815802574158, + "num_tokens": 544266242.0, + "step": 14271 + }, + { + "epoch": 1.8155450960437602, + "ewc_loss": 0.04433801770210266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2169009753270075e-05, + "grad_norm": 28.99051856994629, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8571357727050781, + "num_tokens": 544304995.0, + "step": 14272 + }, + { + "epoch": 1.8156723063223508, + "ewc_loss": 0.04439151659607887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2195757992449217e-05, + "grad_norm": 29.034318923950195, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.866348147392273, + "num_tokens": 544341950.0, + "step": 14273 + }, + { + "epoch": 1.8157995166009413, + "ewc_loss": 0.04438338428735733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.219169255113229e-05, + "grad_norm": 29.119159698486328, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8633304834365845, + "num_tokens": 544379535.0, + "step": 14274 + }, + { + "epoch": 1.8159267268795318, + "ewc_loss": 0.04430641606450081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2153208192321472e-05, + "grad_norm": 29.126211166381836, + "learning_rate": 1e-06, + "loss": 0.4255, + 
"mean_token_accuracy": 0.8664512634277344, + "num_tokens": 544416939.0, + "step": 14275 + }, + { + "epoch": 1.8160539371581224, + "ewc_loss": 0.04424425587058067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2122127120383084e-05, + "grad_norm": 28.983699798583984, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8786528706550598, + "num_tokens": 544461476.0, + "step": 14276 + }, + { + "epoch": 1.8161811474367129, + "ewc_loss": 0.0442609041929245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2130452634883113e-05, + "grad_norm": 29.03571891784668, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8673830032348633, + "num_tokens": 544505051.0, + "step": 14277 + }, + { + "epoch": 1.8163083577153034, + "ewc_loss": 0.04434117302298546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.217058681708295e-05, + "grad_norm": 29.166357040405273, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8669394254684448, + "num_tokens": 544550408.0, + "step": 14278 + }, + { + "epoch": 1.816435567993894, + "ewc_loss": 0.044234950095415115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2117474145488814e-05, + "grad_norm": 28.961050033569336, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.867281436920166, + "num_tokens": 544586645.0, + "step": 14279 + }, + { + "epoch": 1.8165627782724845, + "ewc_loss": 0.044307056814432144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2153528334456496e-05, + "grad_norm": 29.215618133544922, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8601303100585938, + "num_tokens": 544628087.0, + "step": 14280 + }, + { + "epoch": 1.816689988551075, + "ewc_loss": 0.044304102659225464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2152051315060817e-05, + "grad_norm": 28.997831344604492, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8815990090370178, + "num_tokens": 544667961.0, + "step": 14281 + }, + { + "epoch": 
1.8168171988296655, + "ewc_loss": 0.04414740949869156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2073705622460693e-05, + "grad_norm": 29.09506607055664, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8651631474494934, + "num_tokens": 544704919.0, + "step": 14282 + }, + { + "epoch": 1.816944409108256, + "ewc_loss": 0.04422253370285034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2111267753643915e-05, + "grad_norm": 28.989627838134766, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8721159100532532, + "num_tokens": 544741421.0, + "step": 14283 + }, + { + "epoch": 1.8170716193868466, + "ewc_loss": 0.044176552444696426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2088275727583095e-05, + "grad_norm": 29.056610107421875, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8711827397346497, + "num_tokens": 544776414.0, + "step": 14284 + }, + { + "epoch": 1.817198829665437, + "ewc_loss": 0.04429452493786812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214726191596128e-05, + "grad_norm": 29.077375411987305, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8600407838821411, + "num_tokens": 544814964.0, + "step": 14285 + }, + { + "epoch": 1.8173260399440276, + "ewc_loss": 0.04421574994921684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2107875338406302e-05, + "grad_norm": 29.13077163696289, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8532969355583191, + "num_tokens": 544853381.0, + "step": 14286 + }, + { + "epoch": 1.817453250222618, + "ewc_loss": 0.04418700933456421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.209350532211829e-05, + "grad_norm": 28.932098388671875, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.86965012550354, + "num_tokens": 544892474.0, + "step": 14287 + }, + { + "epoch": 1.8175804605012085, + "ewc_loss": 0.04411642998456955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2058215108700097e-05, + "grad_norm": 29.1754207611084, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.87486732006073, + "num_tokens": 544927667.0, + "step": 14288 + }, + { + "epoch": 1.817707670779799, + "ewc_loss": 0.04427134245634079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2135671315481886e-05, + "grad_norm": 29.008766174316406, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8734451532363892, + "num_tokens": 544959780.0, + "step": 14289 + }, + { + "epoch": 1.8178348810583895, + "ewc_loss": 0.04418138414621353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2090691345511004e-05, + "grad_norm": 29.07451820373535, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.859679102897644, + "num_tokens": 545001832.0, + "step": 14290 + }, + { + "epoch": 1.81796209133698, + "ewc_loss": 0.04415617138147354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207808574894443e-05, + "grad_norm": 28.9218692779541, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8757211565971375, + "num_tokens": 545038489.0, + "step": 14291 + }, + { + "epoch": 1.8180893016155706, + "ewc_loss": 0.04414522647857666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2072612409829162e-05, + "grad_norm": 28.91875648498535, + "learning_rate": 1e-06, + "loss": 0.5156, + "mean_token_accuracy": 0.8465882539749146, + "num_tokens": 545069086.0, + "step": 14292 + }, + { + "epoch": 1.818216511894161, + "ewc_loss": 0.044278334826231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.21391674131155e-05, + "grad_norm": 29.065349578857422, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8701155185699463, + "num_tokens": 545104564.0, + "step": 14293 + }, + { + "epoch": 1.8183437221727514, + "ewc_loss": 0.04424772784113884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.212386425526347e-05, + "grad_norm": 28.91510772705078, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 
0.8721438050270081, + "num_tokens": 545149036.0, + "step": 14294 + }, + { + "epoch": 1.818470932451342, + "ewc_loss": 0.04425181448459625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2125906980363652e-05, + "grad_norm": 29.013132095336914, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8786790370941162, + "num_tokens": 545183825.0, + "step": 14295 + }, + { + "epoch": 1.8185981427299325, + "ewc_loss": 0.04431447386741638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2157237253850326e-05, + "grad_norm": 28.952585220336914, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8705558776855469, + "num_tokens": 545221274.0, + "step": 14296 + }, + { + "epoch": 1.818725353008523, + "ewc_loss": 0.04436107724905014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.218053850810975e-05, + "grad_norm": 29.048158645629883, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8748987317085266, + "num_tokens": 545259242.0, + "step": 14297 + }, + { + "epoch": 1.8188525632871135, + "ewc_loss": 0.04433651268482208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2168256691657007e-05, + "grad_norm": 29.001039505004883, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8538971543312073, + "num_tokens": 545289380.0, + "step": 14298 + }, + { + "epoch": 1.818979773565704, + "ewc_loss": 0.04430532082915306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2152660676511005e-05, + "grad_norm": 29.06075096130371, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.871374249458313, + "num_tokens": 545321935.0, + "step": 14299 + }, + { + "epoch": 1.8191069838442946, + "ewc_loss": 0.044426579028367996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.221328941232059e-05, + "grad_norm": 29.081256866455078, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8751736879348755, + "num_tokens": 545365714.0, + "step": 14300 + }, + { + "epoch": 1.8192341941228851, + 
"ewc_loss": 0.04424388334155083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2121941583463922e-05, + "grad_norm": 28.947195053100586, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8569110035896301, + "num_tokens": 545406838.0, + "step": 14301 + }, + { + "epoch": 1.8193614044014756, + "ewc_loss": 0.04428545758128166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2142728994367644e-05, + "grad_norm": 29.04667854309082, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8652616739273071, + "num_tokens": 545443265.0, + "step": 14302 + }, + { + "epoch": 1.8194886146800662, + "ewc_loss": 0.04432246461510658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2161231754580513e-05, + "grad_norm": 28.905248641967773, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8775817155838013, + "num_tokens": 545479903.0, + "step": 14303 + }, + { + "epoch": 1.8196158249586567, + "ewc_loss": 0.04440600425004959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220300302724354e-05, + "grad_norm": 29.197410583496094, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8750402927398682, + "num_tokens": 545513767.0, + "step": 14304 + }, + { + "epoch": 1.8197430352372472, + "ewc_loss": 0.044381871819496155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2190935851540416e-05, + "grad_norm": 29.034576416015625, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8527976274490356, + "num_tokens": 545550541.0, + "step": 14305 + }, + { + "epoch": 1.8198702455158378, + "ewc_loss": 0.04429659619927406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.21482987399213e-05, + "grad_norm": 28.976119995117188, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8687251210212708, + "num_tokens": 545590739.0, + "step": 14306 + }, + { + "epoch": 1.8199974557944283, + "ewc_loss": 0.044352635741233826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2176318452693522e-05, + 
"grad_norm": 29.12083625793457, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8536810874938965, + "num_tokens": 545627127.0, + "step": 14307 + }, + { + "epoch": 1.8201246660730188, + "ewc_loss": 0.044432707130908966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2216354409465566e-05, + "grad_norm": 29.038230895996094, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8720827698707581, + "num_tokens": 545665473.0, + "step": 14308 + }, + { + "epoch": 1.8202518763516093, + "ewc_loss": 0.04429880529642105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2149402866489254e-05, + "grad_norm": 29.07659912109375, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8830641508102417, + "num_tokens": 545708018.0, + "step": 14309 + }, + { + "epoch": 1.8203790866301999, + "ewc_loss": 0.04430319368839264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.215159656770993e-05, + "grad_norm": 29.049842834472656, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.872438371181488, + "num_tokens": 545745635.0, + "step": 14310 + }, + { + "epoch": 1.8205062969087902, + "ewc_loss": 0.044344354420900345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2172176613821648e-05, + "grad_norm": 29.021812438964844, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8565037250518799, + "num_tokens": 545785260.0, + "step": 14311 + }, + { + "epoch": 1.8206335071873807, + "ewc_loss": 0.04434361681342125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2171809177962132e-05, + "grad_norm": 29.064414978027344, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8648094534873962, + "num_tokens": 545821696.0, + "step": 14312 + }, + { + "epoch": 1.8207607174659712, + "ewc_loss": 0.044279903173446655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.213995139754843e-05, + "grad_norm": 28.91872787475586, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 
0.8813347816467285, + "num_tokens": 545857971.0, + "step": 14313 + }, + { + "epoch": 1.8208879277445618, + "ewc_loss": 0.04439631104469299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.219815542048309e-05, + "grad_norm": 29.07471466064453, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8569315671920776, + "num_tokens": 545891000.0, + "step": 14314 + }, + { + "epoch": 1.8210151380231523, + "ewc_loss": 0.04442257434129715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2211286704987288e-05, + "grad_norm": 29.039888381958008, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8539249897003174, + "num_tokens": 545932825.0, + "step": 14315 + }, + { + "epoch": 1.8211423483017428, + "ewc_loss": 0.04432499408721924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.216249777120538e-05, + "grad_norm": 29.073598861694336, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8568740487098694, + "num_tokens": 545972849.0, + "step": 14316 + }, + { + "epoch": 1.8212695585803331, + "ewc_loss": 0.04429570585489273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2147853087517433e-05, + "grad_norm": 29.067819595336914, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8800619840621948, + "num_tokens": 546011492.0, + "step": 14317 + }, + { + "epoch": 1.8213967688589237, + "ewc_loss": 0.044311221688985825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2155611077323556e-05, + "grad_norm": 29.023263931274414, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8625441193580627, + "num_tokens": 546048461.0, + "step": 14318 + }, + { + "epoch": 1.8215239791375142, + "ewc_loss": 0.04432836174964905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.216418033640366e-05, + "grad_norm": 29.039148330688477, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8759876489639282, + "num_tokens": 546087113.0, + "step": 14319 + }, + { + "epoch": 1.8216511894161047, + 
"ewc_loss": 0.04437435790896416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2187179638422094e-05, + "grad_norm": 29.111286163330078, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8698992133140564, + "num_tokens": 546124413.0, + "step": 14320 + }, + { + "epoch": 1.8217783996946952, + "ewc_loss": 0.044322263449430466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2161131710163318e-05, + "grad_norm": 29.079408645629883, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8790686130523682, + "num_tokens": 546165763.0, + "step": 14321 + }, + { + "epoch": 1.8219056099732858, + "ewc_loss": 0.04431264102458954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.215632048319094e-05, + "grad_norm": 29.01927375793457, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8682736754417419, + "num_tokens": 546203218.0, + "step": 14322 + }, + { + "epoch": 1.8220328202518763, + "ewc_loss": 0.04430085048079491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2150425138534047e-05, + "grad_norm": 29.095129013061523, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.857563853263855, + "num_tokens": 546247610.0, + "step": 14323 + }, + { + "epoch": 1.8221600305304668, + "ewc_loss": 0.04438179358839989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2190897652762942e-05, + "grad_norm": 29.174041748046875, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.860836386680603, + "num_tokens": 546288952.0, + "step": 14324 + }, + { + "epoch": 1.8222872408090574, + "ewc_loss": 0.04431866481900215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.215933272964321e-05, + "grad_norm": 29.02741241455078, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8790132999420166, + "num_tokens": 546322449.0, + "step": 14325 + }, + { + "epoch": 1.8224144510876479, + "ewc_loss": 0.04425190016627312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2125950636109337e-05, + 
"grad_norm": 29.170223236083984, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8734511137008667, + "num_tokens": 546362692.0, + "step": 14326 + }, + { + "epoch": 1.8225416613662384, + "ewc_loss": 0.044295575469732285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2147787603898905e-05, + "grad_norm": 29.059188842773438, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.851149320602417, + "num_tokens": 546398856.0, + "step": 14327 + }, + { + "epoch": 1.822668871644829, + "ewc_loss": 0.04417813941836357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2089070625952445e-05, + "grad_norm": 29.0838623046875, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8598965406417847, + "num_tokens": 546439633.0, + "step": 14328 + }, + { + "epoch": 1.8227960819234195, + "ewc_loss": 0.044266026467084885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2133013771963306e-05, + "grad_norm": 29.172046661376953, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8720849752426147, + "num_tokens": 546477282.0, + "step": 14329 + }, + { + "epoch": 1.82292329220201, + "ewc_loss": 0.04422041401267052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2110207282821648e-05, + "grad_norm": 29.109756469726562, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8792698383331299, + "num_tokens": 546514584.0, + "step": 14330 + }, + { + "epoch": 1.8230505024806005, + "ewc_loss": 0.04425770044326782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.212885010521859e-05, + "grad_norm": 29.049854278564453, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8716707229614258, + "num_tokens": 546550761.0, + "step": 14331 + }, + { + "epoch": 1.823177712759191, + "ewc_loss": 0.044192470610141754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2096235625213012e-05, + "grad_norm": 29.112674713134766, + "learning_rate": 1e-06, + "loss": 0.4694, + "mean_token_accuracy": 
0.8536362051963806, + "num_tokens": 546594157.0, + "step": 14332 + }, + { + "epoch": 1.8233049230377816, + "ewc_loss": 0.04414699971675873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2073500076658092e-05, + "grad_norm": 29.00593376159668, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8760911226272583, + "num_tokens": 546634671.0, + "step": 14333 + }, + { + "epoch": 1.823432133316372, + "ewc_loss": 0.04418836534023285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.209418198617641e-05, + "grad_norm": 29.122699737548828, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8683127164840698, + "num_tokens": 546672084.0, + "step": 14334 + }, + { + "epoch": 1.8235593435949626, + "ewc_loss": 0.04419781640172005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.209890772064682e-05, + "grad_norm": 29.095415115356445, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8612611293792725, + "num_tokens": 546701920.0, + "step": 14335 + }, + { + "epoch": 1.823686553873553, + "ewc_loss": 0.044181372970342636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2090685888542794e-05, + "grad_norm": 29.08568572998047, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8729047775268555, + "num_tokens": 546735776.0, + "step": 14336 + }, + { + "epoch": 1.8238137641521435, + "ewc_loss": 0.044159263372421265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2079631889937446e-05, + "grad_norm": 29.008909225463867, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8733398914337158, + "num_tokens": 546772991.0, + "step": 14337 + }, + { + "epoch": 1.823940974430734, + "ewc_loss": 0.044170066714286804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2085034288465977e-05, + "grad_norm": 28.994853973388672, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8614019155502319, + "num_tokens": 546805310.0, + "step": 14338 + }, + { + "epoch": 1.8240681847093245, + 
"ewc_loss": 0.0443587563931942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2179377992870286e-05, + "grad_norm": 29.29103660583496, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8740328550338745, + "num_tokens": 546838240.0, + "step": 14339 + }, + { + "epoch": 1.824195394987915, + "ewc_loss": 0.04425736889243126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2128684577182867e-05, + "grad_norm": 28.950185775756836, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8632136583328247, + "num_tokens": 546878484.0, + "step": 14340 + }, + { + "epoch": 1.8243226052665056, + "ewc_loss": 0.044152259826660156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207613033533562e-05, + "grad_norm": 29.06828498840332, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8828104734420776, + "num_tokens": 546919747.0, + "step": 14341 + }, + { + "epoch": 1.8244498155450959, + "ewc_loss": 0.04432809725403786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2164049369166605e-05, + "grad_norm": 29.09113121032715, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8674309253692627, + "num_tokens": 546962614.0, + "step": 14342 + }, + { + "epoch": 1.8245770258236864, + "ewc_loss": 0.04429391399025917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2146956325741485e-05, + "grad_norm": 29.1352481842041, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8606958389282227, + "num_tokens": 547000059.0, + "step": 14343 + }, + { + "epoch": 1.824704236102277, + "ewc_loss": 0.04429112374782562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2145561160868965e-05, + "grad_norm": 29.11722755432129, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8700704574584961, + "num_tokens": 547037372.0, + "step": 14344 + }, + { + "epoch": 1.8248314463808675, + "ewc_loss": 0.044158726930618286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.207936267950572e-05, + "grad_norm": 
28.98171615600586, + "learning_rate": 1e-06, + "loss": 0.4884, + "mean_token_accuracy": 0.853641152381897, + "num_tokens": 547071840.0, + "step": 14345 + }, + { + "epoch": 1.824958656659458, + "ewc_loss": 0.044271279126405716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2135640392662026e-05, + "grad_norm": 29.10523223876953, + "learning_rate": 1e-06, + "loss": 0.5113, + "mean_token_accuracy": 0.8431130051612854, + "num_tokens": 547108916.0, + "step": 14346 + }, + { + "epoch": 1.8250858669380485, + "ewc_loss": 0.04424497112631798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2122485461295582e-05, + "grad_norm": 29.039159774780273, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8867447376251221, + "num_tokens": 547146991.0, + "step": 14347 + }, + { + "epoch": 1.825213077216639, + "ewc_loss": 0.044291190803050995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2145595721667632e-05, + "grad_norm": 29.030473709106445, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.879367470741272, + "num_tokens": 547184093.0, + "step": 14348 + }, + { + "epoch": 1.8253402874952296, + "ewc_loss": 0.044280268251895905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2140133296488784e-05, + "grad_norm": 29.059127807617188, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8743276000022888, + "num_tokens": 547217570.0, + "step": 14349 + }, + { + "epoch": 1.8254674977738201, + "ewc_loss": 0.04429684579372406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2148422431200743e-05, + "grad_norm": 29.076847076416016, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.866824746131897, + "num_tokens": 547259498.0, + "step": 14350 + }, + { + "epoch": 1.8255947080524106, + "ewc_loss": 0.044273391366004944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2136695406516083e-05, + "grad_norm": 29.041156768798828, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8656260371208191, + 
"num_tokens": 547298419.0, + "step": 14351 + }, + { + "epoch": 1.8257219183310012, + "ewc_loss": 0.04432356357574463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2161781089380383e-05, + "grad_norm": 29.12886619567871, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8754101991653442, + "num_tokens": 547342984.0, + "step": 14352 + }, + { + "epoch": 1.8258491286095917, + "ewc_loss": 0.04425674304366112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2128371711005457e-05, + "grad_norm": 29.043073654174805, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8697036504745483, + "num_tokens": 547383162.0, + "step": 14353 + }, + { + "epoch": 1.8259763388881822, + "ewc_loss": 0.044246233999729156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2123116650618613e-05, + "grad_norm": 29.06426429748535, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8887861967086792, + "num_tokens": 547416967.0, + "step": 14354 + }, + { + "epoch": 1.8261035491667728, + "ewc_loss": 0.04431149363517761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2155747501528822e-05, + "grad_norm": 29.063817977905273, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8702982664108276, + "num_tokens": 547454695.0, + "step": 14355 + }, + { + "epoch": 1.8262307594453633, + "ewc_loss": 0.04429621621966362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214810774603393e-05, + "grad_norm": 29.05571174621582, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8787562251091003, + "num_tokens": 547494258.0, + "step": 14356 + }, + { + "epoch": 1.8263579697239538, + "ewc_loss": 0.04428078606724739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214039341197349e-05, + "grad_norm": 29.009578704833984, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.863301694393158, + "num_tokens": 547531404.0, + "step": 14357 + }, + { + "epoch": 1.8264851800025443, + "ewc_loss": 
0.04422697797417641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2113488739705645e-05, + "grad_norm": 28.97109031677246, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8723257780075073, + "num_tokens": 547566553.0, + "step": 14358 + }, + { + "epoch": 1.8266123902811349, + "ewc_loss": 0.04431365430355072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2156827981234528e-05, + "grad_norm": 29.03853988647461, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8604947328567505, + "num_tokens": 547605505.0, + "step": 14359 + }, + { + "epoch": 1.8267396005597252, + "ewc_loss": 0.04441476985812187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220738497271668e-05, + "grad_norm": 29.160442352294922, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8762416839599609, + "num_tokens": 547643146.0, + "step": 14360 + }, + { + "epoch": 1.8268668108383157, + "ewc_loss": 0.04433774948120117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2168875148054212e-05, + "grad_norm": 29.141098022460938, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8658098578453064, + "num_tokens": 547683373.0, + "step": 14361 + }, + { + "epoch": 1.8269940211169062, + "ewc_loss": 0.04432142525911331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2160713342600502e-05, + "grad_norm": 29.0772762298584, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8620362281799316, + "num_tokens": 547716762.0, + "step": 14362 + }, + { + "epoch": 1.8271212313954968, + "ewc_loss": 0.04428346827626228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2141734007163905e-05, + "grad_norm": 29.06760597229004, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8755086660385132, + "num_tokens": 547756907.0, + "step": 14363 + }, + { + "epoch": 1.8272484416740873, + "ewc_loss": 0.04432573914527893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2162868845043704e-05, + "grad_norm": 
29.058319091796875, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8640221953392029, + "num_tokens": 547793545.0, + "step": 14364 + }, + { + "epoch": 1.8273756519526778, + "ewc_loss": 0.044345226138830185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.21726131712785e-05, + "grad_norm": 29.079090118408203, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8595781922340393, + "num_tokens": 547836845.0, + "step": 14365 + }, + { + "epoch": 1.8275028622312681, + "ewc_loss": 0.04430447518825531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.215223685197998e-05, + "grad_norm": 29.07349967956543, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.873816728591919, + "num_tokens": 547875849.0, + "step": 14366 + }, + { + "epoch": 1.8276300725098586, + "ewc_loss": 0.044377490878105164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2188745788298547e-05, + "grad_norm": 29.151124954223633, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8618785738945007, + "num_tokens": 547918672.0, + "step": 14367 + }, + { + "epoch": 1.8277572827884492, + "ewc_loss": 0.044269878417253494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.213494008174166e-05, + "grad_norm": 29.01066780090332, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8549450635910034, + "num_tokens": 547958681.0, + "step": 14368 + }, + { + "epoch": 1.8278844930670397, + "ewc_loss": 0.04433957114815712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.216978646174539e-05, + "grad_norm": 29.101871490478516, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.861457109451294, + "num_tokens": 547996940.0, + "step": 14369 + }, + { + "epoch": 1.8280117033456302, + "ewc_loss": 0.044324733316898346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2162366803968325e-05, + "grad_norm": 29.152475357055664, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8763535022735596, + 
"num_tokens": 548030188.0, + "step": 14370 + }, + { + "epoch": 1.8281389136242208, + "ewc_loss": 0.044367678463459015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2183839973877184e-05, + "grad_norm": 29.013599395751953, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8830878138542175, + "num_tokens": 548066891.0, + "step": 14371 + }, + { + "epoch": 1.8282661239028113, + "ewc_loss": 0.044319212436676025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2159605578053743e-05, + "grad_norm": 29.18647003173828, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8741666078567505, + "num_tokens": 548104664.0, + "step": 14372 + }, + { + "epoch": 1.8283933341814018, + "ewc_loss": 0.04441625252366066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220812712039333e-05, + "grad_norm": 29.134592056274414, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8645340204238892, + "num_tokens": 548143938.0, + "step": 14373 + }, + { + "epoch": 1.8285205444599923, + "ewc_loss": 0.044286031275987625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2143016394693404e-05, + "grad_norm": 29.13400650024414, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8664705157279968, + "num_tokens": 548183213.0, + "step": 14374 + }, + { + "epoch": 1.8286477547385829, + "ewc_loss": 0.04432094842195511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2160475054988638e-05, + "grad_norm": 29.102785110473633, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8521065711975098, + "num_tokens": 548221304.0, + "step": 14375 + }, + { + "epoch": 1.8287749650171734, + "ewc_loss": 0.04432406648993492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2162033928907476e-05, + "grad_norm": 29.123106002807617, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8658478260040283, + "num_tokens": 548261123.0, + "step": 14376 + }, + { + "epoch": 1.828902175295764, + "ewc_loss": 
0.044342562556266785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2171281671035103e-05, + "grad_norm": 28.994495391845703, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.84843909740448, + "num_tokens": 548305040.0, + "step": 14377 + }, + { + "epoch": 1.8290293855743545, + "ewc_loss": 0.044368766248226166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2184383851708844e-05, + "grad_norm": 29.1527099609375, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8754712343215942, + "num_tokens": 548338538.0, + "step": 14378 + }, + { + "epoch": 1.829156595852945, + "ewc_loss": 0.04453301429748535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2266507585300133e-05, + "grad_norm": 29.132787704467773, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8701088428497314, + "num_tokens": 548378510.0, + "step": 14379 + }, + { + "epoch": 1.8292838061315355, + "ewc_loss": 0.0443425290286541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.217126530013047e-05, + "grad_norm": 28.991552352905273, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8614178895950317, + "num_tokens": 548415790.0, + "step": 14380 + }, + { + "epoch": 1.829411016410126, + "ewc_loss": 0.04440730810165405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220365422545001e-05, + "grad_norm": 29.089778900146484, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.85811847448349, + "num_tokens": 548447294.0, + "step": 14381 + }, + { + "epoch": 1.8295382266887166, + "ewc_loss": 0.04444209113717079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.222104558313731e-05, + "grad_norm": 28.956621170043945, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8735027313232422, + "num_tokens": 548484617.0, + "step": 14382 + }, + { + "epoch": 1.829665436967307, + "ewc_loss": 0.04441410303115845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2207052097655833e-05, + "grad_norm": 
29.099979400634766, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8635671138763428, + "num_tokens": 548527552.0, + "step": 14383 + }, + { + "epoch": 1.8297926472458976, + "ewc_loss": 0.04443547502160072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.221773684141226e-05, + "grad_norm": 29.065126419067383, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8859896659851074, + "num_tokens": 548561322.0, + "step": 14384 + }, + { + "epoch": 1.829919857524488, + "ewc_loss": 0.04444935545325279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2224678104976192e-05, + "grad_norm": 29.088171005249023, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8779222965240479, + "num_tokens": 548594810.0, + "step": 14385 + }, + { + "epoch": 1.8300470678030785, + "ewc_loss": 0.04435976967215538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2179885490913875e-05, + "grad_norm": 29.05757713317871, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8616397380828857, + "num_tokens": 548631094.0, + "step": 14386 + }, + { + "epoch": 1.830174278081669, + "ewc_loss": 0.044463757425546646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.223187948402483e-05, + "grad_norm": 29.1868839263916, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8688399195671082, + "num_tokens": 548669162.0, + "step": 14387 + }, + { + "epoch": 1.8303014883602595, + "ewc_loss": 0.0444064624607563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2203230400918983e-05, + "grad_norm": 29.02834129333496, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8671765327453613, + "num_tokens": 548704555.0, + "step": 14388 + }, + { + "epoch": 1.83042869863885, + "ewc_loss": 0.044420961290597916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2210480892681517e-05, + "grad_norm": 29.099529266357422, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8811867237091064, + 
"num_tokens": 548741852.0, + "step": 14389 + }, + { + "epoch": 1.8305559089174406, + "ewc_loss": 0.04441303387284279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220651731477119e-05, + "grad_norm": 29.037498474121094, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8557915687561035, + "num_tokens": 548788455.0, + "step": 14390 + }, + { + "epoch": 1.8306831191960309, + "ewc_loss": 0.04441250115633011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220624992332887e-05, + "grad_norm": 29.09450340270996, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8750761151313782, + "num_tokens": 548826347.0, + "step": 14391 + }, + { + "epoch": 1.8308103294746214, + "ewc_loss": 0.04439758509397507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.219879206677433e-05, + "grad_norm": 29.02112579345703, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8621845245361328, + "num_tokens": 548865428.0, + "step": 14392 + }, + { + "epoch": 1.830937539753212, + "ewc_loss": 0.04434902220964432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2174510377226397e-05, + "grad_norm": 29.12941551208496, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8552951812744141, + "num_tokens": 548901671.0, + "step": 14393 + }, + { + "epoch": 1.8310647500318025, + "ewc_loss": 0.04438900575041771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.219450288976077e-05, + "grad_norm": 29.026241302490234, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8634599447250366, + "num_tokens": 548939234.0, + "step": 14394 + }, + { + "epoch": 1.831191960310393, + "ewc_loss": 0.04436803609132767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.218401823483873e-05, + "grad_norm": 29.01610565185547, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8726841807365417, + "num_tokens": 548975334.0, + "step": 14395 + }, + { + "epoch": 1.8313191705889835, + "ewc_loss": 0.04441343992948532, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2206719222594984e-05, + "grad_norm": 29.03323745727539, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8609366416931152, + "num_tokens": 549016372.0, + "step": 14396 + }, + { + "epoch": 1.831446380867574, + "ewc_loss": 0.04442000761628151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2210004317457788e-05, + "grad_norm": 29.102432250976562, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8734495639801025, + "num_tokens": 549058049.0, + "step": 14397 + }, + { + "epoch": 1.8315735911461646, + "ewc_loss": 0.04436882212758064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2184411136549897e-05, + "grad_norm": 29.062942504882812, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8557804822921753, + "num_tokens": 549097855.0, + "step": 14398 + }, + { + "epoch": 1.831700801424755, + "ewc_loss": 0.04437728598713875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2188642105902545e-05, + "grad_norm": 29.03390884399414, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8873168230056763, + "num_tokens": 549130906.0, + "step": 14399 + }, + { + "epoch": 1.8318280117033456, + "ewc_loss": 0.04439636319875717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.219818088633474e-05, + "grad_norm": 29.129528045654297, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8486385345458984, + "num_tokens": 549176574.0, + "step": 14400 + }, + { + "epoch": 1.8319552219819362, + "ewc_loss": 0.04442077875137329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.221038994321134e-05, + "grad_norm": 29.07544708251953, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8656845092773438, + "num_tokens": 549213945.0, + "step": 14401 + }, + { + "epoch": 1.8320824322605267, + "ewc_loss": 0.04438343271613121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2191716197994538e-05, + "grad_norm": 29.16933250427246, + 
"learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8588950634002686, + "num_tokens": 549255843.0, + "step": 14402 + }, + { + "epoch": 1.8322096425391172, + "ewc_loss": 0.04445984214544296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.222992043243721e-05, + "grad_norm": 29.11252784729004, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.87098628282547, + "num_tokens": 549292573.0, + "step": 14403 + }, + { + "epoch": 1.8323368528177078, + "ewc_loss": 0.044332340359687805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.216617031081114e-05, + "grad_norm": 29.125242233276367, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8798452615737915, + "num_tokens": 549335291.0, + "step": 14404 + }, + { + "epoch": 1.8324640630962983, + "ewc_loss": 0.04436691105365753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2183456167113036e-05, + "grad_norm": 29.105937957763672, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8544188737869263, + "num_tokens": 549381337.0, + "step": 14405 + }, + { + "epoch": 1.8325912733748888, + "ewc_loss": 0.04432111233472824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2160556909511797e-05, + "grad_norm": 29.10379409790039, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8793380260467529, + "num_tokens": 549418180.0, + "step": 14406 + }, + { + "epoch": 1.8327184836534793, + "ewc_loss": 0.044289615005254745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2144808099255897e-05, + "grad_norm": 29.112051010131836, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8757438659667969, + "num_tokens": 549461623.0, + "step": 14407 + }, + { + "epoch": 1.8328456939320699, + "ewc_loss": 0.044270679354667664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.213534025941044e-05, + "grad_norm": 29.027957916259766, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8713645339012146, + "num_tokens": 549500791.0, 
+ "step": 14408 + }, + { + "epoch": 1.8329729042106602, + "ewc_loss": 0.04428790882229805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2143954993225634e-05, + "grad_norm": 29.072134017944336, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8740710616111755, + "num_tokens": 549533343.0, + "step": 14409 + }, + { + "epoch": 1.8331001144892507, + "ewc_loss": 0.044281069189310074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2140535293146968e-05, + "grad_norm": 28.9892635345459, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8591517806053162, + "num_tokens": 549572707.0, + "step": 14410 + }, + { + "epoch": 1.8332273247678412, + "ewc_loss": 0.04432958737015724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2164793335832655e-05, + "grad_norm": 29.044147491455078, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8739535808563232, + "num_tokens": 549609772.0, + "step": 14411 + }, + { + "epoch": 1.8333545350464318, + "ewc_loss": 0.04436744749546051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2183723558555357e-05, + "grad_norm": 29.085901260375977, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8564468026161194, + "num_tokens": 549648626.0, + "step": 14412 + }, + { + "epoch": 1.8334817453250223, + "ewc_loss": 0.04431908205151558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.215954191342462e-05, + "grad_norm": 29.05636978149414, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8564926385879517, + "num_tokens": 549693211.0, + "step": 14413 + }, + { + "epoch": 1.8336089556036128, + "ewc_loss": 0.04440626502037048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220313217549119e-05, + "grad_norm": 29.092493057250977, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8628125786781311, + "num_tokens": 549729463.0, + "step": 14414 + }, + { + "epoch": 1.8337361658822031, + "ewc_loss": 0.044308122247457504, + "ewc_loss_diag": 
0.0, + "ewc_loss_parallel": 2.2154061298351735e-05, + "grad_norm": 29.016803741455078, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8718318343162537, + "num_tokens": 549769139.0, + "step": 14415 + }, + { + "epoch": 1.8338633761607936, + "ewc_loss": 0.044312767684459686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2156384147820063e-05, + "grad_norm": 29.02277183532715, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8713667988777161, + "num_tokens": 549807067.0, + "step": 14416 + }, + { + "epoch": 1.8339905864393842, + "ewc_loss": 0.044281553477048874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214077721873764e-05, + "grad_norm": 29.020343780517578, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8674184679985046, + "num_tokens": 549842689.0, + "step": 14417 + }, + { + "epoch": 1.8341177967179747, + "ewc_loss": 0.04431084543466568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2155421902425587e-05, + "grad_norm": 29.012910842895508, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8869639039039612, + "num_tokens": 549876108.0, + "step": 14418 + }, + { + "epoch": 1.8342450069965652, + "ewc_loss": 0.0443427711725235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2171385353431106e-05, + "grad_norm": 29.158390045166016, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8821190595626831, + "num_tokens": 549909656.0, + "step": 14419 + }, + { + "epoch": 1.8343722172751558, + "ewc_loss": 0.044352270662784576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2176134734763764e-05, + "grad_norm": 28.959548950195312, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8598119616508484, + "num_tokens": 549949156.0, + "step": 14420 + }, + { + "epoch": 1.8344994275537463, + "ewc_loss": 0.04437236115336418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2186181013239548e-05, + "grad_norm": 29.15839385986328, + "learning_rate": 
1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8749480247497559, + "num_tokens": 549990256.0, + "step": 14421 + }, + { + "epoch": 1.8346266378323368, + "ewc_loss": 0.04441551864147186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2207759684533812e-05, + "grad_norm": 29.142263412475586, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.876764178276062, + "num_tokens": 550023557.0, + "step": 14422 + }, + { + "epoch": 1.8347538481109273, + "ewc_loss": 0.0443483330309391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2174166588229127e-05, + "grad_norm": 29.121963500976562, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8636599183082581, + "num_tokens": 550069402.0, + "step": 14423 + }, + { + "epoch": 1.8348810583895179, + "ewc_loss": 0.044318877160549164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2159438231028616e-05, + "grad_norm": 29.006938934326172, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8750865459442139, + "num_tokens": 550110860.0, + "step": 14424 + }, + { + "epoch": 1.8350082686681084, + "ewc_loss": 0.04427115619182587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2135578547022305e-05, + "grad_norm": 29.170320510864258, + "learning_rate": 1e-06, + "loss": 0.4826, + "mean_token_accuracy": 0.8535900712013245, + "num_tokens": 550151736.0, + "step": 14425 + }, + { + "epoch": 1.835135478946699, + "ewc_loss": 0.04439911991357803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2199559680302627e-05, + "grad_norm": 28.907093048095703, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.870303213596344, + "num_tokens": 550187731.0, + "step": 14426 + }, + { + "epoch": 1.8352626892252895, + "ewc_loss": 0.044280730187892914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2140364308143035e-05, + "grad_norm": 29.105571746826172, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8698388934135437, + "num_tokens": 550220545.0, + "step": 
14427 + }, + { + "epoch": 1.83538989950388, + "ewc_loss": 0.04449281096458435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2246405933401547e-05, + "grad_norm": 29.04448127746582, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8713804483413696, + "num_tokens": 550254755.0, + "step": 14428 + }, + { + "epoch": 1.8355171097824705, + "ewc_loss": 0.044310715049505234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2155358237796463e-05, + "grad_norm": 29.231348037719727, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8749203681945801, + "num_tokens": 550295191.0, + "step": 14429 + }, + { + "epoch": 1.835644320061061, + "ewc_loss": 0.044462818652391434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2231410184758715e-05, + "grad_norm": 29.111438751220703, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8620305061340332, + "num_tokens": 550332984.0, + "step": 14430 + }, + { + "epoch": 1.8357715303396516, + "ewc_loss": 0.044267140328884125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.213357038272079e-05, + "grad_norm": 29.103225708007812, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8699840307235718, + "num_tokens": 550377470.0, + "step": 14431 + }, + { + "epoch": 1.835898740618242, + "ewc_loss": 0.044326115399599075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2163058019941673e-05, + "grad_norm": 29.08742332458496, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8724641799926758, + "num_tokens": 550416123.0, + "step": 14432 + }, + { + "epoch": 1.8360259508968326, + "ewc_loss": 0.0442904531955719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2145226466818713e-05, + "grad_norm": 29.23003578186035, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8727242946624756, + "num_tokens": 550458062.0, + "step": 14433 + }, + { + "epoch": 1.836153161175423, + "ewc_loss": 0.04434536024928093, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.217268047388643e-05, + "grad_norm": 29.14024543762207, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.862074613571167, + "num_tokens": 550496485.0, + "step": 14434 + }, + { + "epoch": 1.8362803714540135, + "ewc_loss": 0.04423452168703079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.21172613237286e-05, + "grad_norm": 29.15701675415039, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.870409369468689, + "num_tokens": 550536500.0, + "step": 14435 + }, + { + "epoch": 1.836407581732604, + "ewc_loss": 0.04431271553039551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2156358681968413e-05, + "grad_norm": 29.137306213378906, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8628644943237305, + "num_tokens": 550575611.0, + "step": 14436 + }, + { + "epoch": 1.8365347920111945, + "ewc_loss": 0.04428477585315704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214238884334918e-05, + "grad_norm": 29.062896728515625, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8595672845840454, + "num_tokens": 550616431.0, + "step": 14437 + }, + { + "epoch": 1.836662002289785, + "ewc_loss": 0.04432570934295654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2162854293128476e-05, + "grad_norm": 29.043981552124023, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8786419630050659, + "num_tokens": 550653626.0, + "step": 14438 + }, + { + "epoch": 1.8367892125683756, + "ewc_loss": 0.04430018737912178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.21500940824626e-05, + "grad_norm": 29.05821990966797, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8638273477554321, + "num_tokens": 550690899.0, + "step": 14439 + }, + { + "epoch": 1.8369164228469659, + "ewc_loss": 0.04428890720009804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2144453396322206e-05, + "grad_norm": 29.06792640686035, + "learning_rate": 1e-06, + "loss": 0.4183, 
+ "mean_token_accuracy": 0.8686692118644714, + "num_tokens": 550734408.0, + "step": 14440 + }, + { + "epoch": 1.8370436331255564, + "ewc_loss": 0.04428260400891304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2141302906675264e-05, + "grad_norm": 29.0025634765625, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8554267287254333, + "num_tokens": 550767888.0, + "step": 14441 + }, + { + "epoch": 1.837170843404147, + "ewc_loss": 0.044284652918577194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214232699770946e-05, + "grad_norm": 29.034955978393555, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8685624599456787, + "num_tokens": 550807192.0, + "step": 14442 + }, + { + "epoch": 1.8372980536827375, + "ewc_loss": 0.0443742573261261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2187128706718795e-05, + "grad_norm": 29.075946807861328, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8772767782211304, + "num_tokens": 550848006.0, + "step": 14443 + }, + { + "epoch": 1.837425263961328, + "ewc_loss": 0.044346388429403305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.217319342889823e-05, + "grad_norm": 29.073108673095703, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.868079662322998, + "num_tokens": 550888350.0, + "step": 14444 + }, + { + "epoch": 1.8375524742399185, + "ewc_loss": 0.044320523738861084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2160262233228423e-05, + "grad_norm": 29.09516716003418, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8741084933280945, + "num_tokens": 550925389.0, + "step": 14445 + }, + { + "epoch": 1.837679684518509, + "ewc_loss": 0.04437156021595001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2185780835570768e-05, + "grad_norm": 29.286495208740234, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8722174763679504, + "num_tokens": 550962610.0, + "step": 14446 + }, + { + "epoch": 
1.8378068947970996, + "ewc_loss": 0.044324733316898346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2162366803968325e-05, + "grad_norm": 29.127246856689453, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8654420971870422, + "num_tokens": 551001018.0, + "step": 14447 + }, + { + "epoch": 1.83793410507569, + "ewc_loss": 0.044288620352745056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2144309696159326e-05, + "grad_norm": 29.0833797454834, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8695268630981445, + "num_tokens": 551038505.0, + "step": 14448 + }, + { + "epoch": 1.8380613153542806, + "ewc_loss": 0.044385772198438644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2192885808181018e-05, + "grad_norm": 29.193571090698242, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8800232410430908, + "num_tokens": 551072235.0, + "step": 14449 + }, + { + "epoch": 1.8381885256328712, + "ewc_loss": 0.044281017035245895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2140508008305915e-05, + "grad_norm": 29.08915138244629, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.881657600402832, + "num_tokens": 551110129.0, + "step": 14450 + }, + { + "epoch": 1.8383157359114617, + "ewc_loss": 0.04432613402605057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.216306711488869e-05, + "grad_norm": 29.196956634521484, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8690649271011353, + "num_tokens": 551152380.0, + "step": 14451 + }, + { + "epoch": 1.8384429461900522, + "ewc_loss": 0.04423391819000244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.211695937148761e-05, + "grad_norm": 29.10561752319336, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8794450163841248, + "num_tokens": 551195942.0, + "step": 14452 + }, + { + "epoch": 1.8385701564686427, + "ewc_loss": 0.04423819109797478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2119094865047373e-05, + "grad_norm": 29.122114181518555, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8597012758255005, + "num_tokens": 551238502.0, + "step": 14453 + }, + { + "epoch": 1.8386973667472333, + "ewc_loss": 0.044257763773202896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.212888102803845e-05, + "grad_norm": 29.109708786010742, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8784599304199219, + "num_tokens": 551278091.0, + "step": 14454 + }, + { + "epoch": 1.8388245770258238, + "ewc_loss": 0.044226132333278656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2113066734164022e-05, + "grad_norm": 29.002174377441406, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8481853008270264, + "num_tokens": 551314940.0, + "step": 14455 + }, + { + "epoch": 1.8389517873044143, + "ewc_loss": 0.04432970657944679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2164853362482972e-05, + "grad_norm": 29.08963394165039, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8548846244812012, + "num_tokens": 551356427.0, + "step": 14456 + }, + { + "epoch": 1.8390789975830049, + "ewc_loss": 0.04432591423392296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2162957975524478e-05, + "grad_norm": 29.10053253173828, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8730124235153198, + "num_tokens": 551400436.0, + "step": 14457 + }, + { + "epoch": 1.8392062078615952, + "ewc_loss": 0.04428067430853844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.214033702330198e-05, + "grad_norm": 29.101163864135742, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8672425746917725, + "num_tokens": 551438335.0, + "step": 14458 + }, + { + "epoch": 1.8393334181401857, + "ewc_loss": 0.04423235356807709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2116177206044085e-05, + "grad_norm": 29.025100708007812, + "learning_rate": 1e-06, + "loss": 0.4181, + 
"mean_token_accuracy": 0.8709493279457092, + "num_tokens": 551471214.0, + "step": 14459 + }, + { + "epoch": 1.8394606284187762, + "ewc_loss": 0.04435238242149353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2176191123435274e-05, + "grad_norm": 29.103599548339844, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8557037115097046, + "num_tokens": 551504071.0, + "step": 14460 + }, + { + "epoch": 1.8395878386973668, + "ewc_loss": 0.044330477714538574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2165238988236524e-05, + "grad_norm": 29.1481876373291, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8633420467376709, + "num_tokens": 551538456.0, + "step": 14461 + }, + { + "epoch": 1.8397150489759573, + "ewc_loss": 0.0443919338285923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2195967176230624e-05, + "grad_norm": 29.071243286132812, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8728729486465454, + "num_tokens": 551575334.0, + "step": 14462 + }, + { + "epoch": 1.8398422592545478, + "ewc_loss": 0.044340651482343674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.217032488260884e-05, + "grad_norm": 29.060239791870117, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8772021532058716, + "num_tokens": 551617127.0, + "step": 14463 + }, + { + "epoch": 1.8399694695331381, + "ewc_loss": 0.04431290179491043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2156451450427994e-05, + "grad_norm": 29.009397506713867, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8725957870483398, + "num_tokens": 551655548.0, + "step": 14464 + }, + { + "epoch": 1.8400966798117286, + "ewc_loss": 0.044415075331926346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220753776782658e-05, + "grad_norm": 29.17403221130371, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8663412928581238, + "num_tokens": 551691807.0, + "step": 14465 + }, + { + "epoch": 
1.8402238900903192, + "ewc_loss": 0.04435214400291443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2176072889124043e-05, + "grad_norm": 28.977319717407227, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8768652081489563, + "num_tokens": 551729805.0, + "step": 14466 + }, + { + "epoch": 1.8403511003689097, + "ewc_loss": 0.04442695528268814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2213476768229157e-05, + "grad_norm": 29.16411781311035, + "learning_rate": 1e-06, + "loss": 0.5097, + "mean_token_accuracy": 0.8451002836227417, + "num_tokens": 551775220.0, + "step": 14467 + }, + { + "epoch": 1.8404783106475002, + "ewc_loss": 0.04443182796239853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2215914214029908e-05, + "grad_norm": 29.10527801513672, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8607295155525208, + "num_tokens": 551816242.0, + "step": 14468 + }, + { + "epoch": 1.8406055209260908, + "ewc_loss": 0.04437367245554924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2186835849424824e-05, + "grad_norm": 29.014467239379883, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8565107583999634, + "num_tokens": 551854929.0, + "step": 14469 + }, + { + "epoch": 1.8407327312046813, + "ewc_loss": 0.044392962008714676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2196480131242424e-05, + "grad_norm": 29.14676284790039, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8665212392807007, + "num_tokens": 551898877.0, + "step": 14470 + }, + { + "epoch": 1.8408599414832718, + "ewc_loss": 0.04447392746806145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.223696355940774e-05, + "grad_norm": 28.983293533325195, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8847220540046692, + "num_tokens": 551934252.0, + "step": 14471 + }, + { + "epoch": 1.8409871517618623, + "ewc_loss": 0.044377442449331284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.21887221414363e-05, + "grad_norm": 29.122394561767578, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8795872926712036, + "num_tokens": 551974779.0, + "step": 14472 + }, + { + "epoch": 1.8411143620404529, + "ewc_loss": 0.04444770887494087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2223854102776386e-05, + "grad_norm": 33.30781936645508, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8620889782905579, + "num_tokens": 552011627.0, + "step": 14473 + }, + { + "epoch": 1.8412415723190434, + "ewc_loss": 0.04653237760066986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3266189600690268e-05, + "grad_norm": 29.06069564819336, + "learning_rate": 1e-06, + "loss": 0.5011, + "mean_token_accuracy": 0.8472532033920288, + "num_tokens": 552045333.0, + "step": 14474 + }, + { + "epoch": 1.841368782597634, + "ewc_loss": 0.044576022773981094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2288011678028852e-05, + "grad_norm": 28.574295043945312, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8626497983932495, + "num_tokens": 552079974.0, + "step": 14475 + }, + { + "epoch": 1.8414959928762245, + "ewc_loss": 0.04682489484548569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3412447262671776e-05, + "grad_norm": 29.257373809814453, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8759312629699707, + "num_tokens": 552117507.0, + "step": 14476 + }, + { + "epoch": 1.841623203154815, + "ewc_loss": 0.046652901917696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.332645090064034e-05, + "grad_norm": 28.910099029541016, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8674836754798889, + "num_tokens": 552156009.0, + "step": 14477 + }, + { + "epoch": 1.8417504134334055, + "ewc_loss": 0.04712875187397003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.356437653361354e-05, + "grad_norm": 29.077760696411133, + "learning_rate": 1e-06, + "loss": 0.4021, + 
"mean_token_accuracy": 0.876545786857605, + "num_tokens": 552189092.0, + "step": 14478 + }, + { + "epoch": 1.841877623711996, + "ewc_loss": 0.047415852546691895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370792572037317e-05, + "grad_norm": 29.03094482421875, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.866026759147644, + "num_tokens": 552228305.0, + "step": 14479 + }, + { + "epoch": 1.8420048339905866, + "ewc_loss": 0.047755610197782516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3877804778749123e-05, + "grad_norm": 29.07141876220703, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8704603910446167, + "num_tokens": 552269402.0, + "step": 14480 + }, + { + "epoch": 1.842132044269177, + "ewc_loss": 0.04786260426044464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3931301257107407e-05, + "grad_norm": 29.071380615234375, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8657384514808655, + "num_tokens": 552304870.0, + "step": 14481 + }, + { + "epoch": 1.8422592545477676, + "ewc_loss": 0.04798096790909767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3990483896341175e-05, + "grad_norm": 29.24661636352539, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8635872602462769, + "num_tokens": 552336802.0, + "step": 14482 + }, + { + "epoch": 1.842386464826358, + "ewc_loss": 0.04796779155731201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3983895516721532e-05, + "grad_norm": 29.214683532714844, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.881314218044281, + "num_tokens": 552376124.0, + "step": 14483 + }, + { + "epoch": 1.8425136751049485, + "ewc_loss": 0.0478118397295475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3905919078970328e-05, + "grad_norm": 29.165979385375977, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8596258163452148, + "num_tokens": 552412285.0, + "step": 14484 + }, + { + "epoch": 
1.842640885383539, + "ewc_loss": 0.047746140509843826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3873069949331693e-05, + "grad_norm": 29.2308406829834, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8694455027580261, + "num_tokens": 552447527.0, + "step": 14485 + }, + { + "epoch": 1.8427680956621295, + "ewc_loss": 0.047634486109018326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3817243345547467e-05, + "grad_norm": 29.23328399658203, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8713972568511963, + "num_tokens": 552486569.0, + "step": 14486 + }, + { + "epoch": 1.84289530594072, + "ewc_loss": 0.04736858978867531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3684295229031704e-05, + "grad_norm": 29.302427291870117, + "learning_rate": 1e-06, + "loss": 0.566, + "mean_token_accuracy": 0.8329851031303406, + "num_tokens": 552529158.0, + "step": 14487 + }, + { + "epoch": 1.8430225162193106, + "ewc_loss": 0.047183603048324585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35918014368508e-05, + "grad_norm": 29.228748321533203, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8666523098945618, + "num_tokens": 552572183.0, + "step": 14488 + }, + { + "epoch": 1.8431497264979009, + "ewc_loss": 0.046965911984443665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3482956748921424e-05, + "grad_norm": 29.283002853393555, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8654643297195435, + "num_tokens": 552609785.0, + "step": 14489 + }, + { + "epoch": 1.8432769367764914, + "ewc_loss": 0.046778660267591476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3389329726342112e-05, + "grad_norm": 29.291343688964844, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8745476007461548, + "num_tokens": 552641243.0, + "step": 14490 + }, + { + "epoch": 1.843404147055082, + "ewc_loss": 0.04665277898311615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3326389055000618e-05, + "grad_norm": 29.386215209960938, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8597368001937866, + "num_tokens": 552677595.0, + "step": 14491 + }, + { + "epoch": 1.8435313573336725, + "ewc_loss": 0.04647471010684967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.323735498066526e-05, + "grad_norm": 29.24899673461914, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8691523671150208, + "num_tokens": 552715492.0, + "step": 14492 + }, + { + "epoch": 1.843658567612263, + "ewc_loss": 0.04617936164140701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308968032593839e-05, + "grad_norm": 29.342557907104492, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8674529194831848, + "num_tokens": 552757410.0, + "step": 14493 + }, + { + "epoch": 1.8437857778908535, + "ewc_loss": 0.04606470465660095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3032353055896237e-05, + "grad_norm": 29.147090911865234, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8833256363868713, + "num_tokens": 552793794.0, + "step": 14494 + }, + { + "epoch": 1.843912988169444, + "ewc_loss": 0.045762259513139725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.288112955284305e-05, + "grad_norm": 29.321414947509766, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8530542254447937, + "num_tokens": 552836010.0, + "step": 14495 + }, + { + "epoch": 1.8440401984480346, + "ewc_loss": 0.04586157202720642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2930786144570448e-05, + "grad_norm": 29.24960708618164, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8704584240913391, + "num_tokens": 552874136.0, + "step": 14496 + }, + { + "epoch": 1.844167408726625, + "ewc_loss": 0.04554518312215805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277259227412287e-05, + "grad_norm": 29.26782989501953, + "learning_rate": 1e-06, + "loss": 0.4354, + 
"mean_token_accuracy": 0.8634612560272217, + "num_tokens": 552912314.0, + "step": 14497 + }, + { + "epoch": 1.8442946190052156, + "ewc_loss": 0.04543280974030495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2716405510436743e-05, + "grad_norm": 29.18440818786621, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8562642335891724, + "num_tokens": 552953998.0, + "step": 14498 + }, + { + "epoch": 1.8444218292838062, + "ewc_loss": 0.04537997767329216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2689988327329047e-05, + "grad_norm": 29.31362533569336, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8868901133537292, + "num_tokens": 552987310.0, + "step": 14499 + }, + { + "epoch": 1.8445490395623967, + "ewc_loss": 0.04528123140335083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2640615497948602e-05, + "grad_norm": 29.13851547241211, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8648558855056763, + "num_tokens": 553019830.0, + "step": 14500 + }, + { + "epoch": 1.8446762498409872, + "ewc_loss": 0.04508203640580177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2541018552146852e-05, + "grad_norm": 29.30712127685547, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8668532967567444, + "num_tokens": 553066695.0, + "step": 14501 + }, + { + "epoch": 1.8448034601195777, + "ewc_loss": 0.045176081359386444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2588041247217916e-05, + "grad_norm": 29.176998138427734, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.858388364315033, + "num_tokens": 553098943.0, + "step": 14502 + }, + { + "epoch": 1.8449306703981683, + "ewc_loss": 0.04492709040641785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.246354597446043e-05, + "grad_norm": 29.18488121032715, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8724988698959351, + "num_tokens": 553136106.0, + "step": 14503 + }, + { + "epoch": 
1.8450578806767588, + "ewc_loss": 0.04500427097082138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2502135834656656e-05, + "grad_norm": 29.2615966796875, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8745543956756592, + "num_tokens": 553170014.0, + "step": 14504 + }, + { + "epoch": 1.8451850909553493, + "ewc_loss": 0.044825173914432526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2412586986320093e-05, + "grad_norm": 29.20457649230957, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8619992733001709, + "num_tokens": 553213297.0, + "step": 14505 + }, + { + "epoch": 1.8453123012339399, + "ewc_loss": 0.044861383736133575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2430691387853585e-05, + "grad_norm": 29.18924331665039, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8685545921325684, + "num_tokens": 553248559.0, + "step": 14506 + }, + { + "epoch": 1.8454395115125302, + "ewc_loss": 0.044748734682798386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.237436819996219e-05, + "grad_norm": 29.35435676574707, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.869888961315155, + "num_tokens": 553289140.0, + "step": 14507 + }, + { + "epoch": 1.8455667217911207, + "ewc_loss": 0.044703997671604156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2351998268277384e-05, + "grad_norm": 29.190052032470703, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8602132797241211, + "num_tokens": 553330619.0, + "step": 14508 + }, + { + "epoch": 1.8456939320697112, + "ewc_loss": 0.04468678683042526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.234339262940921e-05, + "grad_norm": 29.298582077026367, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8794281482696533, + "num_tokens": 553371295.0, + "step": 14509 + }, + { + "epoch": 1.8458211423483017, + "ewc_loss": 0.04461069777607918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2305348466034047e-05, + "grad_norm": 29.08325958251953, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8645217418670654, + "num_tokens": 553410237.0, + "step": 14510 + }, + { + "epoch": 1.8459483526268923, + "ewc_loss": 0.04458007961511612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2290039851213805e-05, + "grad_norm": 29.335220336914062, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8747872114181519, + "num_tokens": 553446727.0, + "step": 14511 + }, + { + "epoch": 1.8460755629054828, + "ewc_loss": 0.04457934945821762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2289674234343693e-05, + "grad_norm": 29.019535064697266, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8741548657417297, + "num_tokens": 553488626.0, + "step": 14512 + }, + { + "epoch": 1.846202773184073, + "ewc_loss": 0.04453573748469353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2267868189373985e-05, + "grad_norm": 29.35111427307129, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8598366379737854, + "num_tokens": 553528294.0, + "step": 14513 + }, + { + "epoch": 1.8463299834626636, + "ewc_loss": 0.04465815797448158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2329079001792707e-05, + "grad_norm": 29.277141571044922, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8717342615127563, + "num_tokens": 553563738.0, + "step": 14514 + }, + { + "epoch": 1.8464571937412542, + "ewc_loss": 0.04439296945929527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2196485588210635e-05, + "grad_norm": 29.153196334838867, + "learning_rate": 1e-06, + "loss": 0.5401, + "mean_token_accuracy": 0.8353284597396851, + "num_tokens": 553603351.0, + "step": 14515 + }, + { + "epoch": 1.8465844040198447, + "ewc_loss": 0.04455315321683884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2276577510638162e-05, + "grad_norm": 29.349157333374023, + "learning_rate": 1e-06, + "loss": 0.4278, + 
"mean_token_accuracy": 0.8668340444564819, + "num_tokens": 553643655.0, + "step": 14516 + }, + { + "epoch": 1.8467116142984352, + "ewc_loss": 0.04440357908606529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2201789761311375e-05, + "grad_norm": 29.099014282226562, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8797350525856018, + "num_tokens": 553677944.0, + "step": 14517 + }, + { + "epoch": 1.8468388245770258, + "ewc_loss": 0.04435393959283829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.217696965089999e-05, + "grad_norm": 29.13835334777832, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8639752864837646, + "num_tokens": 553712120.0, + "step": 14518 + }, + { + "epoch": 1.8469660348556163, + "ewc_loss": 0.044479258358478546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2239628378883936e-05, + "grad_norm": 29.073604583740234, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8716104626655579, + "num_tokens": 553751442.0, + "step": 14519 + }, + { + "epoch": 1.8470932451342068, + "ewc_loss": 0.04449617117643356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224808486062102e-05, + "grad_norm": 29.2100830078125, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8748100996017456, + "num_tokens": 553782924.0, + "step": 14520 + }, + { + "epoch": 1.8472204554127973, + "ewc_loss": 0.04446467012166977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2232334231375717e-05, + "grad_norm": 29.099506378173828, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8586965203285217, + "num_tokens": 553818861.0, + "step": 14521 + }, + { + "epoch": 1.8473476656913879, + "ewc_loss": 0.044437531381845474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2218766389414668e-05, + "grad_norm": 29.282474517822266, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8666799068450928, + "num_tokens": 553855593.0, + "step": 14522 + }, + { + "epoch": 
1.8474748759699784, + "ewc_loss": 0.04450318589806557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225159369118046e-05, + "grad_norm": 29.2804012298584, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8654711842536926, + "num_tokens": 553896028.0, + "step": 14523 + }, + { + "epoch": 1.847602086248569, + "ewc_loss": 0.044431477785110474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2215739591047168e-05, + "grad_norm": 29.152973175048828, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8720126748085022, + "num_tokens": 553935212.0, + "step": 14524 + }, + { + "epoch": 1.8477292965271594, + "ewc_loss": 0.044424526393413544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.221226350229699e-05, + "grad_norm": 29.210908889770508, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8667783141136169, + "num_tokens": 553971231.0, + "step": 14525 + }, + { + "epoch": 1.84785650680575, + "ewc_loss": 0.044395409524440765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.219770431111101e-05, + "grad_norm": 29.157617568969727, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8756119012832642, + "num_tokens": 554006782.0, + "step": 14526 + }, + { + "epoch": 1.8479837170843405, + "ewc_loss": 0.04441387206315994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2206935682334006e-05, + "grad_norm": 29.12809181213379, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.8607074618339539, + "num_tokens": 554043427.0, + "step": 14527 + }, + { + "epoch": 1.848110927362931, + "ewc_loss": 0.04448593407869339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2242966224439442e-05, + "grad_norm": 29.20827293395996, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8578503131866455, + "num_tokens": 554084187.0, + "step": 14528 + }, + { + "epoch": 1.8482381376415216, + "ewc_loss": 0.04436225816607475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.21811296796659e-05, + "grad_norm": 29.129430770874023, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8677607774734497, + "num_tokens": 554122186.0, + "step": 14529 + }, + { + "epoch": 1.848365347920112, + "ewc_loss": 0.04443568363785744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2217842342797667e-05, + "grad_norm": 29.161609649658203, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8674049973487854, + "num_tokens": 554161388.0, + "step": 14530 + }, + { + "epoch": 1.8484925581987026, + "ewc_loss": 0.044449854642152786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2224927306524478e-05, + "grad_norm": 29.08349609375, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8589718341827393, + "num_tokens": 554199837.0, + "step": 14531 + }, + { + "epoch": 1.848619768477293, + "ewc_loss": 0.04453751817345619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2268759494181722e-05, + "grad_norm": 29.30314064025879, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8589648008346558, + "num_tokens": 554239567.0, + "step": 14532 + }, + { + "epoch": 1.8487469787558835, + "ewc_loss": 0.044492628425359726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224631498393137e-05, + "grad_norm": 29.08260154724121, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8708289861679077, + "num_tokens": 554277040.0, + "step": 14533 + }, + { + "epoch": 1.848874189034474, + "ewc_loss": 0.044432688504457474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2216343495529145e-05, + "grad_norm": 29.178354263305664, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8713411092758179, + "num_tokens": 554318994.0, + "step": 14534 + }, + { + "epoch": 1.8490013993130645, + "ewc_loss": 0.04444289952516556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2221449398784898e-05, + "grad_norm": 29.07292938232422, + "learning_rate": 1e-06, + "loss": 0.4484, + 
"mean_token_accuracy": 0.8620398044586182, + "num_tokens": 554355334.0, + "step": 14535 + }, + { + "epoch": 1.849128609591655, + "ewc_loss": 0.044518548995256424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2259275283431634e-05, + "grad_norm": 29.194318771362305, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8571601510047913, + "num_tokens": 554396504.0, + "step": 14536 + }, + { + "epoch": 1.8492558198702456, + "ewc_loss": 0.04438825696706772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.219412817794364e-05, + "grad_norm": 29.11284637451172, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.874109148979187, + "num_tokens": 554429772.0, + "step": 14537 + }, + { + "epoch": 1.8493830301488359, + "ewc_loss": 0.04457375407218933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228687662864104e-05, + "grad_norm": 29.221895217895508, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8700494170188904, + "num_tokens": 554470102.0, + "step": 14538 + }, + { + "epoch": 1.8495102404274264, + "ewc_loss": 0.04447663202881813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.223831688752398e-05, + "grad_norm": 29.15155601501465, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8600244522094727, + "num_tokens": 554511632.0, + "step": 14539 + }, + { + "epoch": 1.849637450706017, + "ewc_loss": 0.04448220506310463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2241101760300808e-05, + "grad_norm": 29.22972297668457, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8642049431800842, + "num_tokens": 554553028.0, + "step": 14540 + }, + { + "epoch": 1.8497646609846075, + "ewc_loss": 0.04441205784678459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220602982561104e-05, + "grad_norm": 29.151826858520508, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8623571395874023, + "num_tokens": 554585109.0, + "step": 14541 + }, + { + "epoch": 
1.849891871263198, + "ewc_loss": 0.04442036896944046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.221018439740874e-05, + "grad_norm": 29.19418716430664, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8678328394889832, + "num_tokens": 554621141.0, + "step": 14542 + }, + { + "epoch": 1.8500190815417885, + "ewc_loss": 0.044466305524110794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2233152776607312e-05, + "grad_norm": 29.15188980102539, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8673052787780762, + "num_tokens": 554660838.0, + "step": 14543 + }, + { + "epoch": 1.850146291820379, + "ewc_loss": 0.044441476464271545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2220738173928112e-05, + "grad_norm": 29.1312198638916, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8696763515472412, + "num_tokens": 554696338.0, + "step": 14544 + }, + { + "epoch": 1.8502735020989696, + "ewc_loss": 0.04443790018558502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2218950107344426e-05, + "grad_norm": 29.250524520874023, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8798712491989136, + "num_tokens": 554729782.0, + "step": 14545 + }, + { + "epoch": 1.85040071237756, + "ewc_loss": 0.04445334151387215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2226669898373075e-05, + "grad_norm": 29.04966163635254, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8840154409408569, + "num_tokens": 554767566.0, + "step": 14546 + }, + { + "epoch": 1.8505279226561506, + "ewc_loss": 0.04439644142985344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2198220904101618e-05, + "grad_norm": 29.2681941986084, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8728893995285034, + "num_tokens": 554806063.0, + "step": 14547 + }, + { + "epoch": 1.8506551329347412, + "ewc_loss": 0.044480498880147934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2240248654270545e-05, + "grad_norm": 29.136587142944336, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8645353317260742, + "num_tokens": 554839418.0, + "step": 14548 + }, + { + "epoch": 1.8507823432133317, + "ewc_loss": 0.04436945170164108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.218472582171671e-05, + "grad_norm": 29.135051727294922, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.863652229309082, + "num_tokens": 554881888.0, + "step": 14549 + }, + { + "epoch": 1.8509095534919222, + "ewc_loss": 0.04450038820505142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2250194888329133e-05, + "grad_norm": 29.097454071044922, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8864421844482422, + "num_tokens": 554921561.0, + "step": 14550 + }, + { + "epoch": 1.8510367637705127, + "ewc_loss": 0.044342298060655594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2171148884808645e-05, + "grad_norm": 29.127992630004883, + "learning_rate": 1e-06, + "loss": 0.4955, + "mean_token_accuracy": 0.8450678586959839, + "num_tokens": 554954380.0, + "step": 14551 + }, + { + "epoch": 1.8511639740491033, + "ewc_loss": 0.04454896226525307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2274480215855874e-05, + "grad_norm": 29.253421783447266, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.868517279624939, + "num_tokens": 554994781.0, + "step": 14552 + }, + { + "epoch": 1.8512911843276938, + "ewc_loss": 0.04442833736538887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2214167984202504e-05, + "grad_norm": 29.176820755004883, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8761917352676392, + "num_tokens": 555024938.0, + "step": 14553 + }, + { + "epoch": 1.8514183946062843, + "ewc_loss": 0.04440867155790329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220433634647634e-05, + "grad_norm": 29.229137420654297, + "learning_rate": 1e-06, + "loss": 0.4648, + 
"mean_token_accuracy": 0.8563569784164429, + "num_tokens": 555059785.0, + "step": 14554 + }, + { + "epoch": 1.8515456048848749, + "ewc_loss": 0.04437818378210068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.218909139628522e-05, + "grad_norm": 29.068653106689453, + "learning_rate": 1e-06, + "loss": 0.5036, + "mean_token_accuracy": 0.8429223895072937, + "num_tokens": 555097502.0, + "step": 14555 + }, + { + "epoch": 1.8516728151634652, + "ewc_loss": 0.04444032907485962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2220165192265995e-05, + "grad_norm": 29.305011749267578, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8651684522628784, + "num_tokens": 555139976.0, + "step": 14556 + }, + { + "epoch": 1.8518000254420557, + "ewc_loss": 0.04444443807005882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2222218831302598e-05, + "grad_norm": 29.11366844177246, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8707407712936401, + "num_tokens": 555181250.0, + "step": 14557 + }, + { + "epoch": 1.8519272357206462, + "ewc_loss": 0.04433341696858406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.216670873167459e-05, + "grad_norm": 29.26190948486328, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.848659873008728, + "num_tokens": 555214927.0, + "step": 14558 + }, + { + "epoch": 1.8520544459992367, + "ewc_loss": 0.0444587841629982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2229392925510183e-05, + "grad_norm": 28.997703552246094, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8638452291488647, + "num_tokens": 555255795.0, + "step": 14559 + }, + { + "epoch": 1.8521816562778273, + "ewc_loss": 0.04432279244065285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2161395463626832e-05, + "grad_norm": 29.193437576293945, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8715375661849976, + "num_tokens": 555294177.0, + "step": 14560 + }, + { + "epoch": 
1.8523088665564178, + "ewc_loss": 0.044576287269592285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228814446425531e-05, + "grad_norm": 29.23697853088379, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8800775408744812, + "num_tokens": 555331548.0, + "step": 14561 + }, + { + "epoch": 1.852436076835008, + "ewc_loss": 0.04435931146144867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.217965629824903e-05, + "grad_norm": 29.164791107177734, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8804197311401367, + "num_tokens": 555368426.0, + "step": 14562 + }, + { + "epoch": 1.8525632871135986, + "ewc_loss": 0.04451034218072891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2255171643337235e-05, + "grad_norm": 29.10115623474121, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8842840790748596, + "num_tokens": 555411594.0, + "step": 14563 + }, + { + "epoch": 1.8526904973921892, + "ewc_loss": 0.04439603164792061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2198015358299017e-05, + "grad_norm": 29.224102020263672, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8595600128173828, + "num_tokens": 555450412.0, + "step": 14564 + }, + { + "epoch": 1.8528177076707797, + "ewc_loss": 0.04444536939263344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2222684492589906e-05, + "grad_norm": 29.210451126098633, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8768240809440613, + "num_tokens": 555489067.0, + "step": 14565 + }, + { + "epoch": 1.8529449179493702, + "ewc_loss": 0.04437583312392235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.218791632913053e-05, + "grad_norm": 29.141651153564453, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8731833696365356, + "num_tokens": 555524552.0, + "step": 14566 + }, + { + "epoch": 1.8530721282279607, + "ewc_loss": 0.04437956586480141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.218978261225857e-05, + "grad_norm": 29.17591094970703, + "learning_rate": 1e-06, + "loss": 0.4983, + "mean_token_accuracy": 0.8416735529899597, + "num_tokens": 555563766.0, + "step": 14567 + }, + { + "epoch": 1.8531993385065513, + "ewc_loss": 0.04441957548260689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2209787857718766e-05, + "grad_norm": 29.166494369506836, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8753467798233032, + "num_tokens": 555601410.0, + "step": 14568 + }, + { + "epoch": 1.8533265487851418, + "ewc_loss": 0.04439996927976608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2199985323823057e-05, + "grad_norm": 29.103673934936523, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8664657473564148, + "num_tokens": 555641346.0, + "step": 14569 + }, + { + "epoch": 1.8534537590637323, + "ewc_loss": 0.04441322013735771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220661008323077e-05, + "grad_norm": 29.193696975708008, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8783231973648071, + "num_tokens": 555676537.0, + "step": 14570 + }, + { + "epoch": 1.8535809693423229, + "ewc_loss": 0.04444076493382454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2220381652005017e-05, + "grad_norm": 29.11794662475586, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8531979918479919, + "num_tokens": 555721634.0, + "step": 14571 + }, + { + "epoch": 1.8537081796209134, + "ewc_loss": 0.044406816363334656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220340866188053e-05, + "grad_norm": 29.20476531982422, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8684051036834717, + "num_tokens": 555765447.0, + "step": 14572 + }, + { + "epoch": 1.853835389899504, + "ewc_loss": 0.044516097754240036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2258049284573644e-05, + "grad_norm": 29.200944900512695, + "learning_rate": 1e-06, + "loss": 0.4329, + 
"mean_token_accuracy": 0.8668863773345947, + "num_tokens": 555798598.0, + "step": 14573 + }, + { + "epoch": 1.8539626001780944, + "ewc_loss": 0.044288575649261475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2144287868286483e-05, + "grad_norm": 29.095218658447266, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8769739270210266, + "num_tokens": 555829465.0, + "step": 14574 + }, + { + "epoch": 1.854089810456685, + "ewc_loss": 0.044465042650699615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2232521587284282e-05, + "grad_norm": 29.174352645874023, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8680946826934814, + "num_tokens": 555866577.0, + "step": 14575 + }, + { + "epoch": 1.8542170207352755, + "ewc_loss": 0.044431138783693314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2215568606043234e-05, + "grad_norm": 29.135313034057617, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8707528114318848, + "num_tokens": 555900472.0, + "step": 14576 + }, + { + "epoch": 1.854344231013866, + "ewc_loss": 0.0443834513425827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2191725292941555e-05, + "grad_norm": 29.0667781829834, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8623006343841553, + "num_tokens": 555936356.0, + "step": 14577 + }, + { + "epoch": 1.8544714412924566, + "ewc_loss": 0.04440080747008324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2200403691385873e-05, + "grad_norm": 29.151090621948242, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8617013692855835, + "num_tokens": 555975338.0, + "step": 14578 + }, + { + "epoch": 1.854598651571047, + "ewc_loss": 0.04450930282473564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225465141236782e-05, + "grad_norm": 29.095617294311523, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.858478307723999, + "num_tokens": 556019651.0, + "step": 14579 + }, + { + "epoch": 
1.8547258618496376, + "ewc_loss": 0.04448038339614868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2240192265599035e-05, + "grad_norm": 29.2805118560791, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8628189563751221, + "num_tokens": 556054530.0, + "step": 14580 + }, + { + "epoch": 1.854853072128228, + "ewc_loss": 0.04454299807548523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2271498892223462e-05, + "grad_norm": 29.120405197143555, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8641260266304016, + "num_tokens": 556091985.0, + "step": 14581 + }, + { + "epoch": 1.8549802824068184, + "ewc_loss": 0.044431522488594055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.221576141892001e-05, + "grad_norm": 29.258821487426758, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8711646199226379, + "num_tokens": 556128777.0, + "step": 14582 + }, + { + "epoch": 1.855107492685409, + "ewc_loss": 0.044543828815221786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.227191362180747e-05, + "grad_norm": 29.147830963134766, + "learning_rate": 1e-06, + "loss": 0.3682, + "mean_token_accuracy": 0.8864610195159912, + "num_tokens": 556164286.0, + "step": 14583 + }, + { + "epoch": 1.8552347029639995, + "ewc_loss": 0.04441845789551735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2209229427971877e-05, + "grad_norm": 29.187255859375, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.876003086566925, + "num_tokens": 556199141.0, + "step": 14584 + }, + { + "epoch": 1.85536191324259, + "ewc_loss": 0.0444968082010746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2248405002756044e-05, + "grad_norm": 29.192583084106445, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8737717866897583, + "num_tokens": 556237104.0, + "step": 14585 + }, + { + "epoch": 1.8554891235211806, + "ewc_loss": 0.04439671337604523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2198357328306884e-05, + "grad_norm": 29.1237850189209, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8572818040847778, + "num_tokens": 556275343.0, + "step": 14586 + }, + { + "epoch": 1.8556163337997709, + "ewc_loss": 0.0445612370967865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228061930509284e-05, + "grad_norm": 29.2767391204834, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8834498524665833, + "num_tokens": 556316539.0, + "step": 14587 + }, + { + "epoch": 1.8557435440783614, + "ewc_loss": 0.044445306062698364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2222653569770046e-05, + "grad_norm": 29.153196334838867, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8807501196861267, + "num_tokens": 556351101.0, + "step": 14588 + }, + { + "epoch": 1.855870754356952, + "ewc_loss": 0.044375937432050705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2187969079823233e-05, + "grad_norm": 29.17169189453125, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8801624774932861, + "num_tokens": 556382458.0, + "step": 14589 + }, + { + "epoch": 1.8559979646355425, + "ewc_loss": 0.04450557753443718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225278876721859e-05, + "grad_norm": 29.29458999633789, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8704050779342651, + "num_tokens": 556416980.0, + "step": 14590 + }, + { + "epoch": 1.856125174914133, + "ewc_loss": 0.04436267167329788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.21813352254685e-05, + "grad_norm": 29.159469604492188, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8745089769363403, + "num_tokens": 556451868.0, + "step": 14591 + }, + { + "epoch": 1.8562523851927235, + "ewc_loss": 0.04441584274172783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2207921574590728e-05, + "grad_norm": 29.23027992248535, + "learning_rate": 1e-06, + "loss": 0.4611, + 
"mean_token_accuracy": 0.8561710715293884, + "num_tokens": 556485682.0, + "step": 14592 + }, + { + "epoch": 1.856379595471314, + "ewc_loss": 0.04444175958633423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.222088005510159e-05, + "grad_norm": 29.215862274169922, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8794139623641968, + "num_tokens": 556524857.0, + "step": 14593 + }, + { + "epoch": 1.8565068057499046, + "ewc_loss": 0.04435889050364494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2179445295478217e-05, + "grad_norm": 29.175148010253906, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8729885816574097, + "num_tokens": 556565763.0, + "step": 14594 + }, + { + "epoch": 1.856634016028495, + "ewc_loss": 0.04439976438879967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2199881641427055e-05, + "grad_norm": 29.137271881103516, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8567632436752319, + "num_tokens": 556606202.0, + "step": 14595 + }, + { + "epoch": 1.8567612263070856, + "ewc_loss": 0.04443526268005371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2217631340026855e-05, + "grad_norm": 29.286062240600586, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.876772940158844, + "num_tokens": 556646906.0, + "step": 14596 + }, + { + "epoch": 1.8568884365856761, + "ewc_loss": 0.044408466666936874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2204232664080337e-05, + "grad_norm": 29.084320068359375, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8702511191368103, + "num_tokens": 556684249.0, + "step": 14597 + }, + { + "epoch": 1.8570156468642667, + "ewc_loss": 0.04441535845398903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2207679649000056e-05, + "grad_norm": 29.201261520385742, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8579095602035522, + "num_tokens": 556726840.0, + "step": 14598 + }, + { + "epoch": 
1.8571428571428572, + "ewc_loss": 0.0444880872964859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224404306616634e-05, + "grad_norm": 29.166378021240234, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8710252642631531, + "num_tokens": 556764822.0, + "step": 14599 + }, + { + "epoch": 1.8572700674214477, + "ewc_loss": 0.04444321617484093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2221607650863007e-05, + "grad_norm": 29.224748611450195, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8659566640853882, + "num_tokens": 556800884.0, + "step": 14600 + }, + { + "epoch": 1.8573972777000383, + "ewc_loss": 0.04448525235056877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224262607342098e-05, + "grad_norm": 29.045469284057617, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8747758269309998, + "num_tokens": 556841233.0, + "step": 14601 + }, + { + "epoch": 1.8575244879786288, + "ewc_loss": 0.04439467191696167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2197336875251494e-05, + "grad_norm": 29.10710334777832, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8626577854156494, + "num_tokens": 556880995.0, + "step": 14602 + }, + { + "epoch": 1.8576516982572193, + "ewc_loss": 0.044568028301000595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2284013539319858e-05, + "grad_norm": 29.165626525878906, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8731977343559265, + "num_tokens": 556914367.0, + "step": 14603 + }, + { + "epoch": 1.8577789085358098, + "ewc_loss": 0.04441656917333603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2208285372471437e-05, + "grad_norm": 29.16908073425293, + "learning_rate": 1e-06, + "loss": 0.4964, + "mean_token_accuracy": 0.8495534062385559, + "num_tokens": 556955944.0, + "step": 14604 + }, + { + "epoch": 1.8579061188144002, + "ewc_loss": 0.044493742287158966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2246871594688855e-05, + "grad_norm": 29.15534210205078, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8516386151313782, + "num_tokens": 556989851.0, + "step": 14605 + }, + { + "epoch": 1.8580333290929907, + "ewc_loss": 0.04449475556612015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224737727374304e-05, + "grad_norm": 29.142045974731445, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.867859959602356, + "num_tokens": 557033662.0, + "step": 14606 + }, + { + "epoch": 1.8581605393715812, + "ewc_loss": 0.04456738755106926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2283693397184834e-05, + "grad_norm": 29.131391525268555, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.860554575920105, + "num_tokens": 557074869.0, + "step": 14607 + }, + { + "epoch": 1.8582877496501717, + "ewc_loss": 0.04447512701153755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.223756382591091e-05, + "grad_norm": 29.206056594848633, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8574326038360596, + "num_tokens": 557110164.0, + "step": 14608 + }, + { + "epoch": 1.8584149599287623, + "ewc_loss": 0.04451553151011467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225776552222669e-05, + "grad_norm": 29.10655403137207, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8685647249221802, + "num_tokens": 557149309.0, + "step": 14609 + }, + { + "epoch": 1.8585421702073528, + "ewc_loss": 0.04447853937745094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2239270037971437e-05, + "grad_norm": 29.110340118408203, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8638477325439453, + "num_tokens": 557180510.0, + "step": 14610 + }, + { + "epoch": 1.858669380485943, + "ewc_loss": 0.0445379912853241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2268995962804183e-05, + "grad_norm": 29.187040328979492, + "learning_rate": 1e-06, + "loss": 0.4055, + 
"mean_token_accuracy": 0.874251663684845, + "num_tokens": 557218062.0, + "step": 14611 + }, + { + "epoch": 1.8587965907645336, + "ewc_loss": 0.04457340016961098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2286700186668895e-05, + "grad_norm": 29.18158531188965, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8611253499984741, + "num_tokens": 557258963.0, + "step": 14612 + }, + { + "epoch": 1.8589238010431242, + "ewc_loss": 0.044545769691467285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2272884962148964e-05, + "grad_norm": 29.24495506286621, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8687667846679688, + "num_tokens": 557296046.0, + "step": 14613 + }, + { + "epoch": 1.8590510113217147, + "ewc_loss": 0.04450976476073265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225488242402207e-05, + "grad_norm": 29.266263961791992, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8653532862663269, + "num_tokens": 557328395.0, + "step": 14614 + }, + { + "epoch": 1.8591782216003052, + "ewc_loss": 0.04456693306565285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228346602350939e-05, + "grad_norm": 29.22054100036621, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8719010353088379, + "num_tokens": 557363888.0, + "step": 14615 + }, + { + "epoch": 1.8593054318788957, + "ewc_loss": 0.04445749521255493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2228747184271924e-05, + "grad_norm": 29.15365982055664, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8601900339126587, + "num_tokens": 557402944.0, + "step": 14616 + }, + { + "epoch": 1.8594326421574863, + "ewc_loss": 0.04458804056048393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2294019800028764e-05, + "grad_norm": 29.3979549407959, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8739742040634155, + "num_tokens": 557442247.0, + "step": 14617 + }, + { + "epoch": 
1.8595598524360768, + "ewc_loss": 0.044492293149232864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224614581791684e-05, + "grad_norm": 29.2252140045166, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8621896505355835, + "num_tokens": 557483580.0, + "step": 14618 + }, + { + "epoch": 1.8596870627146673, + "ewc_loss": 0.04439762979745865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2198815713636577e-05, + "grad_norm": 29.189252853393555, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8718785643577576, + "num_tokens": 557520134.0, + "step": 14619 + }, + { + "epoch": 1.8598142729932579, + "ewc_loss": 0.04449278861284256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2246395019465126e-05, + "grad_norm": 29.311201095581055, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8633142113685608, + "num_tokens": 557565726.0, + "step": 14620 + }, + { + "epoch": 1.8599414832718484, + "ewc_loss": 0.0444553941488266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2227697627386078e-05, + "grad_norm": 29.161178588867188, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8668107390403748, + "num_tokens": 557602871.0, + "step": 14621 + }, + { + "epoch": 1.860068693550439, + "ewc_loss": 0.04442804306745529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2214020646060817e-05, + "grad_norm": 29.207075119018555, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8589882850646973, + "num_tokens": 557640873.0, + "step": 14622 + }, + { + "epoch": 1.8601959038290294, + "ewc_loss": 0.044417690485715866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220884562120773e-05, + "grad_norm": 29.0992374420166, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8605722784996033, + "num_tokens": 557684233.0, + "step": 14623 + }, + { + "epoch": 1.86032311410762, + "ewc_loss": 0.04447920620441437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2239602913032286e-05, + "grad_norm": 29.19375228881836, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8735103607177734, + "num_tokens": 557716895.0, + "step": 14624 + }, + { + "epoch": 1.8604503243862105, + "ewc_loss": 0.04448283091187477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2241414626478218e-05, + "grad_norm": 29.148038864135742, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8637458682060242, + "num_tokens": 557758197.0, + "step": 14625 + }, + { + "epoch": 1.860577534664801, + "ewc_loss": 0.044464610517024994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.223230512754526e-05, + "grad_norm": 29.135866165161133, + "learning_rate": 1e-06, + "loss": 0.5287, + "mean_token_accuracy": 0.8419407606124878, + "num_tokens": 557794196.0, + "step": 14626 + }, + { + "epoch": 1.8607047449433916, + "ewc_loss": 0.04454301297664642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2271506168181077e-05, + "grad_norm": 29.209598541259766, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8521040678024292, + "num_tokens": 557833856.0, + "step": 14627 + }, + { + "epoch": 1.860831955221982, + "ewc_loss": 0.044445909559726715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2222955522011034e-05, + "grad_norm": 29.17144203186035, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8563327789306641, + "num_tokens": 557866304.0, + "step": 14628 + }, + { + "epoch": 1.8609591655005726, + "ewc_loss": 0.044527336955070496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2263668142841198e-05, + "grad_norm": 29.16221046447754, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8808462023735046, + "num_tokens": 557907065.0, + "step": 14629 + }, + { + "epoch": 1.861086375779163, + "ewc_loss": 0.04443540796637535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2217704099602997e-05, + "grad_norm": 29.17448616027832, + "learning_rate": 1e-06, + "loss": 0.4374, + 
"mean_token_accuracy": 0.8643792867660522, + "num_tokens": 557947326.0, + "step": 14630 + }, + { + "epoch": 1.8612135860577534, + "ewc_loss": 0.04454197734594345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2270987756201066e-05, + "grad_norm": 29.313980102539062, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8662852048873901, + "num_tokens": 557990059.0, + "step": 14631 + }, + { + "epoch": 1.861340796336344, + "ewc_loss": 0.04444093629717827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2220468963496387e-05, + "grad_norm": 29.18415069580078, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8695336580276489, + "num_tokens": 558028406.0, + "step": 14632 + }, + { + "epoch": 1.8614680066149345, + "ewc_loss": 0.04441966116428375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2209829694475047e-05, + "grad_norm": 29.199962615966797, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8640822768211365, + "num_tokens": 558070239.0, + "step": 14633 + }, + { + "epoch": 1.861595216893525, + "ewc_loss": 0.04448618367314339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2243091734708287e-05, + "grad_norm": 29.17076873779297, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8711730241775513, + "num_tokens": 558107685.0, + "step": 14634 + }, + { + "epoch": 1.8617224271721156, + "ewc_loss": 0.04451770335435867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2258851458900608e-05, + "grad_norm": 29.156723022460938, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8616204857826233, + "num_tokens": 558143185.0, + "step": 14635 + }, + { + "epoch": 1.8618496374507059, + "ewc_loss": 0.044469211250543594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2234606149140745e-05, + "grad_norm": 29.17495346069336, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8769834041595459, + "num_tokens": 558176207.0, + "step": 14636 + }, + { + "epoch": 
1.8619768477292964, + "ewc_loss": 0.04449930414557457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2249652829486877e-05, + "grad_norm": 29.18495750427246, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8837844133377075, + "num_tokens": 558212215.0, + "step": 14637 + }, + { + "epoch": 1.862104058007887, + "ewc_loss": 0.044546082615852356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.227304139523767e-05, + "grad_norm": 29.156641006469727, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8773089647293091, + "num_tokens": 558248471.0, + "step": 14638 + }, + { + "epoch": 1.8622312682864774, + "ewc_loss": 0.044494785368442535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224739182565827e-05, + "grad_norm": 29.35107421875, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8547626733779907, + "num_tokens": 558285211.0, + "step": 14639 + }, + { + "epoch": 1.862358478565068, + "ewc_loss": 0.0444084070622921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.220420356024988e-05, + "grad_norm": 29.120344161987305, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8611273765563965, + "num_tokens": 558314434.0, + "step": 14640 + }, + { + "epoch": 1.8624856888436585, + "ewc_loss": 0.04442504793405533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.22125236177817e-05, + "grad_norm": 29.21144676208496, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8756934404373169, + "num_tokens": 558353710.0, + "step": 14641 + }, + { + "epoch": 1.862612899122249, + "ewc_loss": 0.04451681300997734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225840580649674e-05, + "grad_norm": 29.11649513244629, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8548362255096436, + "num_tokens": 558390476.0, + "step": 14642 + }, + { + "epoch": 1.8627401094008396, + "ewc_loss": 0.04448320344090462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2241601982386783e-05, + "grad_norm": 29.163360595703125, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8765519857406616, + "num_tokens": 558424917.0, + "step": 14643 + }, + { + "epoch": 1.86286731967943, + "ewc_loss": 0.04450515657663345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225257776444778e-05, + "grad_norm": 29.138927459716797, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8606692552566528, + "num_tokens": 558459831.0, + "step": 14644 + }, + { + "epoch": 1.8629945299580206, + "ewc_loss": 0.04445739462971687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.222869807155803e-05, + "grad_norm": 29.095232009887695, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8826148509979248, + "num_tokens": 558498600.0, + "step": 14645 + }, + { + "epoch": 1.8631217402366111, + "ewc_loss": 0.04448436200618744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224218042101711e-05, + "grad_norm": 29.116243362426758, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.857791006565094, + "num_tokens": 558537465.0, + "step": 14646 + }, + { + "epoch": 1.8632489505152017, + "ewc_loss": 0.04458757862448692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2293788788374513e-05, + "grad_norm": 29.189485549926758, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8874670267105103, + "num_tokens": 558575357.0, + "step": 14647 + }, + { + "epoch": 1.8633761607937922, + "ewc_loss": 0.04457348957657814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2286745661403984e-05, + "grad_norm": 29.117826461791992, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8643813133239746, + "num_tokens": 558619422.0, + "step": 14648 + }, + { + "epoch": 1.8635033710723827, + "ewc_loss": 0.04458050802350044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2290254491963424e-05, + "grad_norm": 29.189083099365234, + "learning_rate": 1e-06, + "loss": 0.4609, + 
"mean_token_accuracy": 0.8582973480224609, + "num_tokens": 558657677.0, + "step": 14649 + }, + { + "epoch": 1.8636305813509733, + "ewc_loss": 0.044668592512607574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2334295863402076e-05, + "grad_norm": 29.181468963623047, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8759464025497437, + "num_tokens": 558692912.0, + "step": 14650 + }, + { + "epoch": 1.8637577916295638, + "ewc_loss": 0.04454343393445015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2271717170951888e-05, + "grad_norm": 29.15723991394043, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8797469139099121, + "num_tokens": 558733470.0, + "step": 14651 + }, + { + "epoch": 1.8638850019081543, + "ewc_loss": 0.04459922015666962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.229960955446586e-05, + "grad_norm": 29.193727493286133, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8634178638458252, + "num_tokens": 558771899.0, + "step": 14652 + }, + { + "epoch": 1.8640122121867448, + "ewc_loss": 0.04453996941447258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2269985493039712e-05, + "grad_norm": 29.080167770385742, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8736097812652588, + "num_tokens": 558813002.0, + "step": 14653 + }, + { + "epoch": 1.8641394224653351, + "ewc_loss": 0.04463418200612068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231709186162334e-05, + "grad_norm": 29.17644500732422, + "learning_rate": 1e-06, + "loss": 0.4768, + "mean_token_accuracy": 0.8525997400283813, + "num_tokens": 558850827.0, + "step": 14654 + }, + { + "epoch": 1.8642666327439257, + "ewc_loss": 0.044627439230680466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2313719455269165e-05, + "grad_norm": 29.25438690185547, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8651046752929688, + "num_tokens": 558889003.0, + "step": 14655 + }, + { + "epoch": 
1.8643938430225162, + "ewc_loss": 0.04455265775322914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2276328309089877e-05, + "grad_norm": 29.039560317993164, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8736355304718018, + "num_tokens": 558925849.0, + "step": 14656 + }, + { + "epoch": 1.8645210533011067, + "ewc_loss": 0.04453081265091896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2265407096710987e-05, + "grad_norm": 29.246349334716797, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8667153120040894, + "num_tokens": 558967471.0, + "step": 14657 + }, + { + "epoch": 1.8646482635796973, + "ewc_loss": 0.044634006917476654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2317002731142566e-05, + "grad_norm": 29.142202377319336, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8715388178825378, + "num_tokens": 559001938.0, + "step": 14658 + }, + { + "epoch": 1.8647754738582878, + "ewc_loss": 0.044522397220134735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2261197955231182e-05, + "grad_norm": 29.18474006652832, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8733155131340027, + "num_tokens": 559040515.0, + "step": 14659 + }, + { + "epoch": 1.864902684136878, + "ewc_loss": 0.0445878729224205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.22939361265162e-05, + "grad_norm": 29.112308502197266, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8797882795333862, + "num_tokens": 559077702.0, + "step": 14660 + }, + { + "epoch": 1.8650298944154686, + "ewc_loss": 0.04453178122639656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.226589094789233e-05, + "grad_norm": 29.160680770874023, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8610185384750366, + "num_tokens": 559114741.0, + "step": 14661 + }, + { + "epoch": 1.8651571046940592, + "ewc_loss": 0.0446380190551281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2319009076454677e-05, + "grad_norm": 29.2337703704834, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.846773624420166, + "num_tokens": 559155844.0, + "step": 14662 + }, + { + "epoch": 1.8652843149726497, + "ewc_loss": 0.04456047713756561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.22802391363075e-05, + "grad_norm": 29.135337829589844, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8703381419181824, + "num_tokens": 559194964.0, + "step": 14663 + }, + { + "epoch": 1.8654115252512402, + "ewc_loss": 0.04456150904297829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2280753910308704e-05, + "grad_norm": 29.168697357177734, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8742823600769043, + "num_tokens": 559230083.0, + "step": 14664 + }, + { + "epoch": 1.8655387355298307, + "ewc_loss": 0.04456859081983566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2284295482677408e-05, + "grad_norm": 29.056671142578125, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8784961104393005, + "num_tokens": 559268397.0, + "step": 14665 + }, + { + "epoch": 1.8656659458084213, + "ewc_loss": 0.04450738802552223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2253694623941556e-05, + "grad_norm": 29.20150375366211, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8590577244758606, + "num_tokens": 559302024.0, + "step": 14666 + }, + { + "epoch": 1.8657931560870118, + "ewc_loss": 0.04456523805856705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2282618374447338e-05, + "grad_norm": 29.010408401489258, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8657916784286499, + "num_tokens": 559343192.0, + "step": 14667 + }, + { + "epoch": 1.8659203663656023, + "ewc_loss": 0.04458403214812279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2292015273706056e-05, + "grad_norm": 29.261381149291992, + "learning_rate": 1e-06, + "loss": 0.3603, + 
"mean_token_accuracy": 0.890426516532898, + "num_tokens": 559380616.0, + "step": 14668 + }, + { + "epoch": 1.8660475766441929, + "ewc_loss": 0.044617947190999985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2308973711915314e-05, + "grad_norm": 29.15407371520996, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8779696226119995, + "num_tokens": 559413621.0, + "step": 14669 + }, + { + "epoch": 1.8661747869227834, + "ewc_loss": 0.04454868659377098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2274343791650608e-05, + "grad_norm": 29.22882652282715, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8771947026252747, + "num_tokens": 559447274.0, + "step": 14670 + }, + { + "epoch": 1.866301997201374, + "ewc_loss": 0.044544775038957596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2272388378041796e-05, + "grad_norm": 29.1853084564209, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8772749900817871, + "num_tokens": 559484306.0, + "step": 14671 + }, + { + "epoch": 1.8664292074799644, + "ewc_loss": 0.044486064463853836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224303170805797e-05, + "grad_norm": 29.155223846435547, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8745852708816528, + "num_tokens": 559527126.0, + "step": 14672 + }, + { + "epoch": 1.866556417758555, + "ewc_loss": 0.04454579949378967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2272899514064193e-05, + "grad_norm": 29.20067596435547, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8733206987380981, + "num_tokens": 559568342.0, + "step": 14673 + }, + { + "epoch": 1.8666836280371455, + "ewc_loss": 0.04453747346997261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2268735847319476e-05, + "grad_norm": 29.077245712280273, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8662437200546265, + "num_tokens": 559609597.0, + "step": 14674 + }, + { + "epoch": 
1.866810838315736, + "ewc_loss": 0.044630445539951324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2315221940516494e-05, + "grad_norm": 29.265548706054688, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8666045665740967, + "num_tokens": 559646392.0, + "step": 14675 + }, + { + "epoch": 1.8669380485943265, + "ewc_loss": 0.04456678405404091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2283391444943845e-05, + "grad_norm": 29.144140243530273, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8708499073982239, + "num_tokens": 559684836.0, + "step": 14676 + }, + { + "epoch": 1.867065258872917, + "ewc_loss": 0.0445389598608017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2269479813985527e-05, + "grad_norm": 29.167972564697266, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.872241199016571, + "num_tokens": 559727793.0, + "step": 14677 + }, + { + "epoch": 1.8671924691515076, + "ewc_loss": 0.044611673802137375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23058377741836e-05, + "grad_norm": 29.20564079284668, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8572916984558105, + "num_tokens": 559768154.0, + "step": 14678 + }, + { + "epoch": 1.867319679430098, + "ewc_loss": 0.04446856677532196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2234284188016318e-05, + "grad_norm": 29.179271697998047, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.867753267288208, + "num_tokens": 559798732.0, + "step": 14679 + }, + { + "epoch": 1.8674468897086884, + "ewc_loss": 0.044560011476278305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2280006305663846e-05, + "grad_norm": 29.18536949157715, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8675253391265869, + "num_tokens": 559842796.0, + "step": 14680 + }, + { + "epoch": 1.867574099987279, + "ewc_loss": 0.04451669007539749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2258345779846422e-05, + "grad_norm": 29.191492080688477, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8656402826309204, + "num_tokens": 559884198.0, + "step": 14681 + }, + { + "epoch": 1.8677013102658695, + "ewc_loss": 0.04448206350207329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2241032638703473e-05, + "grad_norm": 29.073030471801758, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8542830944061279, + "num_tokens": 559916749.0, + "step": 14682 + }, + { + "epoch": 1.86782852054446, + "ewc_loss": 0.04448949918150902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2244748834054917e-05, + "grad_norm": 29.198545455932617, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8642767667770386, + "num_tokens": 559958430.0, + "step": 14683 + }, + { + "epoch": 1.8679557308230506, + "ewc_loss": 0.044544320553541183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2272161004366353e-05, + "grad_norm": 29.158693313598633, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8851451277732849, + "num_tokens": 559995720.0, + "step": 14684 + }, + { + "epoch": 1.8680829411016409, + "ewc_loss": 0.04450339451432228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2251697373576462e-05, + "grad_norm": 29.147052764892578, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8724451065063477, + "num_tokens": 560033304.0, + "step": 14685 + }, + { + "epoch": 1.8682101513802314, + "ewc_loss": 0.04448669031262398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.224334457423538e-05, + "grad_norm": 29.118764877319336, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8673705458641052, + "num_tokens": 560069879.0, + "step": 14686 + }, + { + "epoch": 1.868337361658822, + "ewc_loss": 0.04452468827366829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2262343918555416e-05, + "grad_norm": 29.125253677368164, + "learning_rate": 1e-06, + "loss": 0.442, + 
"mean_token_accuracy": 0.8669450283050537, + "num_tokens": 560106853.0, + "step": 14687 + }, + { + "epoch": 1.8684645719374124, + "ewc_loss": 0.04453534260392189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2267671738518402e-05, + "grad_norm": 29.100723266601562, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8704208135604858, + "num_tokens": 560144225.0, + "step": 14688 + }, + { + "epoch": 1.868591782216003, + "ewc_loss": 0.044567886739969254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228394259873312e-05, + "grad_norm": 29.19681739807129, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8622210025787354, + "num_tokens": 560186520.0, + "step": 14689 + }, + { + "epoch": 1.8687189924945935, + "ewc_loss": 0.044588688760995865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2294343580142595e-05, + "grad_norm": 29.007549285888672, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8713377714157104, + "num_tokens": 560225458.0, + "step": 14690 + }, + { + "epoch": 1.868846202773184, + "ewc_loss": 0.04456246644258499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228123412351124e-05, + "grad_norm": 29.17251205444336, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.873284101486206, + "num_tokens": 560263401.0, + "step": 14691 + }, + { + "epoch": 1.8689734130517746, + "ewc_loss": 0.044778455048799515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2389227524399757e-05, + "grad_norm": 29.35455322265625, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8645920753479004, + "num_tokens": 560305026.0, + "step": 14692 + }, + { + "epoch": 1.869100623330365, + "ewc_loss": 0.04460873082280159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2304366211756133e-05, + "grad_norm": 29.180927276611328, + "learning_rate": 1e-06, + "loss": 0.5292, + "mean_token_accuracy": 0.836218535900116, + "num_tokens": 560354823.0, + "step": 14693 + }, + { + "epoch": 
1.8692278336089556, + "ewc_loss": 0.04451823607087135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225911885034293e-05, + "grad_norm": 29.1685791015625, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8832959532737732, + "num_tokens": 560391888.0, + "step": 14694 + }, + { + "epoch": 1.8693550438875461, + "ewc_loss": 0.0445459820330143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.227299046353437e-05, + "grad_norm": 29.172672271728516, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8715654611587524, + "num_tokens": 560425350.0, + "step": 14695 + }, + { + "epoch": 1.8694822541661367, + "ewc_loss": 0.044695086777210236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23475435632281e-05, + "grad_norm": 29.374832153320312, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8787209987640381, + "num_tokens": 560458521.0, + "step": 14696 + }, + { + "epoch": 1.8696094644447272, + "ewc_loss": 0.04457973316311836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228986704722047e-05, + "grad_norm": 29.150909423828125, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8634876012802124, + "num_tokens": 560506378.0, + "step": 14697 + }, + { + "epoch": 1.8697366747233177, + "ewc_loss": 0.04450543597340584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225271782663185e-05, + "grad_norm": 29.155059814453125, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8901114463806152, + "num_tokens": 560544055.0, + "step": 14698 + }, + { + "epoch": 1.8698638850019083, + "ewc_loss": 0.04452546313405037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.226273136329837e-05, + "grad_norm": 29.1654109954834, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8781160116195679, + "num_tokens": 560577240.0, + "step": 14699 + }, + { + "epoch": 1.8699910952804988, + "ewc_loss": 0.04456626996397972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2283134967437945e-05, + "grad_norm": 29.18311882019043, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8631202578544617, + "num_tokens": 560615662.0, + "step": 14700 + }, + { + "epoch": 1.8701183055590893, + "ewc_loss": 0.04453999549150467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2269998225965537e-05, + "grad_norm": 29.143760681152344, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8631314039230347, + "num_tokens": 560658386.0, + "step": 14701 + }, + { + "epoch": 1.8702455158376798, + "ewc_loss": 0.044539034366607666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2269518012763e-05, + "grad_norm": 29.133729934692383, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8671025037765503, + "num_tokens": 560696366.0, + "step": 14702 + }, + { + "epoch": 1.8703727261162701, + "ewc_loss": 0.04457835480570793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2289177650236525e-05, + "grad_norm": 29.27666473388672, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8575937747955322, + "num_tokens": 560736419.0, + "step": 14703 + }, + { + "epoch": 1.8704999363948607, + "ewc_loss": 0.044625040143728256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231252074125223e-05, + "grad_norm": 29.17759132385254, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8674242496490479, + "num_tokens": 560777839.0, + "step": 14704 + }, + { + "epoch": 1.8706271466734512, + "ewc_loss": 0.04456668347120285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.228334233222995e-05, + "grad_norm": 29.08433723449707, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8787211179733276, + "num_tokens": 560816167.0, + "step": 14705 + }, + { + "epoch": 1.8707543569520417, + "ewc_loss": 0.04461032152175903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2305161110125482e-05, + "grad_norm": 29.23790168762207, + "learning_rate": 1e-06, + "loss": 0.4711, + 
"mean_token_accuracy": 0.8541504740715027, + "num_tokens": 560854524.0, + "step": 14706 + }, + { + "epoch": 1.8708815672306323, + "ewc_loss": 0.04458793252706528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.229396704933606e-05, + "grad_norm": 28.990936279296875, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8654061555862427, + "num_tokens": 560894393.0, + "step": 14707 + }, + { + "epoch": 1.8710087775092228, + "ewc_loss": 0.04461049661040306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2305248421616852e-05, + "grad_norm": 29.185075759887695, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8786563873291016, + "num_tokens": 560938640.0, + "step": 14708 + }, + { + "epoch": 1.871135987787813, + "ewc_loss": 0.04462450370192528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2312251530820504e-05, + "grad_norm": 28.996177673339844, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8669897317886353, + "num_tokens": 560976805.0, + "step": 14709 + }, + { + "epoch": 1.8712631980664036, + "ewc_loss": 0.0446321927011013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23160968744196e-05, + "grad_norm": 29.25528335571289, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8649888634681702, + "num_tokens": 561018940.0, + "step": 14710 + }, + { + "epoch": 1.8713904083449941, + "ewc_loss": 0.04469689726829529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2348449419951066e-05, + "grad_norm": 29.03759002685547, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8727754354476929, + "num_tokens": 561065228.0, + "step": 14711 + }, + { + "epoch": 1.8715176186235847, + "ewc_loss": 0.044539257884025574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2269628971116617e-05, + "grad_norm": 29.304351806640625, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8594576120376587, + "num_tokens": 561107540.0, + "step": 14712 + }, + { + "epoch": 
1.8716448289021752, + "ewc_loss": 0.044667430222034454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2333715605782345e-05, + "grad_norm": 29.16028594970703, + "learning_rate": 1e-06, + "loss": 0.5206, + "mean_token_accuracy": 0.848331093788147, + "num_tokens": 561144774.0, + "step": 14713 + }, + { + "epoch": 1.8717720391807657, + "ewc_loss": 0.04463210329413414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231605139968451e-05, + "grad_norm": 29.38011932373047, + "learning_rate": 1e-06, + "loss": 0.4914, + "mean_token_accuracy": 0.8493858575820923, + "num_tokens": 561186774.0, + "step": 14714 + }, + { + "epoch": 1.8718992494593563, + "ewc_loss": 0.044666703790426254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2333351807901636e-05, + "grad_norm": 29.270706176757812, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8811730742454529, + "num_tokens": 561223997.0, + "step": 14715 + }, + { + "epoch": 1.8720264597379468, + "ewc_loss": 0.0445328950881958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2266447558649816e-05, + "grad_norm": 29.17957878112793, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8763282299041748, + "num_tokens": 561261583.0, + "step": 14716 + }, + { + "epoch": 1.8721536700165373, + "ewc_loss": 0.04454772174358368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2273859940469265e-05, + "grad_norm": 29.173070907592773, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8732237815856934, + "num_tokens": 561302681.0, + "step": 14717 + }, + { + "epoch": 1.8722808802951278, + "ewc_loss": 0.04465111717581749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2325559257296845e-05, + "grad_norm": 29.2333984375, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8669792413711548, + "num_tokens": 561338402.0, + "step": 14718 + }, + { + "epoch": 1.8724080905737184, + "ewc_loss": 0.044552937150001526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.227646837127395e-05, + "grad_norm": 29.192447662353516, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8722146153450012, + "num_tokens": 561372892.0, + "step": 14719 + }, + { + "epoch": 1.872535300852309, + "ewc_loss": 0.044601231813430786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.230061545560602e-05, + "grad_norm": 29.137348175048828, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8657995462417603, + "num_tokens": 561409474.0, + "step": 14720 + }, + { + "epoch": 1.8726625111308994, + "ewc_loss": 0.04453328624367714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.22666440095054e-05, + "grad_norm": 29.244062423706055, + "learning_rate": 1e-06, + "loss": 0.5075, + "mean_token_accuracy": 0.8420041799545288, + "num_tokens": 561450152.0, + "step": 14721 + }, + { + "epoch": 1.87278972140949, + "ewc_loss": 0.04459630697965622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2298154362943023e-05, + "grad_norm": 29.144746780395508, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8618509769439697, + "num_tokens": 561489448.0, + "step": 14722 + }, + { + "epoch": 1.8729169316880805, + "ewc_loss": 0.04455822706222534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2279113181866705e-05, + "grad_norm": 29.136842727661133, + "learning_rate": 1e-06, + "loss": 0.5048, + "mean_token_accuracy": 0.8448154330253601, + "num_tokens": 561529684.0, + "step": 14723 + }, + { + "epoch": 1.873044141966671, + "ewc_loss": 0.04465314373373985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232657243439462e-05, + "grad_norm": 29.191606521606445, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8689191937446594, + "num_tokens": 561564259.0, + "step": 14724 + }, + { + "epoch": 1.8731713522452615, + "ewc_loss": 0.04459521919488907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2297610485111363e-05, + "grad_norm": 29.114212036132812, + "learning_rate": 1e-06, + "loss": 0.4037, + 
"mean_token_accuracy": 0.8759565949440002, + "num_tokens": 561609650.0, + "step": 14725 + }, + { + "epoch": 1.873298562523852, + "ewc_loss": 0.0446152426302433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.230762220278848e-05, + "grad_norm": 29.23223304748535, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8745844960212708, + "num_tokens": 561645520.0, + "step": 14726 + }, + { + "epoch": 1.8734257728024426, + "ewc_loss": 0.044714491814374924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2357246052706614e-05, + "grad_norm": 29.22454261779785, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8738764524459839, + "num_tokens": 561682771.0, + "step": 14727 + }, + { + "epoch": 1.873552983081033, + "ewc_loss": 0.0445714108645916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2285705199465156e-05, + "grad_norm": 29.184505462646484, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8577768802642822, + "num_tokens": 561721317.0, + "step": 14728 + }, + { + "epoch": 1.8736801933596234, + "ewc_loss": 0.044645756483078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232287806691602e-05, + "grad_norm": 29.16132354736328, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8677684664726257, + "num_tokens": 561763178.0, + "step": 14729 + }, + { + "epoch": 1.873807403638214, + "ewc_loss": 0.044640012085437775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232000588264782e-05, + "grad_norm": 29.210994720458984, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8691409826278687, + "num_tokens": 561802292.0, + "step": 14730 + }, + { + "epoch": 1.8739346139168045, + "ewc_loss": 0.044642914086580276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232145743619185e-05, + "grad_norm": 29.1955509185791, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8765181303024292, + "num_tokens": 561838610.0, + "step": 14731 + }, + { + "epoch": 
1.874061824195395, + "ewc_loss": 0.044685736298561096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2342868760460988e-05, + "grad_norm": 29.185741424560547, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8722985982894897, + "num_tokens": 561881992.0, + "step": 14732 + }, + { + "epoch": 1.8741890344739855, + "ewc_loss": 0.0446738600730896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233692976005841e-05, + "grad_norm": 29.21534538269043, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8579877614974976, + "num_tokens": 561917661.0, + "step": 14733 + }, + { + "epoch": 1.8743162447525759, + "ewc_loss": 0.044608548283576965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2304273443296552e-05, + "grad_norm": 29.243215560913086, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8693701028823853, + "num_tokens": 561958218.0, + "step": 14734 + }, + { + "epoch": 1.8744434550311664, + "ewc_loss": 0.0446263886988163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231319376733154e-05, + "grad_norm": 29.168344497680664, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8801972270011902, + "num_tokens": 561993882.0, + "step": 14735 + }, + { + "epoch": 1.874570665309757, + "ewc_loss": 0.0445900559425354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2295027520158328e-05, + "grad_norm": 29.289777755737305, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8698021769523621, + "num_tokens": 562035443.0, + "step": 14736 + }, + { + "epoch": 1.8746978755883474, + "ewc_loss": 0.044595252722501755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2297626856015995e-05, + "grad_norm": 29.1643009185791, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8670331239700317, + "num_tokens": 562072806.0, + "step": 14737 + }, + { + "epoch": 1.874825085866938, + "ewc_loss": 0.04457797855138779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2288988475338556e-05, + "grad_norm": 29.15721893310547, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8520498275756836, + "num_tokens": 562113652.0, + "step": 14738 + }, + { + "epoch": 1.8749522961455285, + "ewc_loss": 0.04465125501155853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232562837889418e-05, + "grad_norm": 29.152854919433594, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.860293984413147, + "num_tokens": 562150600.0, + "step": 14739 + }, + { + "epoch": 1.875079506424119, + "ewc_loss": 0.0445755198597908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2287760657491162e-05, + "grad_norm": 29.293127059936523, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8702365756034851, + "num_tokens": 562187730.0, + "step": 14740 + }, + { + "epoch": 1.8752067167027096, + "ewc_loss": 0.044637661427259445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231883081549313e-05, + "grad_norm": 29.24656867980957, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8686831593513489, + "num_tokens": 562228229.0, + "step": 14741 + }, + { + "epoch": 1.8753339269813, + "ewc_loss": 0.044558264315128326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.227913137176074e-05, + "grad_norm": 29.160694122314453, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8738865852355957, + "num_tokens": 562262293.0, + "step": 14742 + }, + { + "epoch": 1.8754611372598906, + "ewc_loss": 0.04459257051348686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2296284441836178e-05, + "grad_norm": 29.23295021057129, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8660730123519897, + "num_tokens": 562303389.0, + "step": 14743 + }, + { + "epoch": 1.8755883475384811, + "ewc_loss": 0.044633571058511734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2316786271403544e-05, + "grad_norm": 29.143543243408203, + "learning_rate": 1e-06, + "loss": 0.4558, + 
"mean_token_accuracy": 0.8618566393852234, + "num_tokens": 562344461.0, + "step": 14744 + }, + { + "epoch": 1.8757155578170717, + "ewc_loss": 0.04465630277991295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2328151317196898e-05, + "grad_norm": 29.257429122924805, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8506234884262085, + "num_tokens": 562387068.0, + "step": 14745 + }, + { + "epoch": 1.8758427680956622, + "ewc_loss": 0.044676121324300766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2338061171467416e-05, + "grad_norm": 29.181482315063477, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8657852411270142, + "num_tokens": 562428225.0, + "step": 14746 + }, + { + "epoch": 1.8759699783742527, + "ewc_loss": 0.044630661606788635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2315331079880707e-05, + "grad_norm": 29.222267150878906, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8679547309875488, + "num_tokens": 562463879.0, + "step": 14747 + }, + { + "epoch": 1.8760971886528433, + "ewc_loss": 0.04470399394631386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.235199644928798e-05, + "grad_norm": 29.222572326660156, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8607437610626221, + "num_tokens": 562502327.0, + "step": 14748 + }, + { + "epoch": 1.8762243989314338, + "ewc_loss": 0.04454725980758667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2273630747804418e-05, + "grad_norm": 29.1426944732666, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8718969821929932, + "num_tokens": 562542162.0, + "step": 14749 + }, + { + "epoch": 1.8763516092100243, + "ewc_loss": 0.04463520273566246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2317601178656332e-05, + "grad_norm": 29.20275115966797, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8628043532371521, + "num_tokens": 562583364.0, + "step": 14750 + }, + { + "epoch": 
1.8764788194886148, + "ewc_loss": 0.04461182653903961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.230591417173855e-05, + "grad_norm": 29.215192794799805, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8701896667480469, + "num_tokens": 562615870.0, + "step": 14751 + }, + { + "epoch": 1.8766060297672051, + "ewc_loss": 0.04470198601484299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2350992367137223e-05, + "grad_norm": 29.18706703186035, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8529165983200073, + "num_tokens": 562655086.0, + "step": 14752 + }, + { + "epoch": 1.8767332400457957, + "ewc_loss": 0.044513992965221405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.225699608970899e-05, + "grad_norm": 29.180328369140625, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8608715534210205, + "num_tokens": 562697532.0, + "step": 14753 + }, + { + "epoch": 1.8768604503243862, + "ewc_loss": 0.04467598348855972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233799204987008e-05, + "grad_norm": 29.252052307128906, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8471697568893433, + "num_tokens": 562731057.0, + "step": 14754 + }, + { + "epoch": 1.8769876606029767, + "ewc_loss": 0.044665880501270294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2332940716296434e-05, + "grad_norm": 29.21046257019043, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8571923971176147, + "num_tokens": 562770402.0, + "step": 14755 + }, + { + "epoch": 1.8771148708815673, + "ewc_loss": 0.0445915088057518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2295755115919746e-05, + "grad_norm": 29.024852752685547, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8641019463539124, + "num_tokens": 562806191.0, + "step": 14756 + }, + { + "epoch": 1.8772420811601578, + "ewc_loss": 0.044646479189395905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2323240045807324e-05, + "grad_norm": 29.289201736450195, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8657846450805664, + "num_tokens": 562846864.0, + "step": 14757 + }, + { + "epoch": 1.877369291438748, + "ewc_loss": 0.04466242343187332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2331210857373662e-05, + "grad_norm": 29.20656967163086, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8618890047073364, + "num_tokens": 562882887.0, + "step": 14758 + }, + { + "epoch": 1.8774965017173386, + "ewc_loss": 0.0445750392973423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2287520550889894e-05, + "grad_norm": 29.186832427978516, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8670812249183655, + "num_tokens": 562919096.0, + "step": 14759 + }, + { + "epoch": 1.8776237119959291, + "ewc_loss": 0.04469704255461693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2348522179527208e-05, + "grad_norm": 29.22072982788086, + "learning_rate": 1e-06, + "loss": 0.5171, + "mean_token_accuracy": 0.8403297662734985, + "num_tokens": 562959336.0, + "step": 14760 + }, + { + "epoch": 1.8777509222745197, + "ewc_loss": 0.0446646548807621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233232771686744e-05, + "grad_norm": 29.298810958862305, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8673795461654663, + "num_tokens": 562997883.0, + "step": 14761 + }, + { + "epoch": 1.8778781325531102, + "ewc_loss": 0.04466107487678528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233053783129435e-05, + "grad_norm": 29.198312759399414, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.876461386680603, + "num_tokens": 563036763.0, + "step": 14762 + }, + { + "epoch": 1.8780053428317007, + "ewc_loss": 0.04464856907725334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232428414572496e-05, + "grad_norm": 29.283029556274414, + "learning_rate": 1e-06, + "loss": 0.3898, + 
"mean_token_accuracy": 0.8835825324058533, + "num_tokens": 563076815.0, + "step": 14763 + }, + { + "epoch": 1.8781325531102913, + "ewc_loss": 0.0446506030857563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2325300960801542e-05, + "grad_norm": 29.22281837463379, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8775932788848877, + "num_tokens": 563110511.0, + "step": 14764 + }, + { + "epoch": 1.8782597633888818, + "ewc_loss": 0.04463880509138107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2319401978165843e-05, + "grad_norm": 29.30898666381836, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8620777130126953, + "num_tokens": 563140757.0, + "step": 14765 + }, + { + "epoch": 1.8783869736674723, + "ewc_loss": 0.04459881782531738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.229940946563147e-05, + "grad_norm": 29.172893524169922, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8673595786094666, + "num_tokens": 563179961.0, + "step": 14766 + }, + { + "epoch": 1.8785141839460628, + "ewc_loss": 0.044582780450582504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2291389541351236e-05, + "grad_norm": 29.175901412963867, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8743388056755066, + "num_tokens": 563221355.0, + "step": 14767 + }, + { + "epoch": 1.8786413942246534, + "ewc_loss": 0.04462306573987007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2311533030006103e-05, + "grad_norm": 29.231027603149414, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8885737657546997, + "num_tokens": 563257427.0, + "step": 14768 + }, + { + "epoch": 1.878768604503244, + "ewc_loss": 0.044638440012931824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2319220079225488e-05, + "grad_norm": 29.167007446289062, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8646904826164246, + "num_tokens": 563294371.0, + "step": 14769 + }, + { + "epoch": 
1.8788958147818344, + "ewc_loss": 0.04456828162074089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2284140868578106e-05, + "grad_norm": 29.27817153930664, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8776302337646484, + "num_tokens": 563338592.0, + "step": 14770 + }, + { + "epoch": 1.879023025060425, + "ewc_loss": 0.04472535848617554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2362679374055006e-05, + "grad_norm": 29.27438735961914, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8712280988693237, + "num_tokens": 563373853.0, + "step": 14771 + }, + { + "epoch": 1.8791502353390155, + "ewc_loss": 0.04456200823187828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2281004930846393e-05, + "grad_norm": 29.15641212463379, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8666477203369141, + "num_tokens": 563411137.0, + "step": 14772 + }, + { + "epoch": 1.879277445617606, + "ewc_loss": 0.04468894749879837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2344473109114915e-05, + "grad_norm": 29.27042007446289, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8721334338188171, + "num_tokens": 563450294.0, + "step": 14773 + }, + { + "epoch": 1.8794046558961965, + "ewc_loss": 0.04462413489818573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2312067812890746e-05, + "grad_norm": 29.14678192138672, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8700937628746033, + "num_tokens": 563496858.0, + "step": 14774 + }, + { + "epoch": 1.879531866174787, + "ewc_loss": 0.04466315731406212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233157829323318e-05, + "grad_norm": 29.360523223876953, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8625519275665283, + "num_tokens": 563539566.0, + "step": 14775 + }, + { + "epoch": 1.8796590764533776, + "ewc_loss": 0.04461231455206871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2306157916318625e-05, + "grad_norm": 29.260019302368164, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8748246431350708, + "num_tokens": 563574363.0, + "step": 14776 + }, + { + "epoch": 1.879786286731968, + "ewc_loss": 0.04458038881421089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2290194465313107e-05, + "grad_norm": 29.11958885192871, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8687862157821655, + "num_tokens": 563605540.0, + "step": 14777 + }, + { + "epoch": 1.8799134970105584, + "ewc_loss": 0.04462171345949173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2310856365947984e-05, + "grad_norm": 29.290119171142578, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8541456460952759, + "num_tokens": 563646640.0, + "step": 14778 + }, + { + "epoch": 1.880040707289149, + "ewc_loss": 0.04462907090783119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2314536181511357e-05, + "grad_norm": 29.129066467285156, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8779303431510925, + "num_tokens": 563683576.0, + "step": 14779 + }, + { + "epoch": 1.8801679175677395, + "ewc_loss": 0.04462328553199768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2311642169370316e-05, + "grad_norm": 29.221710205078125, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8596609234809875, + "num_tokens": 563724088.0, + "step": 14780 + }, + { + "epoch": 1.88029512784633, + "ewc_loss": 0.044600293040275574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2300146156339906e-05, + "grad_norm": 29.15540885925293, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8661234378814697, + "num_tokens": 563760897.0, + "step": 14781 + }, + { + "epoch": 1.8804223381249205, + "ewc_loss": 0.0446091964840889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2304599042399786e-05, + "grad_norm": 29.191450119018555, + "learning_rate": 1e-06, + "loss": 0.409, + 
"mean_token_accuracy": 0.873053789138794, + "num_tokens": 563800158.0, + "step": 14782 + }, + { + "epoch": 1.8805495484035109, + "ewc_loss": 0.044653791934251785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232689621450845e-05, + "grad_norm": 29.12181854248047, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8723621368408203, + "num_tokens": 563833288.0, + "step": 14783 + }, + { + "epoch": 1.8806767586821014, + "ewc_loss": 0.04462822899222374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2314114175969735e-05, + "grad_norm": 29.204347610473633, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8794803619384766, + "num_tokens": 563874507.0, + "step": 14784 + }, + { + "epoch": 1.880803968960692, + "ewc_loss": 0.04472183436155319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236091677332297e-05, + "grad_norm": 29.18756866455078, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8680732846260071, + "num_tokens": 563912452.0, + "step": 14785 + }, + { + "epoch": 1.8809311792392824, + "ewc_loss": 0.04464954510331154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2324773453874514e-05, + "grad_norm": 29.260046005249023, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.8510254621505737, + "num_tokens": 563950565.0, + "step": 14786 + }, + { + "epoch": 1.881058389517873, + "ewc_loss": 0.04464433714747429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2322168661048636e-05, + "grad_norm": 29.11474609375, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8585577607154846, + "num_tokens": 563984924.0, + "step": 14787 + }, + { + "epoch": 1.8811855997964635, + "ewc_loss": 0.04463648423552513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231824146292638e-05, + "grad_norm": 29.317197799682617, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8750604391098022, + "num_tokens": 564022887.0, + "step": 14788 + }, + { + "epoch": 
1.881312810075054, + "ewc_loss": 0.04468972235918045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.234486055385787e-05, + "grad_norm": 29.11225700378418, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.866760790348053, + "num_tokens": 564064829.0, + "step": 14789 + }, + { + "epoch": 1.8814400203536445, + "ewc_loss": 0.04459397494792938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2296986571745947e-05, + "grad_norm": 29.216251373291016, + "learning_rate": 1e-06, + "loss": 0.5047, + "mean_token_accuracy": 0.8464847803115845, + "num_tokens": 564100858.0, + "step": 14790 + }, + { + "epoch": 1.881567230632235, + "ewc_loss": 0.044671330600976944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2335665562422946e-05, + "grad_norm": 29.13475227355957, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.861035943031311, + "num_tokens": 564141477.0, + "step": 14791 + }, + { + "epoch": 1.8816944409108256, + "ewc_loss": 0.04463298246264458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231649159512017e-05, + "grad_norm": 29.200220108032227, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.861129641532898, + "num_tokens": 564179271.0, + "step": 14792 + }, + { + "epoch": 1.8818216511894161, + "ewc_loss": 0.04467261582612991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2336307665682398e-05, + "grad_norm": 29.141277313232422, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8739632964134216, + "num_tokens": 564224336.0, + "step": 14793 + }, + { + "epoch": 1.8819488614680067, + "ewc_loss": 0.04468074068427086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2340369469020516e-05, + "grad_norm": 29.357955932617188, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8615072965621948, + "num_tokens": 564265157.0, + "step": 14794 + }, + { + "epoch": 1.8820760717465972, + "ewc_loss": 0.0447019599378109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.23509796342114e-05, + "grad_norm": 29.163305282592773, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.873360276222229, + "num_tokens": 564295023.0, + "step": 14795 + }, + { + "epoch": 1.8822032820251877, + "ewc_loss": 0.04460042342543602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2300211639958434e-05, + "grad_norm": 29.172969818115234, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8692115545272827, + "num_tokens": 564333700.0, + "step": 14796 + }, + { + "epoch": 1.8823304923037782, + "ewc_loss": 0.04471367970108986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2356840418069623e-05, + "grad_norm": 29.172861099243164, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.8542087078094482, + "num_tokens": 564374342.0, + "step": 14797 + }, + { + "epoch": 1.8824577025823688, + "ewc_loss": 0.0446385033428669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2319251002045348e-05, + "grad_norm": 29.16596031188965, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8696287274360657, + "num_tokens": 564422877.0, + "step": 14798 + }, + { + "epoch": 1.8825849128609593, + "ewc_loss": 0.04468556493520737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.234278326795902e-05, + "grad_norm": 29.176881790161133, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8672784566879272, + "num_tokens": 564464713.0, + "step": 14799 + }, + { + "epoch": 1.8827121231395498, + "ewc_loss": 0.04461608827114105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2308044208330102e-05, + "grad_norm": 29.108680725097656, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8568761944770813, + "num_tokens": 564500667.0, + "step": 14800 + }, + { + "epoch": 1.8828393334181401, + "ewc_loss": 0.04474971443414688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2374857508111745e-05, + "grad_norm": 29.228124618530273, + "learning_rate": 1e-06, + "loss": 0.4013, + 
"mean_token_accuracy": 0.8756141066551208, + "num_tokens": 564541460.0, + "step": 14801 + }, + { + "epoch": 1.8829665436967307, + "ewc_loss": 0.04470386728644371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2351932784658857e-05, + "grad_norm": 29.219799041748047, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.859587550163269, + "num_tokens": 564580983.0, + "step": 14802 + }, + { + "epoch": 1.8830937539753212, + "ewc_loss": 0.044733982533216476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236699037894141e-05, + "grad_norm": 29.327442169189453, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8768276572227478, + "num_tokens": 564615967.0, + "step": 14803 + }, + { + "epoch": 1.8832209642539117, + "ewc_loss": 0.04471129924058914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23556489800103e-05, + "grad_norm": 29.20377540588379, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8587666153907776, + "num_tokens": 564653255.0, + "step": 14804 + }, + { + "epoch": 1.8833481745325023, + "ewc_loss": 0.04464234784245491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2321173673844896e-05, + "grad_norm": 29.18979835510254, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8657670617103577, + "num_tokens": 564688420.0, + "step": 14805 + }, + { + "epoch": 1.8834753848110928, + "ewc_loss": 0.04474112391471863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.237056105514057e-05, + "grad_norm": 29.21048927307129, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8615338206291199, + "num_tokens": 564733123.0, + "step": 14806 + }, + { + "epoch": 1.883602595089683, + "ewc_loss": 0.04469692334532738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.234846215287689e-05, + "grad_norm": 29.283395767211914, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8751530647277832, + "num_tokens": 564767712.0, + "step": 14807 + }, + { + "epoch": 
1.8837298053682736, + "ewc_loss": 0.04464361444115639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232180668215733e-05, + "grad_norm": 29.26496696472168, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8650802373886108, + "num_tokens": 564810663.0, + "step": 14808 + }, + { + "epoch": 1.8838570156468641, + "ewc_loss": 0.04454358294606209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2271791749517433e-05, + "grad_norm": 29.13438606262207, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8663733005523682, + "num_tokens": 564848689.0, + "step": 14809 + }, + { + "epoch": 1.8839842259254547, + "ewc_loss": 0.04471605271100998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2358026399160735e-05, + "grad_norm": 29.24053192138672, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8555775880813599, + "num_tokens": 564887426.0, + "step": 14810 + }, + { + "epoch": 1.8841114362040452, + "ewc_loss": 0.044638942927122116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2319471099763177e-05, + "grad_norm": 29.27629852294922, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8720299005508423, + "num_tokens": 564924852.0, + "step": 14811 + }, + { + "epoch": 1.8842386464826357, + "ewc_loss": 0.044662728905677795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233136365248356e-05, + "grad_norm": 29.18890953063965, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.863651692867279, + "num_tokens": 564964201.0, + "step": 14812 + }, + { + "epoch": 1.8843658567612263, + "ewc_loss": 0.044590096920728683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.229504934803117e-05, + "grad_norm": 29.186357498168945, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8788225650787354, + "num_tokens": 565004939.0, + "step": 14813 + }, + { + "epoch": 1.8844930670398168, + "ewc_loss": 0.04460732266306877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2303660443867557e-05, + "grad_norm": 29.154827117919922, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8719496130943298, + "num_tokens": 565044067.0, + "step": 14814 + }, + { + "epoch": 1.8846202773184073, + "ewc_loss": 0.044633131474256516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2316566173685715e-05, + "grad_norm": 29.245691299438477, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8702371120452881, + "num_tokens": 565082114.0, + "step": 14815 + }, + { + "epoch": 1.8847474875969978, + "ewc_loss": 0.04463325813412666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.231662983831484e-05, + "grad_norm": 29.23687171936035, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8678296208381653, + "num_tokens": 565118350.0, + "step": 14816 + }, + { + "epoch": 1.8848746978755884, + "ewc_loss": 0.04465760290622711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2328800696413964e-05, + "grad_norm": 29.211105346679688, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8608142137527466, + "num_tokens": 565153313.0, + "step": 14817 + }, + { + "epoch": 1.885001908154179, + "ewc_loss": 0.04454841464757919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2274207367445342e-05, + "grad_norm": 29.209922790527344, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8553841710090637, + "num_tokens": 565191332.0, + "step": 14818 + }, + { + "epoch": 1.8851291184327694, + "ewc_loss": 0.044645629823207855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2322814402286895e-05, + "grad_norm": 29.185550689697266, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8641606569290161, + "num_tokens": 565229937.0, + "step": 14819 + }, + { + "epoch": 1.88525632871136, + "ewc_loss": 0.04473578929901123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236789441667497e-05, + "grad_norm": 29.28116798400879, + "learning_rate": 1e-06, + "loss": 0.4427, + 
"mean_token_accuracy": 0.8628742694854736, + "num_tokens": 565266271.0, + "step": 14820 + }, + { + "epoch": 1.8853835389899505, + "ewc_loss": 0.04458049684762955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2290249034995213e-05, + "grad_norm": 29.138946533203125, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8653244376182556, + "num_tokens": 565300593.0, + "step": 14821 + }, + { + "epoch": 1.885510749268541, + "ewc_loss": 0.04466713219881058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2333566448651254e-05, + "grad_norm": 29.26885414123535, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8666208982467651, + "num_tokens": 565341801.0, + "step": 14822 + }, + { + "epoch": 1.8856379595471315, + "ewc_loss": 0.04472781717777252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2363909010891803e-05, + "grad_norm": 29.333351135253906, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8853201866149902, + "num_tokens": 565374380.0, + "step": 14823 + }, + { + "epoch": 1.885765169825722, + "ewc_loss": 0.04460866376757622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2304331650957465e-05, + "grad_norm": 29.203676223754883, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8602758049964905, + "num_tokens": 565413096.0, + "step": 14824 + }, + { + "epoch": 1.8858923801043126, + "ewc_loss": 0.04464316740632057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2321582946460694e-05, + "grad_norm": 29.29075813293457, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8766297101974487, + "num_tokens": 565453292.0, + "step": 14825 + }, + { + "epoch": 1.886019590382903, + "ewc_loss": 0.044639259576797485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2319629351841286e-05, + "grad_norm": 29.198135375976562, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8625538349151611, + "num_tokens": 565489311.0, + "step": 14826 + }, + { + "epoch": 
1.8861468006614934, + "ewc_loss": 0.044703155755996704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2351578081725165e-05, + "grad_norm": 29.23674964904785, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8509942889213562, + "num_tokens": 565526293.0, + "step": 14827 + }, + { + "epoch": 1.886274010940084, + "ewc_loss": 0.04465293139219284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.232646511401981e-05, + "grad_norm": 29.307920455932617, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8610569834709167, + "num_tokens": 565569215.0, + "step": 14828 + }, + { + "epoch": 1.8864012212186745, + "ewc_loss": 0.044675178825855255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2337590053211898e-05, + "grad_norm": 29.251184463500977, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8782594203948975, + "num_tokens": 565610755.0, + "step": 14829 + }, + { + "epoch": 1.886528431497265, + "ewc_loss": 0.044660765677690506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233038321719505e-05, + "grad_norm": 29.220422744750977, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8728907108306885, + "num_tokens": 565641964.0, + "step": 14830 + }, + { + "epoch": 1.8866556417758555, + "ewc_loss": 0.04468114301562309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.234057137684431e-05, + "grad_norm": 29.238109588623047, + "learning_rate": 1e-06, + "loss": 0.5328, + "mean_token_accuracy": 0.8339132070541382, + "num_tokens": 565682852.0, + "step": 14831 + }, + { + "epoch": 1.8867828520544458, + "ewc_loss": 0.04467344284057617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2336720576277003e-05, + "grad_norm": 29.130401611328125, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8835504055023193, + "num_tokens": 565721352.0, + "step": 14832 + }, + { + "epoch": 1.8869100623330364, + "ewc_loss": 0.044714607298374176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2357304260367528e-05, + "grad_norm": 29.274059295654297, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8717169165611267, + "num_tokens": 565764045.0, + "step": 14833 + }, + { + "epoch": 1.887037272611627, + "ewc_loss": 0.04472014307975769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236007094325032e-05, + "grad_norm": 29.237653732299805, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8756017088890076, + "num_tokens": 565797617.0, + "step": 14834 + }, + { + "epoch": 1.8871644828902174, + "ewc_loss": 0.04471004009246826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2355019609676674e-05, + "grad_norm": 29.17243766784668, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8664975762367249, + "num_tokens": 565839254.0, + "step": 14835 + }, + { + "epoch": 1.887291693168808, + "ewc_loss": 0.0447164885699749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.235824467788916e-05, + "grad_norm": 29.100971221923828, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.867394745349884, + "num_tokens": 565874742.0, + "step": 14836 + }, + { + "epoch": 1.8874189034473985, + "ewc_loss": 0.04476053640246391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2380269001587294e-05, + "grad_norm": 29.320226669311523, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8677642345428467, + "num_tokens": 565911043.0, + "step": 14837 + }, + { + "epoch": 1.887546113725989, + "ewc_loss": 0.044764671474695206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2382335373549722e-05, + "grad_norm": 29.106109619140625, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8627327084541321, + "num_tokens": 565950228.0, + "step": 14838 + }, + { + "epoch": 1.8876733240045795, + "ewc_loss": 0.04479803889989853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2399019144359045e-05, + "grad_norm": 29.423812866210938, + "learning_rate": 1e-06, + "loss": 0.3891, + 
"mean_token_accuracy": 0.8793824911117554, + "num_tokens": 565982891.0, + "step": 14839 + }, + { + "epoch": 1.88780053428317, + "ewc_loss": 0.04480884224176407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2404421542887576e-05, + "grad_norm": 29.153738021850586, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8656584620475769, + "num_tokens": 566022601.0, + "step": 14840 + }, + { + "epoch": 1.8879277445617606, + "ewc_loss": 0.04465247318148613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2326235921354964e-05, + "grad_norm": 29.31987762451172, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8683926463127136, + "num_tokens": 566054928.0, + "step": 14841 + }, + { + "epoch": 1.8880549548403511, + "ewc_loss": 0.04480016604065895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.240008325316012e-05, + "grad_norm": 29.243009567260742, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8605618476867676, + "num_tokens": 566094900.0, + "step": 14842 + }, + { + "epoch": 1.8881821651189417, + "ewc_loss": 0.04471644014120102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.235821921203751e-05, + "grad_norm": 29.15704917907715, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.866115927696228, + "num_tokens": 566125230.0, + "step": 14843 + }, + { + "epoch": 1.8883093753975322, + "ewc_loss": 0.04472882300615311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236441105196718e-05, + "grad_norm": 29.098329544067383, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8821202516555786, + "num_tokens": 566165320.0, + "step": 14844 + }, + { + "epoch": 1.8884365856761227, + "ewc_loss": 0.044803738594055176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.24018695007544e-05, + "grad_norm": 29.210323333740234, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8576511144638062, + "num_tokens": 566202769.0, + "step": 14845 + }, + { + "epoch": 
1.8885637959547132, + "ewc_loss": 0.044777244329452515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2388621800928377e-05, + "grad_norm": 29.125802993774414, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8685259819030762, + "num_tokens": 566245874.0, + "step": 14846 + }, + { + "epoch": 1.8886910062333038, + "ewc_loss": 0.04473040625452995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2365202312357724e-05, + "grad_norm": 29.202831268310547, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.849873423576355, + "num_tokens": 566284855.0, + "step": 14847 + }, + { + "epoch": 1.8888182165118943, + "ewc_loss": 0.04485337436199188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242668779217638e-05, + "grad_norm": 29.149877548217773, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8591539859771729, + "num_tokens": 566323422.0, + "step": 14848 + }, + { + "epoch": 1.8889454267904848, + "ewc_loss": 0.04472104460000992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23605220526224e-05, + "grad_norm": 29.21272087097168, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8746544122695923, + "num_tokens": 566364357.0, + "step": 14849 + }, + { + "epoch": 1.8890726370690751, + "ewc_loss": 0.04478718712925911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2393593098968267e-05, + "grad_norm": 29.25175666809082, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8648793697357178, + "num_tokens": 566404498.0, + "step": 14850 + }, + { + "epoch": 1.8891998473476657, + "ewc_loss": 0.04485658183693886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2428290321840905e-05, + "grad_norm": 29.325223922729492, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8639383316040039, + "num_tokens": 566442960.0, + "step": 14851 + }, + { + "epoch": 1.8893270576262562, + "ewc_loss": 0.04478093609213829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2390468075172976e-05, + "grad_norm": 29.196998596191406, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8759863376617432, + "num_tokens": 566475087.0, + "step": 14852 + }, + { + "epoch": 1.8894542679048467, + "ewc_loss": 0.04465348646044731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2326743419398554e-05, + "grad_norm": 29.299577713012695, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8574594855308533, + "num_tokens": 566514337.0, + "step": 14853 + }, + { + "epoch": 1.8895814781834372, + "ewc_loss": 0.04473228380084038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2366142729879357e-05, + "grad_norm": 29.2053279876709, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8574162125587463, + "num_tokens": 566553204.0, + "step": 14854 + }, + { + "epoch": 1.8897086884620278, + "ewc_loss": 0.044690851122140884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2345426259562373e-05, + "grad_norm": 29.253549575805664, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8657934069633484, + "num_tokens": 566589697.0, + "step": 14855 + }, + { + "epoch": 1.889835898740618, + "ewc_loss": 0.04477550461888313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2387752323993482e-05, + "grad_norm": 29.319416046142578, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8532810211181641, + "num_tokens": 566629680.0, + "step": 14856 + }, + { + "epoch": 1.8899631090192086, + "ewc_loss": 0.04468866065144539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2344329408952035e-05, + "grad_norm": 29.224571228027344, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8897657990455627, + "num_tokens": 566670931.0, + "step": 14857 + }, + { + "epoch": 1.8900903192977991, + "ewc_loss": 0.04465612769126892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2328064005705528e-05, + "grad_norm": 29.185327529907227, + "learning_rate": 1e-06, + "loss": 0.4407, + 
"mean_token_accuracy": 0.8620470762252808, + "num_tokens": 566706673.0, + "step": 14858 + }, + { + "epoch": 1.8902175295763897, + "ewc_loss": 0.04481792077422142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2408959921449423e-05, + "grad_norm": 29.21773338317871, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8633479475975037, + "num_tokens": 566749503.0, + "step": 14859 + }, + { + "epoch": 1.8903447398549802, + "ewc_loss": 0.04464738443493843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2323692974168807e-05, + "grad_norm": 29.162033081054688, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8816397190093994, + "num_tokens": 566790700.0, + "step": 14860 + }, + { + "epoch": 1.8904719501335707, + "ewc_loss": 0.04475404694676399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2377023924491368e-05, + "grad_norm": 29.211097717285156, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8660744428634644, + "num_tokens": 566832995.0, + "step": 14861 + }, + { + "epoch": 1.8905991604121613, + "ewc_loss": 0.0447649210691452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2382460883818567e-05, + "grad_norm": 29.342790603637695, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8550143241882324, + "num_tokens": 566868190.0, + "step": 14862 + }, + { + "epoch": 1.8907263706907518, + "ewc_loss": 0.044760994613170624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.238049819425214e-05, + "grad_norm": 29.28584098815918, + "learning_rate": 1e-06, + "loss": 0.5128, + "mean_token_accuracy": 0.8394728899002075, + "num_tokens": 566906637.0, + "step": 14863 + }, + { + "epoch": 1.8908535809693423, + "ewc_loss": 0.04461954906582832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2309774067252874e-05, + "grad_norm": 29.189146041870117, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8674052953720093, + "num_tokens": 566946006.0, + "step": 14864 + }, + { + "epoch": 
1.8909807912479328, + "ewc_loss": 0.0447341650724411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236708314740099e-05, + "grad_norm": 29.24429702758789, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8555614352226257, + "num_tokens": 566987726.0, + "step": 14865 + }, + { + "epoch": 1.8911080015265234, + "ewc_loss": 0.04479111358523369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2395557607524097e-05, + "grad_norm": 29.229280471801758, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8820124268531799, + "num_tokens": 567027432.0, + "step": 14866 + }, + { + "epoch": 1.891235211805114, + "ewc_loss": 0.044614799320697784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2307400286081247e-05, + "grad_norm": 29.162302017211914, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8735255002975464, + "num_tokens": 567060607.0, + "step": 14867 + }, + { + "epoch": 1.8913624220837044, + "ewc_loss": 0.0448320098221302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2416004867409356e-05, + "grad_norm": 29.24324607849121, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8598410487174988, + "num_tokens": 567100691.0, + "step": 14868 + }, + { + "epoch": 1.891489632362295, + "ewc_loss": 0.044728510081768036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2364254618878476e-05, + "grad_norm": 29.214370727539062, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8863111734390259, + "num_tokens": 567139460.0, + "step": 14869 + }, + { + "epoch": 1.8916168426408855, + "ewc_loss": 0.044781215488910675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.239060813735705e-05, + "grad_norm": 29.222198486328125, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.855790376663208, + "num_tokens": 567185384.0, + "step": 14870 + }, + { + "epoch": 1.891744052919476, + "ewc_loss": 0.044682081788778305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2341040676110424e-05, + "grad_norm": 29.16915512084961, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8657101392745972, + "num_tokens": 567219898.0, + "step": 14871 + }, + { + "epoch": 1.8918712631980665, + "ewc_loss": 0.04485141113400459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2425705537898466e-05, + "grad_norm": 29.25564956665039, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8746172189712524, + "num_tokens": 567260322.0, + "step": 14872 + }, + { + "epoch": 1.891998473476657, + "ewc_loss": 0.04482698068022728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2413491024053656e-05, + "grad_norm": 29.293691635131836, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8623315095901489, + "num_tokens": 567293739.0, + "step": 14873 + }, + { + "epoch": 1.8921256837552476, + "ewc_loss": 0.044741466641426086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2370733859133907e-05, + "grad_norm": 29.114103317260742, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8681302666664124, + "num_tokens": 567327282.0, + "step": 14874 + }, + { + "epoch": 1.892252894033838, + "ewc_loss": 0.04475812986493111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2379064830602147e-05, + "grad_norm": 29.33591079711914, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8605103492736816, + "num_tokens": 567366402.0, + "step": 14875 + }, + { + "epoch": 1.8923801043124284, + "ewc_loss": 0.04477878659963608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.238939305243548e-05, + "grad_norm": 29.19816017150879, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8597742319107056, + "num_tokens": 567409924.0, + "step": 14876 + }, + { + "epoch": 1.892507314591019, + "ewc_loss": 0.04466106370091438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233053237432614e-05, + "grad_norm": 29.320446014404297, + "learning_rate": 1e-06, + "loss": 0.425, + 
"mean_token_accuracy": 0.8696277737617493, + "num_tokens": 567450355.0, + "step": 14877 + }, + { + "epoch": 1.8926345248696095, + "ewc_loss": 0.044747043401002884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2373522369889542e-05, + "grad_norm": 29.16806983947754, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8804247975349426, + "num_tokens": 567487701.0, + "step": 14878 + }, + { + "epoch": 1.8927617351482, + "ewc_loss": 0.04469965770840645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2349828213918954e-05, + "grad_norm": 29.308250427246094, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8672297596931458, + "num_tokens": 567527619.0, + "step": 14879 + }, + { + "epoch": 1.8928889454267905, + "ewc_loss": 0.044737204909324646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236860200355295e-05, + "grad_norm": 29.2072811126709, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8670221567153931, + "num_tokens": 567567016.0, + "step": 14880 + }, + { + "epoch": 1.8930161557053808, + "ewc_loss": 0.04472874104976654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2364371034200303e-05, + "grad_norm": 29.309009552001953, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.855640172958374, + "num_tokens": 567605465.0, + "step": 14881 + }, + { + "epoch": 1.8931433659839714, + "ewc_loss": 0.0447867177426815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.239335844933521e-05, + "grad_norm": 29.121274948120117, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8702120780944824, + "num_tokens": 567646680.0, + "step": 14882 + }, + { + "epoch": 1.893270576262562, + "ewc_loss": 0.04458574950695038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2292875655693933e-05, + "grad_norm": 29.180438995361328, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8682358860969543, + "num_tokens": 567685685.0, + "step": 14883 + }, + { + "epoch": 
1.8933977865411524, + "ewc_loss": 0.04479333758354187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2396669010049663e-05, + "grad_norm": 29.191455841064453, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8566460609436035, + "num_tokens": 567717222.0, + "step": 14884 + }, + { + "epoch": 1.893524996819743, + "ewc_loss": 0.044717468321323395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2358733986038715e-05, + "grad_norm": 29.183557510375977, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8574603796005249, + "num_tokens": 567761050.0, + "step": 14885 + }, + { + "epoch": 1.8936522070983335, + "ewc_loss": 0.04477490112185478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2387450371752493e-05, + "grad_norm": 29.18617820739746, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8658008575439453, + "num_tokens": 567801423.0, + "step": 14886 + }, + { + "epoch": 1.893779417376924, + "ewc_loss": 0.04479779675602913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.239889909105841e-05, + "grad_norm": 29.178714752197266, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8582499027252197, + "num_tokens": 567844614.0, + "step": 14887 + }, + { + "epoch": 1.8939066276555145, + "ewc_loss": 0.04484349489212036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242174741695635e-05, + "grad_norm": 29.24490737915039, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8705124258995056, + "num_tokens": 567878803.0, + "step": 14888 + }, + { + "epoch": 1.894033837934105, + "ewc_loss": 0.044822461903095245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.241123183921445e-05, + "grad_norm": 29.328088760375977, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8790157437324524, + "num_tokens": 567909419.0, + "step": 14889 + }, + { + "epoch": 1.8941610482126956, + "ewc_loss": 0.04476892575621605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.238446359115187e-05, + "grad_norm": 29.267614364624023, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8685874938964844, + "num_tokens": 567949221.0, + "step": 14890 + }, + { + "epoch": 1.8942882584912861, + "ewc_loss": 0.04477819427847862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2389096557162702e-05, + "grad_norm": 29.254274368286133, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8831981420516968, + "num_tokens": 567985101.0, + "step": 14891 + }, + { + "epoch": 1.8944154687698767, + "ewc_loss": 0.04478507861495018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2392539904103614e-05, + "grad_norm": 29.176189422607422, + "learning_rate": 1e-06, + "loss": 0.4741, + "mean_token_accuracy": 0.8505774736404419, + "num_tokens": 568030542.0, + "step": 14892 + }, + { + "epoch": 1.8945426790484672, + "ewc_loss": 0.044723544269800186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2361771698342636e-05, + "grad_norm": 29.190889358520508, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.869020938873291, + "num_tokens": 568070723.0, + "step": 14893 + }, + { + "epoch": 1.8946698893270577, + "ewc_loss": 0.04480533301830292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2402666218113154e-05, + "grad_norm": 29.2352294921875, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8598092198371887, + "num_tokens": 568113444.0, + "step": 14894 + }, + { + "epoch": 1.8947970996056482, + "ewc_loss": 0.04480388015508652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.240194044134114e-05, + "grad_norm": 29.277935028076172, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8703840970993042, + "num_tokens": 568152646.0, + "step": 14895 + }, + { + "epoch": 1.8949243098842388, + "ewc_loss": 0.04482908546924591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2414542399928905e-05, + "grad_norm": 29.318260192871094, + "learning_rate": 1e-06, + "loss": 0.432, + 
"mean_token_accuracy": 0.8667463064193726, + "num_tokens": 568187046.0, + "step": 14896 + }, + { + "epoch": 1.8950515201628293, + "ewc_loss": 0.0447855181992054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2392760001821443e-05, + "grad_norm": 29.20631980895996, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8631761074066162, + "num_tokens": 568225052.0, + "step": 14897 + }, + { + "epoch": 1.8951787304414198, + "ewc_loss": 0.04471341893076897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2356709450832568e-05, + "grad_norm": 29.310998916625977, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8587409257888794, + "num_tokens": 568258299.0, + "step": 14898 + }, + { + "epoch": 1.8953059407200101, + "ewc_loss": 0.044789668172597885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2394833649741486e-05, + "grad_norm": 29.158550262451172, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8734632730484009, + "num_tokens": 568297137.0, + "step": 14899 + }, + { + "epoch": 1.8954331509986007, + "ewc_loss": 0.04476045072078705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.238022534584161e-05, + "grad_norm": 29.34503746032715, + "learning_rate": 1e-06, + "loss": 0.3588, + "mean_token_accuracy": 0.8888379335403442, + "num_tokens": 568331102.0, + "step": 14900 + }, + { + "epoch": 1.8955603612771912, + "ewc_loss": 0.044855035841464996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2427517251344398e-05, + "grad_norm": 29.106292724609375, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8654868602752686, + "num_tokens": 568373926.0, + "step": 14901 + }, + { + "epoch": 1.8956875715557817, + "ewc_loss": 0.04469171538949013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2345857360051014e-05, + "grad_norm": 29.15226173400879, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8754129409790039, + "num_tokens": 568406470.0, + "step": 14902 + }, + { + "epoch": 
1.8958147818343722, + "ewc_loss": 0.04487359896302223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2436799554270692e-05, + "grad_norm": 29.280670166015625, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8748098015785217, + "num_tokens": 568443452.0, + "step": 14903 + }, + { + "epoch": 1.8959419921129628, + "ewc_loss": 0.04477778822183609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.238889464933891e-05, + "grad_norm": 29.18154525756836, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8618127107620239, + "num_tokens": 568480085.0, + "step": 14904 + }, + { + "epoch": 1.896069202391553, + "ewc_loss": 0.04484453424811363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2422267647925764e-05, + "grad_norm": 29.32925033569336, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8637892603874207, + "num_tokens": 568519191.0, + "step": 14905 + }, + { + "epoch": 1.8961964126701436, + "ewc_loss": 0.04490797221660614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2453985366155393e-05, + "grad_norm": 29.28598976135254, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8732997179031372, + "num_tokens": 568554224.0, + "step": 14906 + }, + { + "epoch": 1.8963236229487341, + "ewc_loss": 0.04480818659067154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2404094124794938e-05, + "grad_norm": 29.270401000976562, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8842648267745972, + "num_tokens": 568586102.0, + "step": 14907 + }, + { + "epoch": 1.8964508332273247, + "ewc_loss": 0.04487309232354164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.24365467147436e-05, + "grad_norm": 29.19452476501465, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8616822957992554, + "num_tokens": 568625224.0, + "step": 14908 + }, + { + "epoch": 1.8965780435059152, + "ewc_loss": 0.04483439400792122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.241719630546868e-05, + "grad_norm": 29.26434898376465, + "learning_rate": 1e-06, + "loss": 0.5002, + "mean_token_accuracy": 0.8458641767501831, + "num_tokens": 568665588.0, + "step": 14909 + }, + { + "epoch": 1.8967052537845057, + "ewc_loss": 0.044844236224889755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2422118490794674e-05, + "grad_norm": 29.194931030273438, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8840259909629822, + "num_tokens": 568698408.0, + "step": 14910 + }, + { + "epoch": 1.8968324640630962, + "ewc_loss": 0.04479158669710159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2395794076146558e-05, + "grad_norm": 29.256698608398438, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8552998900413513, + "num_tokens": 568731982.0, + "step": 14911 + }, + { + "epoch": 1.8969596743416868, + "ewc_loss": 0.04485533386468887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242766640847549e-05, + "grad_norm": 29.207059860229492, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8662134408950806, + "num_tokens": 568769388.0, + "step": 14912 + }, + { + "epoch": 1.8970868846202773, + "ewc_loss": 0.04480602219700813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2403011826099828e-05, + "grad_norm": 29.28354835510254, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8742935657501221, + "num_tokens": 568810329.0, + "step": 14913 + }, + { + "epoch": 1.8972140948988678, + "ewc_loss": 0.044836509972810745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2418254957301542e-05, + "grad_norm": 29.2630558013916, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8721272349357605, + "num_tokens": 568852241.0, + "step": 14914 + }, + { + "epoch": 1.8973413051774584, + "ewc_loss": 0.04473569244146347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2367845303961076e-05, + "grad_norm": 29.155988693237305, + "learning_rate": 1e-06, + "loss": 0.4423, + 
"mean_token_accuracy": 0.8624356985092163, + "num_tokens": 568889660.0, + "step": 14915 + }, + { + "epoch": 1.8974685154560489, + "ewc_loss": 0.04488244652748108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2441223336500116e-05, + "grad_norm": 29.461091995239258, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8647467494010925, + "num_tokens": 568934489.0, + "step": 14916 + }, + { + "epoch": 1.8975957257346394, + "ewc_loss": 0.04477078095078468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.238539127574768e-05, + "grad_norm": 29.33884620666504, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8884932994842529, + "num_tokens": 568971904.0, + "step": 14917 + }, + { + "epoch": 1.89772293601323, + "ewc_loss": 0.04481081664562225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.24054092541337e-05, + "grad_norm": 29.235694885253906, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8484640717506409, + "num_tokens": 569011863.0, + "step": 14918 + }, + { + "epoch": 1.8978501462918205, + "ewc_loss": 0.04474254325032234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2371272279997356e-05, + "grad_norm": 29.401947021484375, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8878375291824341, + "num_tokens": 569050295.0, + "step": 14919 + }, + { + "epoch": 1.897977356570411, + "ewc_loss": 0.04491826519370079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2459133106167428e-05, + "grad_norm": 29.41905975341797, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8615071177482605, + "num_tokens": 569087013.0, + "step": 14920 + }, + { + "epoch": 1.8981045668490015, + "ewc_loss": 0.044677671045064926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2338836060953327e-05, + "grad_norm": 29.208511352539062, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8611487150192261, + "num_tokens": 569125226.0, + "step": 14921 + }, + { + "epoch": 
1.898231777127592, + "ewc_loss": 0.044773299247026443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2386650016414933e-05, + "grad_norm": 29.344694137573242, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8642385601997375, + "num_tokens": 569161910.0, + "step": 14922 + }, + { + "epoch": 1.8983589874061826, + "ewc_loss": 0.04475674033164978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2378369976649992e-05, + "grad_norm": 29.280614852905273, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8696630001068115, + "num_tokens": 569196519.0, + "step": 14923 + }, + { + "epoch": 1.898486197684773, + "ewc_loss": 0.044660162180662155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.233008126495406e-05, + "grad_norm": 29.20412254333496, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8630919456481934, + "num_tokens": 569234404.0, + "step": 14924 + }, + { + "epoch": 1.8986134079633634, + "ewc_loss": 0.04474562406539917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.237281296402216e-05, + "grad_norm": 29.24391746520996, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8590025901794434, + "num_tokens": 569272728.0, + "step": 14925 + }, + { + "epoch": 1.898740618241954, + "ewc_loss": 0.04480824992060661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2404125047614798e-05, + "grad_norm": 29.38109588623047, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8591703772544861, + "num_tokens": 569315434.0, + "step": 14926 + }, + { + "epoch": 1.8988678285205445, + "ewc_loss": 0.04477882385253906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2389411242329516e-05, + "grad_norm": 29.269201278686523, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8504492044448853, + "num_tokens": 569357473.0, + "step": 14927 + }, + { + "epoch": 1.898995038799135, + "ewc_loss": 0.044657330960035324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.23286660911981e-05, + "grad_norm": 29.255687713623047, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8552064299583435, + "num_tokens": 569398747.0, + "step": 14928 + }, + { + "epoch": 1.8991222490777255, + "ewc_loss": 0.04475090280175209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2375452317646705e-05, + "grad_norm": 29.278533935546875, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8744980692863464, + "num_tokens": 569434855.0, + "step": 14929 + }, + { + "epoch": 1.8992494593563158, + "ewc_loss": 0.044742073863744736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23710376303643e-05, + "grad_norm": 29.30963134765625, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8678720593452454, + "num_tokens": 569475236.0, + "step": 14930 + }, + { + "epoch": 1.8993766696349064, + "ewc_loss": 0.04479246959090233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2396234271582216e-05, + "grad_norm": 29.298547744750977, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8657333254814148, + "num_tokens": 569513599.0, + "step": 14931 + }, + { + "epoch": 1.899503879913497, + "ewc_loss": 0.044726550579071045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.236327600257937e-05, + "grad_norm": 29.439111709594727, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8743124604225159, + "num_tokens": 569559059.0, + "step": 14932 + }, + { + "epoch": 1.8996310901920874, + "ewc_loss": 0.04473632574081421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2368163627106696e-05, + "grad_norm": 29.303678512573242, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8847494721412659, + "num_tokens": 569595240.0, + "step": 14933 + }, + { + "epoch": 1.899758300470678, + "ewc_loss": 0.044600773602724075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2300386262941174e-05, + "grad_norm": 29.329896926879883, + "learning_rate": 1e-06, + "loss": 0.4584, + 
"mean_token_accuracy": 0.8564218282699585, + "num_tokens": 569629336.0, + "step": 14934 + }, + { + "epoch": 1.8998855107492685, + "ewc_loss": 0.044709451496601105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23547249333933e-05, + "grad_norm": 29.337247848510742, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8776721954345703, + "num_tokens": 569663613.0, + "step": 14935 + }, + { + "epoch": 1.900012721027859, + "ewc_loss": 0.04463820159435272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2319100025924854e-05, + "grad_norm": 29.200464248657227, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8512783050537109, + "num_tokens": 569702884.0, + "step": 14936 + }, + { + "epoch": 1.9001399313064495, + "ewc_loss": 0.044707536697387695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2353768144967034e-05, + "grad_norm": 29.313322067260742, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8819084763526917, + "num_tokens": 569744618.0, + "step": 14937 + }, + { + "epoch": 1.90026714158504, + "ewc_loss": 0.04469599947333336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.234800012956839e-05, + "grad_norm": 29.22266960144043, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8759965896606445, + "num_tokens": 569777083.0, + "step": 14938 + }, + { + "epoch": 1.9003943518636306, + "ewc_loss": 0.044676244258880615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2338121198117733e-05, + "grad_norm": 29.10357093811035, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8748865127563477, + "num_tokens": 569809652.0, + "step": 14939 + }, + { + "epoch": 1.9005215621422211, + "ewc_loss": 0.044841207563877106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242060327262152e-05, + "grad_norm": 29.275192260742188, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8723200559616089, + "num_tokens": 569846465.0, + "step": 14940 + }, + { + "epoch": 
1.9006487724208116, + "ewc_loss": 0.04482205957174301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2411029931390658e-05, + "grad_norm": 29.184661865234375, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.860565721988678, + "num_tokens": 569883655.0, + "step": 14941 + }, + { + "epoch": 1.9007759826994022, + "ewc_loss": 0.044812966138124466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2406482457881793e-05, + "grad_norm": 29.276182174682617, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8654979467391968, + "num_tokens": 569923358.0, + "step": 14942 + }, + { + "epoch": 1.9009031929779927, + "ewc_loss": 0.04474230855703354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2371154045686126e-05, + "grad_norm": 29.220932006835938, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8629595041275024, + "num_tokens": 569961393.0, + "step": 14943 + }, + { + "epoch": 1.9010304032565832, + "ewc_loss": 0.04479887709021568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2399439330911264e-05, + "grad_norm": 29.150318145751953, + "learning_rate": 1e-06, + "loss": 0.5165, + "mean_token_accuracy": 0.8401063680648804, + "num_tokens": 570002092.0, + "step": 14944 + }, + { + "epoch": 1.9011576135351738, + "ewc_loss": 0.044790469110012054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2395233827410266e-05, + "grad_norm": 29.35141372680664, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8524760603904724, + "num_tokens": 570037699.0, + "step": 14945 + }, + { + "epoch": 1.9012848238137643, + "ewc_loss": 0.04489904269576073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2449521566159092e-05, + "grad_norm": 29.25876235961914, + "learning_rate": 1e-06, + "loss": 0.4993, + "mean_token_accuracy": 0.8439119458198547, + "num_tokens": 570077693.0, + "step": 14946 + }, + { + "epoch": 1.9014120340923548, + "ewc_loss": 0.04482189565896988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.24109480768675e-05, + "grad_norm": 29.309886932373047, + "learning_rate": 1e-06, + "loss": 0.3762, + "mean_token_accuracy": 0.8865479230880737, + "num_tokens": 570115354.0, + "step": 14947 + }, + { + "epoch": 1.9015392443709451, + "ewc_loss": 0.044932205229997635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2466103473561816e-05, + "grad_norm": 29.294824600219727, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8752561211585999, + "num_tokens": 570154387.0, + "step": 14948 + }, + { + "epoch": 1.9016664546495357, + "ewc_loss": 0.044769287109375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2384643671102822e-05, + "grad_norm": 29.163204193115234, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.863470733165741, + "num_tokens": 570189743.0, + "step": 14949 + }, + { + "epoch": 1.9017936649281262, + "ewc_loss": 0.044891681522130966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2445839931606315e-05, + "grad_norm": 29.308109283447266, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8784345984458923, + "num_tokens": 570224009.0, + "step": 14950 + }, + { + "epoch": 1.9019208752067167, + "ewc_loss": 0.04486975818872452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2434878701460548e-05, + "grad_norm": 29.254051208496094, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8742455244064331, + "num_tokens": 570266703.0, + "step": 14951 + }, + { + "epoch": 1.9020480854853072, + "ewc_loss": 0.04486234113574028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2431169782066718e-05, + "grad_norm": 29.30211067199707, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8664775490760803, + "num_tokens": 570308241.0, + "step": 14952 + }, + { + "epoch": 1.9021752957638978, + "ewc_loss": 0.04493017494678497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2465086658485234e-05, + "grad_norm": 29.271163940429688, + "learning_rate": 1e-06, + "loss": 0.4361, + 
"mean_token_accuracy": 0.8671339750289917, + "num_tokens": 570346931.0, + "step": 14953 + }, + { + "epoch": 1.902302506042488, + "ewc_loss": 0.04480251297354698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2401256501325406e-05, + "grad_norm": 29.239376068115234, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8807562589645386, + "num_tokens": 570380373.0, + "step": 14954 + }, + { + "epoch": 1.9024297163210786, + "ewc_loss": 0.04486502707004547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2432514015235938e-05, + "grad_norm": 29.207059860229492, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8792258501052856, + "num_tokens": 570420106.0, + "step": 14955 + }, + { + "epoch": 1.9025569265996691, + "ewc_loss": 0.044775135815143585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2387568606063724e-05, + "grad_norm": 29.251869201660156, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8884139060974121, + "num_tokens": 570464534.0, + "step": 14956 + }, + { + "epoch": 1.9026841368782597, + "ewc_loss": 0.04488867148756981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2444335627369583e-05, + "grad_norm": 29.281959533691406, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8565314412117004, + "num_tokens": 570505734.0, + "step": 14957 + }, + { + "epoch": 1.9028113471568502, + "ewc_loss": 0.04482463002204895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.241231413790956e-05, + "grad_norm": 29.213228225708008, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8764259815216064, + "num_tokens": 570545913.0, + "step": 14958 + }, + { + "epoch": 1.9029385574354407, + "ewc_loss": 0.04477850720286369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2389252990251407e-05, + "grad_norm": 29.312009811401367, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.874614417552948, + "num_tokens": 570584397.0, + "step": 14959 + }, + { + "epoch": 
1.9030657677140312, + "ewc_loss": 0.0448615625500679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.243078051833436e-05, + "grad_norm": 29.199567794799805, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8734644651412964, + "num_tokens": 570622285.0, + "step": 14960 + }, + { + "epoch": 1.9031929779926218, + "ewc_loss": 0.04474777728319168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2373887986759655e-05, + "grad_norm": 29.268423080444336, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8527895212173462, + "num_tokens": 570656331.0, + "step": 14961 + }, + { + "epoch": 1.9033201882712123, + "ewc_loss": 0.04486648365855217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2433241610997356e-05, + "grad_norm": 29.23253631591797, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8719898462295532, + "num_tokens": 570699944.0, + "step": 14962 + }, + { + "epoch": 1.9034473985498028, + "ewc_loss": 0.04483441263437271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2417205400415696e-05, + "grad_norm": 29.268362045288086, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8615833520889282, + "num_tokens": 570737649.0, + "step": 14963 + }, + { + "epoch": 1.9035746088283934, + "ewc_loss": 0.04479271173477173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2396356143872254e-05, + "grad_norm": 29.24553871154785, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8654444813728333, + "num_tokens": 570773193.0, + "step": 14964 + }, + { + "epoch": 1.9037018191069839, + "ewc_loss": 0.04482528567314148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2412643374991603e-05, + "grad_norm": 29.386962890625, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8606094121932983, + "num_tokens": 570809117.0, + "step": 14965 + }, + { + "epoch": 1.9038290293855744, + "ewc_loss": 0.044871482998132706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.243574090243783e-05, + "grad_norm": 29.265993118286133, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.879697322845459, + "num_tokens": 570844268.0, + "step": 14966 + }, + { + "epoch": 1.903956239664165, + "ewc_loss": 0.04477919265627861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2389596779248677e-05, + "grad_norm": 29.285598754882812, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8588680028915405, + "num_tokens": 570890166.0, + "step": 14967 + }, + { + "epoch": 1.9040834499427555, + "ewc_loss": 0.04480001702904701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2400008674594574e-05, + "grad_norm": 29.333770751953125, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8615382313728333, + "num_tokens": 570921419.0, + "step": 14968 + }, + { + "epoch": 1.904210660221346, + "ewc_loss": 0.04473608732223511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2368043573806062e-05, + "grad_norm": 29.076183319091797, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8582264184951782, + "num_tokens": 570966444.0, + "step": 14969 + }, + { + "epoch": 1.9043378704999365, + "ewc_loss": 0.044871531426906586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2435766368289478e-05, + "grad_norm": 29.270448684692383, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8780562877655029, + "num_tokens": 571011656.0, + "step": 14970 + }, + { + "epoch": 1.904465080778527, + "ewc_loss": 0.0448344387114048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2417219952330925e-05, + "grad_norm": 29.206249237060547, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8719691038131714, + "num_tokens": 571047828.0, + "step": 14971 + }, + { + "epoch": 1.9045922910571176, + "ewc_loss": 0.04486900568008423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2434502170654014e-05, + "grad_norm": 29.309825897216797, + "learning_rate": 1e-06, + "loss": 0.4479, + 
"mean_token_accuracy": 0.8651746511459351, + "num_tokens": 571084541.0, + "step": 14972 + }, + { + "epoch": 1.9047195013357079, + "ewc_loss": 0.04490257054567337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.245128598588053e-05, + "grad_norm": 29.316986083984375, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8618271350860596, + "num_tokens": 571120978.0, + "step": 14973 + }, + { + "epoch": 1.9048467116142984, + "ewc_loss": 0.04486380144953728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2431901015806943e-05, + "grad_norm": 29.171234130859375, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8816304206848145, + "num_tokens": 571160181.0, + "step": 14974 + }, + { + "epoch": 1.904973921892889, + "ewc_loss": 0.04491310939192772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2456555598182604e-05, + "grad_norm": 29.436378479003906, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8726681470870972, + "num_tokens": 571194648.0, + "step": 14975 + }, + { + "epoch": 1.9051011321714795, + "ewc_loss": 0.04494938254356384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2474690922535956e-05, + "grad_norm": 29.168010711669922, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8629879951477051, + "num_tokens": 571238449.0, + "step": 14976 + }, + { + "epoch": 1.90522834245007, + "ewc_loss": 0.044799286872148514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.239964305772446e-05, + "grad_norm": 29.43686294555664, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8676984906196594, + "num_tokens": 571277266.0, + "step": 14977 + }, + { + "epoch": 1.9053555527286605, + "ewc_loss": 0.04498078674077988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2490392439067364e-05, + "grad_norm": 29.304101943969727, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8796818256378174, + "num_tokens": 571316891.0, + "step": 14978 + }, + { + "epoch": 
1.9054827630072508, + "ewc_loss": 0.04477521777153015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.23876086238306e-05, + "grad_norm": 29.288599014282227, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.868748664855957, + "num_tokens": 571353097.0, + "step": 14979 + }, + { + "epoch": 1.9056099732858414, + "ewc_loss": 0.04491977021098137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2459884348791093e-05, + "grad_norm": 29.384092330932617, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8819888830184937, + "num_tokens": 571392217.0, + "step": 14980 + }, + { + "epoch": 1.905737183564432, + "ewc_loss": 0.04486658424139023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2433292542700656e-05, + "grad_norm": 29.30913543701172, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8762089610099792, + "num_tokens": 571430440.0, + "step": 14981 + }, + { + "epoch": 1.9058643938430224, + "ewc_loss": 0.04481598734855652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.240799403807614e-05, + "grad_norm": 29.28427505493164, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8639506697654724, + "num_tokens": 571472002.0, + "step": 14982 + }, + { + "epoch": 1.905991604121613, + "ewc_loss": 0.044881559908390045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.244077950308565e-05, + "grad_norm": 29.29987144470215, + "learning_rate": 1e-06, + "loss": 0.4939, + "mean_token_accuracy": 0.8492891788482666, + "num_tokens": 571509368.0, + "step": 14983 + }, + { + "epoch": 1.9061188144002035, + "ewc_loss": 0.04479437321424484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2397187422029674e-05, + "grad_norm": 29.281614303588867, + "learning_rate": 1e-06, + "loss": 0.4834, + "mean_token_accuracy": 0.8530384302139282, + "num_tokens": 571543898.0, + "step": 14984 + }, + { + "epoch": 1.906246024678794, + "ewc_loss": 0.04493863135576248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2469315808848478e-05, + "grad_norm": 29.39807891845703, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8628724813461304, + "num_tokens": 571584395.0, + "step": 14985 + }, + { + "epoch": 1.9063732349573845, + "ewc_loss": 0.04480647295713425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2403237380785868e-05, + "grad_norm": 29.280353546142578, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8674516677856445, + "num_tokens": 571624792.0, + "step": 14986 + }, + { + "epoch": 1.906500445235975, + "ewc_loss": 0.044878166168928146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2439082385972142e-05, + "grad_norm": 29.358448028564453, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8691818714141846, + "num_tokens": 571667197.0, + "step": 14987 + }, + { + "epoch": 1.9066276555145656, + "ewc_loss": 0.04483826458454132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.241913171019405e-05, + "grad_norm": 29.191965103149414, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8620280027389526, + "num_tokens": 571703648.0, + "step": 14988 + }, + { + "epoch": 1.9067548657931561, + "ewc_loss": 0.04482533782720566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2412668840843253e-05, + "grad_norm": 29.427927017211914, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.85975182056427, + "num_tokens": 571738980.0, + "step": 14989 + }, + { + "epoch": 1.9068820760717466, + "ewc_loss": 0.044837094843387604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2418547814595513e-05, + "grad_norm": 29.243789672851562, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8703950643539429, + "num_tokens": 571774540.0, + "step": 14990 + }, + { + "epoch": 1.9070092863503372, + "ewc_loss": 0.044849175959825516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242458867840469e-05, + "grad_norm": 29.44453239440918, + "learning_rate": 1e-06, + "loss": 0.407, + 
"mean_token_accuracy": 0.8734562397003174, + "num_tokens": 571805286.0, + "step": 14991 + }, + { + "epoch": 1.9071364966289277, + "ewc_loss": 0.04484222084283829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242111077066511e-05, + "grad_norm": 29.294755935668945, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8612635135650635, + "num_tokens": 571847936.0, + "step": 14992 + }, + { + "epoch": 1.9072637069075182, + "ewc_loss": 0.044780876487493515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.239043897134252e-05, + "grad_norm": 29.472414016723633, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8638590574264526, + "num_tokens": 571885454.0, + "step": 14993 + }, + { + "epoch": 1.9073909171861088, + "ewc_loss": 0.04484495148062706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242247501271777e-05, + "grad_norm": 29.33334732055664, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8658908605575562, + "num_tokens": 571924870.0, + "step": 14994 + }, + { + "epoch": 1.9075181274646993, + "ewc_loss": 0.04475763440132141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2378817448043264e-05, + "grad_norm": 29.287761688232422, + "learning_rate": 1e-06, + "loss": 0.5026, + "mean_token_accuracy": 0.8463056087493896, + "num_tokens": 571957870.0, + "step": 14995 + }, + { + "epoch": 1.9076453377432898, + "ewc_loss": 0.04485844448208809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242922164441552e-05, + "grad_norm": 29.400226593017578, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.860293984413147, + "num_tokens": 571993645.0, + "step": 14996 + }, + { + "epoch": 1.9077725480218801, + "ewc_loss": 0.044821176677942276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2410587916965596e-05, + "grad_norm": 29.236392974853516, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8760316967964172, + "num_tokens": 572028594.0, + "step": 14997 + }, + { + "epoch": 
1.9078997583004706, + "ewc_loss": 0.04483991488814354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.241995753138326e-05, + "grad_norm": 29.356393814086914, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8409650921821594, + "num_tokens": 572066971.0, + "step": 14998 + }, + { + "epoch": 1.9080269685790612, + "ewc_loss": 0.04489297792315483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.244648931082338e-05, + "grad_norm": 29.219799041748047, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8714921474456787, + "num_tokens": 572107609.0, + "step": 14999 + }, + { + "epoch": 1.9081541788576517, + "ewc_loss": 0.04485504701733589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2427522708312608e-05, + "grad_norm": 29.320510864257812, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8754372596740723, + "num_tokens": 572146940.0, + "step": 15000 + }, + { + "epoch": 1.9082813891362422, + "ewc_loss": 0.04496306553483009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2481532141682692e-05, + "grad_norm": 29.302227020263672, + "learning_rate": 1e-06, + "loss": 0.3518, + "mean_token_accuracy": 0.8921990394592285, + "num_tokens": 572184010.0, + "step": 15001 + }, + { + "epoch": 1.9084085994148328, + "ewc_loss": 0.044879425317049026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.243971175630577e-05, + "grad_norm": 29.329269409179688, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8506948947906494, + "num_tokens": 572223184.0, + "step": 15002 + }, + { + "epoch": 1.908535809693423, + "ewc_loss": 0.04487690329551697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2438451196649112e-05, + "grad_norm": 29.306461334228516, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8674058318138123, + "num_tokens": 572261770.0, + "step": 15003 + }, + { + "epoch": 1.9086630199720136, + "ewc_loss": 0.044760119169950485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2380059817805886e-05, + "grad_norm": 29.30462074279785, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8696794509887695, + "num_tokens": 572303229.0, + "step": 15004 + }, + { + "epoch": 1.9087902302506041, + "ewc_loss": 0.04484722390770912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2423611881094985e-05, + "grad_norm": 29.273334503173828, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8552314043045044, + "num_tokens": 572337839.0, + "step": 15005 + }, + { + "epoch": 1.9089174405291947, + "ewc_loss": 0.04493466392159462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.246733129140921e-05, + "grad_norm": 29.3911075592041, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8664902448654175, + "num_tokens": 572373604.0, + "step": 15006 + }, + { + "epoch": 1.9090446508077852, + "ewc_loss": 0.04491185024380684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2455924408859573e-05, + "grad_norm": 29.30066680908203, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8745054006576538, + "num_tokens": 572410425.0, + "step": 15007 + }, + { + "epoch": 1.9091718610863757, + "ewc_loss": 0.04485528543591499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2427642761613242e-05, + "grad_norm": 29.273923873901367, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.87624192237854, + "num_tokens": 572443998.0, + "step": 15008 + }, + { + "epoch": 1.9092990713649662, + "ewc_loss": 0.044980909675359726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2490454284707084e-05, + "grad_norm": 29.397891998291016, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8547755479812622, + "num_tokens": 572485337.0, + "step": 15009 + }, + { + "epoch": 1.9094262816435568, + "ewc_loss": 0.044872671365737915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2436335711972788e-05, + "grad_norm": 29.303632736206055, + "learning_rate": 1e-06, + "loss": 0.4648, + 
"mean_token_accuracy": 0.8561252355575562, + "num_tokens": 572528273.0, + "step": 15010 + }, + { + "epoch": 1.9095534919221473, + "ewc_loss": 0.044911712408065796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2455857106251642e-05, + "grad_norm": 29.394662857055664, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8711810111999512, + "num_tokens": 572569467.0, + "step": 15011 + }, + { + "epoch": 1.9096807022007378, + "ewc_loss": 0.04487910121679306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2439549866248854e-05, + "grad_norm": 29.299896240234375, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8753571510314941, + "num_tokens": 572609626.0, + "step": 15012 + }, + { + "epoch": 1.9098079124793284, + "ewc_loss": 0.04481526464223862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2407632059184834e-05, + "grad_norm": 29.299148559570312, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.8514512777328491, + "num_tokens": 572652907.0, + "step": 15013 + }, + { + "epoch": 1.9099351227579189, + "ewc_loss": 0.044961463660001755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2480731786345132e-05, + "grad_norm": 29.293128967285156, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8728644847869873, + "num_tokens": 572687460.0, + "step": 15014 + }, + { + "epoch": 1.9100623330365094, + "ewc_loss": 0.04494497552514076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2472488126368262e-05, + "grad_norm": 29.37471580505371, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.870242178440094, + "num_tokens": 572723937.0, + "step": 15015 + }, + { + "epoch": 1.9101895433151, + "ewc_loss": 0.04496610537171364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2483052816824056e-05, + "grad_norm": 29.279754638671875, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8628577589988708, + "num_tokens": 572764929.0, + "step": 15016 + }, + { + "epoch": 
1.9103167535936905, + "ewc_loss": 0.044900648295879364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2450323740486056e-05, + "grad_norm": 29.443702697753906, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8649406433105469, + "num_tokens": 572802545.0, + "step": 15017 + }, + { + "epoch": 1.910443963872281, + "ewc_loss": 0.04499369114637375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2496846213471144e-05, + "grad_norm": 29.343887329101562, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8614593744277954, + "num_tokens": 572850008.0, + "step": 15018 + }, + { + "epoch": 1.9105711741508715, + "ewc_loss": 0.044835031032562256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2417516447603703e-05, + "grad_norm": 29.325950622558594, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8752404451370239, + "num_tokens": 572888968.0, + "step": 15019 + }, + { + "epoch": 1.910698384429462, + "ewc_loss": 0.044871389865875244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.243569542770274e-05, + "grad_norm": 29.348987579345703, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8655071258544922, + "num_tokens": 572926314.0, + "step": 15020 + }, + { + "epoch": 1.9108255947080524, + "ewc_loss": 0.04483238607645035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.241619222331792e-05, + "grad_norm": 29.24746322631836, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8713133335113525, + "num_tokens": 572961789.0, + "step": 15021 + }, + { + "epoch": 1.9109528049866429, + "ewc_loss": 0.04484723135828972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2423615519073792e-05, + "grad_norm": 29.38567352294922, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8682386875152588, + "num_tokens": 572993680.0, + "step": 15022 + }, + { + "epoch": 1.9110800152652334, + "ewc_loss": 0.04485404118895531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.242702066723723e-05, + "grad_norm": 29.191041946411133, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8700351119041443, + "num_tokens": 573024003.0, + "step": 15023 + }, + { + "epoch": 1.911207225543824, + "ewc_loss": 0.04481424391269684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.240712274215184e-05, + "grad_norm": 29.219070434570312, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8747913837432861, + "num_tokens": 573055138.0, + "step": 15024 + }, + { + "epoch": 1.9113344358224145, + "ewc_loss": 0.044962771236896515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2481384803541005e-05, + "grad_norm": 29.360788345336914, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8727113008499146, + "num_tokens": 573092373.0, + "step": 15025 + }, + { + "epoch": 1.911461646101005, + "ewc_loss": 0.04489341005682945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2446705770562403e-05, + "grad_norm": 29.20203399658203, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8626261353492737, + "num_tokens": 573131486.0, + "step": 15026 + }, + { + "epoch": 1.9115888563795955, + "ewc_loss": 0.044874414801597595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2437207007897086e-05, + "grad_norm": 29.293359756469727, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8769768476486206, + "num_tokens": 573170114.0, + "step": 15027 + }, + { + "epoch": 1.9117160666581858, + "ewc_loss": 0.04493654519319534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2468271708930843e-05, + "grad_norm": 29.207740783691406, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8718124628067017, + "num_tokens": 573200906.0, + "step": 15028 + }, + { + "epoch": 1.9118432769367764, + "ewc_loss": 0.045015279203653336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2507640096591786e-05, + "grad_norm": 29.533979415893555, + "learning_rate": 1e-06, + "loss": 0.4734, + 
"mean_token_accuracy": 0.8544329404830933, + "num_tokens": 573243731.0, + "step": 15029 + }, + { + "epoch": 1.9119704872153669, + "ewc_loss": 0.04498928040266037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2494639779324643e-05, + "grad_norm": 29.176273345947266, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.874090313911438, + "num_tokens": 573283335.0, + "step": 15030 + }, + { + "epoch": 1.9120976974939574, + "ewc_loss": 0.044821273535490036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2410637029679492e-05, + "grad_norm": 29.347270965576172, + "learning_rate": 1e-06, + "loss": 0.4763, + "mean_token_accuracy": 0.854882001876831, + "num_tokens": 573326628.0, + "step": 15031 + }, + { + "epoch": 1.912224907772548, + "ewc_loss": 0.045041948556900024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.252097510790918e-05, + "grad_norm": 29.221656799316406, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8731995820999146, + "num_tokens": 573369025.0, + "step": 15032 + }, + { + "epoch": 1.9123521180511385, + "ewc_loss": 0.04486127942800522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2430640456150286e-05, + "grad_norm": 29.30016326904297, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8604505062103271, + "num_tokens": 573411923.0, + "step": 15033 + }, + { + "epoch": 1.912479328329729, + "ewc_loss": 0.04500206559896469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2501033527078107e-05, + "grad_norm": 29.18859100341797, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8629867434501648, + "num_tokens": 573450455.0, + "step": 15034 + }, + { + "epoch": 1.9126065386083195, + "ewc_loss": 0.04496452212333679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2482261556433514e-05, + "grad_norm": 29.4393310546875, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8582164645195007, + "num_tokens": 573492886.0, + "step": 15035 + }, + { + "epoch": 
1.91273374888691, + "ewc_loss": 0.045068178325891495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2534090021508746e-05, + "grad_norm": 29.195362091064453, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8718563318252563, + "num_tokens": 573532073.0, + "step": 15036 + }, + { + "epoch": 1.9128609591655006, + "ewc_loss": 0.0448535718023777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2426786017604172e-05, + "grad_norm": 29.23497200012207, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8838135600090027, + "num_tokens": 573573632.0, + "step": 15037 + }, + { + "epoch": 1.9129881694440911, + "ewc_loss": 0.04508331045508385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2541655198438093e-05, + "grad_norm": 29.306434631347656, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.877242386341095, + "num_tokens": 573610194.0, + "step": 15038 + }, + { + "epoch": 1.9131153797226816, + "ewc_loss": 0.04502180591225624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2510903363581747e-05, + "grad_norm": 29.412721633911133, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8596884608268738, + "num_tokens": 573650237.0, + "step": 15039 + }, + { + "epoch": 1.9132425900012722, + "ewc_loss": 0.045052554458379745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2526277462020516e-05, + "grad_norm": 29.34907341003418, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8704577684402466, + "num_tokens": 573686859.0, + "step": 15040 + }, + { + "epoch": 1.9133698002798627, + "ewc_loss": 0.04487811401486397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2439056920120493e-05, + "grad_norm": 29.264368057250977, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8678675293922424, + "num_tokens": 573726770.0, + "step": 15041 + }, + { + "epoch": 1.9134970105584532, + "ewc_loss": 0.04492746293544769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2463731511379592e-05, + "grad_norm": 29.411767959594727, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8617196083068848, + "num_tokens": 573768668.0, + "step": 15042 + }, + { + "epoch": 1.9136242208370438, + "ewc_loss": 0.04495609551668167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2478046957985498e-05, + "grad_norm": 29.364219665527344, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8582954406738281, + "num_tokens": 573807880.0, + "step": 15043 + }, + { + "epoch": 1.9137514311156343, + "ewc_loss": 0.0448581799864769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2429090677178465e-05, + "grad_norm": 29.412534713745117, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8651652336120605, + "num_tokens": 573847018.0, + "step": 15044 + }, + { + "epoch": 1.9138786413942248, + "ewc_loss": 0.044893525540828705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2446762159233913e-05, + "grad_norm": 29.41708755493164, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8835542798042297, + "num_tokens": 573881193.0, + "step": 15045 + }, + { + "epoch": 1.9140058516728151, + "ewc_loss": 0.044817786663770676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.240889261884149e-05, + "grad_norm": 29.334819793701172, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8751773834228516, + "num_tokens": 573918457.0, + "step": 15046 + }, + { + "epoch": 1.9141330619514056, + "ewc_loss": 0.044847939163446426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2423970222007483e-05, + "grad_norm": 29.28368377685547, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8867212533950806, + "num_tokens": 573958818.0, + "step": 15047 + }, + { + "epoch": 1.9142602722299962, + "ewc_loss": 0.044801149517297745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2400574380299076e-05, + "grad_norm": 29.3835506439209, + "learning_rate": 1e-06, + "loss": 0.4474, + 
"mean_token_accuracy": 0.8617786169052124, + "num_tokens": 573996735.0, + "step": 15048 + }, + { + "epoch": 1.9143874825085867, + "ewc_loss": 0.04485831409692764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2429156160796992e-05, + "grad_norm": 29.24301528930664, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8716980218887329, + "num_tokens": 574035005.0, + "step": 15049 + }, + { + "epoch": 1.9145146927871772, + "ewc_loss": 0.044785503298044205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2392750906874426e-05, + "grad_norm": 29.300596237182617, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8667386174201965, + "num_tokens": 574077817.0, + "step": 15050 + }, + { + "epoch": 1.9146419030657678, + "ewc_loss": 0.04486856237053871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2434282072936185e-05, + "grad_norm": 29.27860450744629, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8750647306442261, + "num_tokens": 574113083.0, + "step": 15051 + }, + { + "epoch": 1.914769113344358, + "ewc_loss": 0.04481998085975647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2409991288441233e-05, + "grad_norm": 29.428123474121094, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8715991973876953, + "num_tokens": 574147324.0, + "step": 15052 + }, + { + "epoch": 1.9148963236229486, + "ewc_loss": 0.04489944502711296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2449721654993482e-05, + "grad_norm": 29.302452087402344, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8828685283660889, + "num_tokens": 574186108.0, + "step": 15053 + }, + { + "epoch": 1.9150235339015391, + "ewc_loss": 0.04480931907892227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.240465983049944e-05, + "grad_norm": 29.25730323791504, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8801780343055725, + "num_tokens": 574218076.0, + "step": 15054 + }, + { + "epoch": 
1.9151507441801296, + "ewc_loss": 0.04478370398283005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2391852326109074e-05, + "grad_norm": 29.249755859375, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8781514763832092, + "num_tokens": 574256416.0, + "step": 15055 + }, + { + "epoch": 1.9152779544587202, + "ewc_loss": 0.04489079490303993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2445397917181253e-05, + "grad_norm": 29.41851806640625, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8803688287734985, + "num_tokens": 574295262.0, + "step": 15056 + }, + { + "epoch": 1.9154051647373107, + "ewc_loss": 0.0448915958404541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2445798094850034e-05, + "grad_norm": 29.224882125854492, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8573089838027954, + "num_tokens": 574329836.0, + "step": 15057 + }, + { + "epoch": 1.9155323750159012, + "ewc_loss": 0.04486125707626343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.243062772322446e-05, + "grad_norm": 29.431232452392578, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8704376816749573, + "num_tokens": 574363189.0, + "step": 15058 + }, + { + "epoch": 1.9156595852944918, + "ewc_loss": 0.04484458640217781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2422293113777414e-05, + "grad_norm": 29.25862693786621, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8764687776565552, + "num_tokens": 574396374.0, + "step": 15059 + }, + { + "epoch": 1.9157867955730823, + "ewc_loss": 0.044910676777362823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.245533869427163e-05, + "grad_norm": 29.330339431762695, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8706443905830383, + "num_tokens": 574435488.0, + "step": 15060 + }, + { + "epoch": 1.9159140058516728, + "ewc_loss": 0.044903017580509186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2451507902587764e-05, + "grad_norm": 29.307628631591797, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8685305714607239, + "num_tokens": 574475261.0, + "step": 15061 + }, + { + "epoch": 1.9160412161302633, + "ewc_loss": 0.04489715024828911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2448575691669248e-05, + "grad_norm": 29.223346710205078, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8873000144958496, + "num_tokens": 574513531.0, + "step": 15062 + }, + { + "epoch": 1.9161684264088539, + "ewc_loss": 0.044852953404188156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.242647678940557e-05, + "grad_norm": 29.33885383605957, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.854017436504364, + "num_tokens": 574551736.0, + "step": 15063 + }, + { + "epoch": 1.9162956366874444, + "ewc_loss": 0.04491105675697327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.24555278691696e-05, + "grad_norm": 29.259668350219727, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8616927862167358, + "num_tokens": 574589978.0, + "step": 15064 + }, + { + "epoch": 1.916422846966035, + "ewc_loss": 0.044904470443725586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2452235498349182e-05, + "grad_norm": 29.36134910583496, + "learning_rate": 1e-06, + "loss": 0.4893, + "mean_token_accuracy": 0.8472816944122314, + "num_tokens": 574628845.0, + "step": 15065 + }, + { + "epoch": 1.9165500572446255, + "ewc_loss": 0.04489269107580185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.24463456106605e-05, + "grad_norm": 29.22532844543457, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8639682531356812, + "num_tokens": 574660700.0, + "step": 15066 + }, + { + "epoch": 1.916677267523216, + "ewc_loss": 0.04494525119662285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.247262636956293e-05, + "grad_norm": 29.212797164916992, + "learning_rate": 1e-06, + "loss": 0.4414, + 
"mean_token_accuracy": 0.8591597080230713, + "num_tokens": 574692195.0, + "step": 15067 + }, + { + "epoch": 1.9168044778018065, + "ewc_loss": 0.0450536385178566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2526819520862773e-05, + "grad_norm": 29.425161361694336, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8580158948898315, + "num_tokens": 574732462.0, + "step": 15068 + }, + { + "epoch": 1.916931688080397, + "ewc_loss": 0.04502372443675995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2511861970997415e-05, + "grad_norm": 29.194438934326172, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8749433159828186, + "num_tokens": 574769353.0, + "step": 15069 + }, + { + "epoch": 1.9170588983589874, + "ewc_loss": 0.044961098581552505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2480549887404777e-05, + "grad_norm": 29.40985870361328, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8672724962234497, + "num_tokens": 574804958.0, + "step": 15070 + }, + { + "epoch": 1.9171861086375779, + "ewc_loss": 0.04501521587371826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2507607354782522e-05, + "grad_norm": 29.25115394592285, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8674200773239136, + "num_tokens": 574843966.0, + "step": 15071 + }, + { + "epoch": 1.9173133189161684, + "ewc_loss": 0.044961992651224136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2480995539808646e-05, + "grad_norm": 29.321216583251953, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8733816146850586, + "num_tokens": 574882916.0, + "step": 15072 + }, + { + "epoch": 1.917440529194759, + "ewc_loss": 0.045070234686136246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253511775052175e-05, + "grad_norm": 29.24266815185547, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8758546710014343, + "num_tokens": 574922154.0, + "step": 15073 + }, + { + "epoch": 
1.9175677394733495, + "ewc_loss": 0.04507436975836754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2537184122484177e-05, + "grad_norm": 29.305219650268555, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8637238144874573, + "num_tokens": 574956894.0, + "step": 15074 + }, + { + "epoch": 1.91769494975194, + "ewc_loss": 0.04509858414530754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254929131595418e-05, + "grad_norm": 29.313005447387695, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8824660181999207, + "num_tokens": 574998343.0, + "step": 15075 + }, + { + "epoch": 1.9178221600305305, + "ewc_loss": 0.044981326907873154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2490663468488492e-05, + "grad_norm": 29.390548706054688, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8774657845497131, + "num_tokens": 575036689.0, + "step": 15076 + }, + { + "epoch": 1.9179493703091208, + "ewc_loss": 0.04506782069802284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2533909941557795e-05, + "grad_norm": 29.32600212097168, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8626492619514465, + "num_tokens": 575075510.0, + "step": 15077 + }, + { + "epoch": 1.9180765805877114, + "ewc_loss": 0.04502333700656891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.251166915812064e-05, + "grad_norm": 29.3465633392334, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.865279495716095, + "num_tokens": 575115797.0, + "step": 15078 + }, + { + "epoch": 1.9182037908663019, + "ewc_loss": 0.0450807586312294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2540380086866207e-05, + "grad_norm": 29.40804672241211, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8551807999610901, + "num_tokens": 575155240.0, + "step": 15079 + }, + { + "epoch": 1.9183310011448924, + "ewc_loss": 0.04499854892492294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2499274564324878e-05, + "grad_norm": 29.231678009033203, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8546698689460754, + "num_tokens": 575191048.0, + "step": 15080 + }, + { + "epoch": 1.918458211423483, + "ewc_loss": 0.04506910964846611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2534555682796054e-05, + "grad_norm": 29.339208602905273, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8576078414916992, + "num_tokens": 575227778.0, + "step": 15081 + }, + { + "epoch": 1.9185854217020735, + "ewc_loss": 0.045122403651475906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2561202058568597e-05, + "grad_norm": 29.423206329345703, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8693569898605347, + "num_tokens": 575269774.0, + "step": 15082 + }, + { + "epoch": 1.918712631980664, + "ewc_loss": 0.04509510472416878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254755236208439e-05, + "grad_norm": 29.243894577026367, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8590387105941772, + "num_tokens": 575310710.0, + "step": 15083 + }, + { + "epoch": 1.9188398422592545, + "ewc_loss": 0.04510043188929558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255021536257118e-05, + "grad_norm": 29.43903160095215, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8750459551811218, + "num_tokens": 575344073.0, + "step": 15084 + }, + { + "epoch": 1.918967052537845, + "ewc_loss": 0.0451514832675457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.257574124087114e-05, + "grad_norm": 29.3395938873291, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8585439324378967, + "num_tokens": 575386737.0, + "step": 15085 + }, + { + "epoch": 1.9190942628164356, + "ewc_loss": 0.044950827956199646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2475414880318567e-05, + "grad_norm": 29.351951599121094, + "learning_rate": 1e-06, + "loss": 0.4131, + 
"mean_token_accuracy": 0.8714203238487244, + "num_tokens": 575424287.0, + "step": 15086 + }, + { + "epoch": 1.919221473095026, + "ewc_loss": 0.04507889971137047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2539450583280995e-05, + "grad_norm": 29.474056243896484, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8703562617301941, + "num_tokens": 575463859.0, + "step": 15087 + }, + { + "epoch": 1.9193486833736166, + "ewc_loss": 0.04500293359160423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.250146644655615e-05, + "grad_norm": 29.347332000732422, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8629475235939026, + "num_tokens": 575500838.0, + "step": 15088 + }, + { + "epoch": 1.9194758936522072, + "ewc_loss": 0.044937144964933395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2468571842182428e-05, + "grad_norm": 29.37685203552246, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8597891330718994, + "num_tokens": 575535164.0, + "step": 15089 + }, + { + "epoch": 1.9196031039307977, + "ewc_loss": 0.04500661790370941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2503309082821943e-05, + "grad_norm": 29.35760498046875, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8740783929824829, + "num_tokens": 575569493.0, + "step": 15090 + }, + { + "epoch": 1.9197303142093882, + "ewc_loss": 0.04498371481895447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2491856725537218e-05, + "grad_norm": 29.36473846435547, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8528919219970703, + "num_tokens": 575603396.0, + "step": 15091 + }, + { + "epoch": 1.9198575244879788, + "ewc_loss": 0.04507771134376526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2538855773746036e-05, + "grad_norm": 29.427152633666992, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8749168515205383, + "num_tokens": 575638255.0, + "step": 15092 + }, + { + "epoch": 
1.9199847347665693, + "ewc_loss": 0.044956546276807785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2478272512671538e-05, + "grad_norm": 29.25795555114746, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8571767807006836, + "num_tokens": 575685281.0, + "step": 15093 + }, + { + "epoch": 1.9201119450451598, + "ewc_loss": 0.04504948481917381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2524742234963924e-05, + "grad_norm": 29.488384246826172, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8743542432785034, + "num_tokens": 575725030.0, + "step": 15094 + }, + { + "epoch": 1.9202391553237501, + "ewc_loss": 0.04499455913901329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.249727913294919e-05, + "grad_norm": 29.322053909301758, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8616842031478882, + "num_tokens": 575764237.0, + "step": 15095 + }, + { + "epoch": 1.9203663656023406, + "ewc_loss": 0.04493255913257599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.246627991553396e-05, + "grad_norm": 29.38282012939453, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8657639622688293, + "num_tokens": 575796732.0, + "step": 15096 + }, + { + "epoch": 1.9204935758809312, + "ewc_loss": 0.04500241577625275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2501208150060847e-05, + "grad_norm": 29.31572151184082, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.872442364692688, + "num_tokens": 575833133.0, + "step": 15097 + }, + { + "epoch": 1.9206207861595217, + "ewc_loss": 0.04498576000332832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2492880816571414e-05, + "grad_norm": 29.379961013793945, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8643311262130737, + "num_tokens": 575870920.0, + "step": 15098 + }, + { + "epoch": 1.9207479964381122, + "ewc_loss": 0.04503735527396202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2518677724292502e-05, + "grad_norm": 29.460201263427734, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8703209161758423, + "num_tokens": 575909132.0, + "step": 15099 + }, + { + "epoch": 1.9208752067167028, + "ewc_loss": 0.04501301050186157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2506505047203973e-05, + "grad_norm": 29.39410400390625, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8677128553390503, + "num_tokens": 575947979.0, + "step": 15100 + }, + { + "epoch": 1.921002416995293, + "ewc_loss": 0.04493383690714836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2466918380814604e-05, + "grad_norm": 29.383180618286133, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8577790856361389, + "num_tokens": 575988888.0, + "step": 15101 + }, + { + "epoch": 1.9211296272738836, + "ewc_loss": 0.04501340910792351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2506705136038363e-05, + "grad_norm": 29.30889320373535, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8692989349365234, + "num_tokens": 576028614.0, + "step": 15102 + }, + { + "epoch": 1.9212568375524741, + "ewc_loss": 0.04500142112374306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2500709746964276e-05, + "grad_norm": 29.261701583862305, + "learning_rate": 1e-06, + "loss": 0.3449, + "mean_token_accuracy": 0.8956733345985413, + "num_tokens": 576066648.0, + "step": 15103 + }, + { + "epoch": 1.9213840478310646, + "ewc_loss": 0.04504670575261116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2523352527059615e-05, + "grad_norm": 29.36331558227539, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8659374713897705, + "num_tokens": 576104045.0, + "step": 15104 + }, + { + "epoch": 1.9215112581096552, + "ewc_loss": 0.04506930336356163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253465208923444e-05, + "grad_norm": 29.32316780090332, + "learning_rate": 1e-06, + "loss": 0.4446, + 
"mean_token_accuracy": 0.8606393933296204, + "num_tokens": 576145419.0, + "step": 15105 + }, + { + "epoch": 1.9216384683882457, + "ewc_loss": 0.04503960907459259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2519803678733297e-05, + "grad_norm": 29.26653480529785, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8641610145568848, + "num_tokens": 576191410.0, + "step": 15106 + }, + { + "epoch": 1.9217656786668362, + "ewc_loss": 0.04505465552210808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2527327018906362e-05, + "grad_norm": 29.441425323486328, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8617115020751953, + "num_tokens": 576226383.0, + "step": 15107 + }, + { + "epoch": 1.9218928889454268, + "ewc_loss": 0.04502316191792488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.251158184662927e-05, + "grad_norm": 29.253215789794922, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8596805334091187, + "num_tokens": 576259958.0, + "step": 15108 + }, + { + "epoch": 1.9220200992240173, + "ewc_loss": 0.04508902505040169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2544512830791064e-05, + "grad_norm": 29.396469116210938, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8705624341964722, + "num_tokens": 576299467.0, + "step": 15109 + }, + { + "epoch": 1.9221473095026078, + "ewc_loss": 0.045012861490249634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2506430468638428e-05, + "grad_norm": 29.398849487304688, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8614670038223267, + "num_tokens": 576334137.0, + "step": 15110 + }, + { + "epoch": 1.9222745197811983, + "ewc_loss": 0.04505101218819618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2525506210513413e-05, + "grad_norm": 29.326475143432617, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8639906644821167, + "num_tokens": 576379582.0, + "step": 15111 + }, + { + "epoch": 
1.9224017300597889, + "ewc_loss": 0.04494128003716469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.247064003313426e-05, + "grad_norm": 29.316390991210938, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8813906908035278, + "num_tokens": 576414590.0, + "step": 15112 + }, + { + "epoch": 1.9225289403383794, + "ewc_loss": 0.04502832144498825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2514161173603497e-05, + "grad_norm": 29.432432174682617, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.859495222568512, + "num_tokens": 576445023.0, + "step": 15113 + }, + { + "epoch": 1.92265615061697, + "ewc_loss": 0.045011356472969055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.250567740702536e-05, + "grad_norm": 29.367761611938477, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8640879392623901, + "num_tokens": 576477861.0, + "step": 15114 + }, + { + "epoch": 1.9227833608955605, + "ewc_loss": 0.045000527054071426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2500264094560407e-05, + "grad_norm": 29.3692684173584, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8632703423500061, + "num_tokens": 576513121.0, + "step": 15115 + }, + { + "epoch": 1.922910571174151, + "ewc_loss": 0.045095279812812805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254763967357576e-05, + "grad_norm": 29.500200271606445, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8738133311271667, + "num_tokens": 576551039.0, + "step": 15116 + }, + { + "epoch": 1.9230377814527415, + "ewc_loss": 0.044961363077163696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2480680854641832e-05, + "grad_norm": 29.251564025878906, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8606410026550293, + "num_tokens": 576590651.0, + "step": 15117 + }, + { + "epoch": 1.923164991731332, + "ewc_loss": 0.04491880163550377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.245940049760975e-05, + "grad_norm": 29.400991439819336, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8582256436347961, + "num_tokens": 576629119.0, + "step": 15118 + }, + { + "epoch": 1.9232922020099223, + "ewc_loss": 0.045076534152030945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2538266421179287e-05, + "grad_norm": 29.30415153503418, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.87715083360672, + "num_tokens": 576664240.0, + "step": 15119 + }, + { + "epoch": 1.9234194122885129, + "ewc_loss": 0.04496825486421585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2484127839561552e-05, + "grad_norm": 29.376237869262695, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8618922829627991, + "num_tokens": 576701946.0, + "step": 15120 + }, + { + "epoch": 1.9235466225671034, + "ewc_loss": 0.045031823217868805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.251591104140971e-05, + "grad_norm": 29.405550003051758, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8784789443016052, + "num_tokens": 576733541.0, + "step": 15121 + }, + { + "epoch": 1.923673832845694, + "ewc_loss": 0.0449652262032032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2482612621388398e-05, + "grad_norm": 29.382766723632812, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8534964323043823, + "num_tokens": 576770930.0, + "step": 15122 + }, + { + "epoch": 1.9238010431242845, + "ewc_loss": 0.04503756761550903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2518783225677907e-05, + "grad_norm": 29.314298629760742, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.867677628993988, + "num_tokens": 576807455.0, + "step": 15123 + }, + { + "epoch": 1.923928253402875, + "ewc_loss": 0.044981103390455246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2490552510134876e-05, + "grad_norm": 29.344585418701172, + "learning_rate": 1e-06, + "loss": 0.4056, + 
"mean_token_accuracy": 0.8738219738006592, + "num_tokens": 576849577.0, + "step": 15124 + }, + { + "epoch": 1.9240554636814655, + "ewc_loss": 0.045051585882902145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2525793610839173e-05, + "grad_norm": 29.292314529418945, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8610817193984985, + "num_tokens": 576883506.0, + "step": 15125 + }, + { + "epoch": 1.9241826739600558, + "ewc_loss": 0.04508034512400627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2540172722074203e-05, + "grad_norm": 29.34250831604004, + "learning_rate": 1e-06, + "loss": 0.4836, + "mean_token_accuracy": 0.8504935503005981, + "num_tokens": 576923892.0, + "step": 15126 + }, + { + "epoch": 1.9243098842386464, + "ewc_loss": 0.04513178765773773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.256589323224034e-05, + "grad_norm": 29.410348892211914, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8844302892684937, + "num_tokens": 576970832.0, + "step": 15127 + }, + { + "epoch": 1.9244370945172369, + "ewc_loss": 0.0451325923204422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.256629704788793e-05, + "grad_norm": 29.336856842041016, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8494789600372314, + "num_tokens": 577006990.0, + "step": 15128 + }, + { + "epoch": 1.9245643047958274, + "ewc_loss": 0.045092854648828506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2546428226632997e-05, + "grad_norm": 29.2790470123291, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8719463348388672, + "num_tokens": 577044932.0, + "step": 15129 + }, + { + "epoch": 1.924691515074418, + "ewc_loss": 0.04512087255716324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2560436264029704e-05, + "grad_norm": 29.4571475982666, + "learning_rate": 1e-06, + "loss": 0.4851, + "mean_token_accuracy": 0.8508483171463013, + "num_tokens": 577076217.0, + "step": 15130 + }, + { + "epoch": 
1.9248187253530085, + "ewc_loss": 0.04511816427111626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2559082935913466e-05, + "grad_norm": 29.39550018310547, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8833177089691162, + "num_tokens": 577112696.0, + "step": 15131 + }, + { + "epoch": 1.924945935631599, + "ewc_loss": 0.0450674332678318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253371712868102e-05, + "grad_norm": 29.29949188232422, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8666379451751709, + "num_tokens": 577156397.0, + "step": 15132 + }, + { + "epoch": 1.9250731459101895, + "ewc_loss": 0.045057766139507294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2528882254846394e-05, + "grad_norm": 29.373483657836914, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.871566653251648, + "num_tokens": 577194192.0, + "step": 15133 + }, + { + "epoch": 1.92520035618878, + "ewc_loss": 0.045095790177583694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2547894332092255e-05, + "grad_norm": 29.34469985961914, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8617222309112549, + "num_tokens": 577232639.0, + "step": 15134 + }, + { + "epoch": 1.9253275664673706, + "ewc_loss": 0.045048318803310394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.252416015835479e-05, + "grad_norm": 29.317779541015625, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8760683536529541, + "num_tokens": 577267349.0, + "step": 15135 + }, + { + "epoch": 1.925454776745961, + "ewc_loss": 0.04512019455432892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2560097932000645e-05, + "grad_norm": 29.396921157836914, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8615372180938721, + "num_tokens": 577308329.0, + "step": 15136 + }, + { + "epoch": 1.9255819870245516, + "ewc_loss": 0.04508291929960251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2541460566571914e-05, + "grad_norm": 29.2719783782959, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8682482242584229, + "num_tokens": 577347825.0, + "step": 15137 + }, + { + "epoch": 1.9257091973031422, + "ewc_loss": 0.04507619887590408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253809907415416e-05, + "grad_norm": 29.25859260559082, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8676102161407471, + "num_tokens": 577379981.0, + "step": 15138 + }, + { + "epoch": 1.9258364075817327, + "ewc_loss": 0.04518209397792816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2591046217712574e-05, + "grad_norm": 29.37483024597168, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8709468841552734, + "num_tokens": 577414134.0, + "step": 15139 + }, + { + "epoch": 1.9259636178603232, + "ewc_loss": 0.04506485164165497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.25324256462045e-05, + "grad_norm": 29.37367057800293, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.866051197052002, + "num_tokens": 577451170.0, + "step": 15140 + }, + { + "epoch": 1.9260908281389137, + "ewc_loss": 0.04512061923742294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2560308934771456e-05, + "grad_norm": 29.326248168945312, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.8530802726745605, + "num_tokens": 577487447.0, + "step": 15141 + }, + { + "epoch": 1.9262180384175043, + "ewc_loss": 0.04507958143949509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253979073429946e-05, + "grad_norm": 29.269441604614258, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8714984059333801, + "num_tokens": 577525732.0, + "step": 15142 + }, + { + "epoch": 1.9263452486960948, + "ewc_loss": 0.045099660754203796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2549829736817628e-05, + "grad_norm": 29.416393280029297, + "learning_rate": 1e-06, + "loss": 0.4385, + 
"mean_token_accuracy": 0.8660238981246948, + "num_tokens": 577563303.0, + "step": 15143 + }, + { + "epoch": 1.926472458974685, + "ewc_loss": 0.04515533149242401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.257766573166009e-05, + "grad_norm": 29.265289306640625, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8814530372619629, + "num_tokens": 577607592.0, + "step": 15144 + }, + { + "epoch": 1.9265996692532756, + "ewc_loss": 0.04503119736909866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.25155981752323e-05, + "grad_norm": 29.296138763427734, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8588732481002808, + "num_tokens": 577651646.0, + "step": 15145 + }, + { + "epoch": 1.9267268795318662, + "ewc_loss": 0.04513310268521309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2566551706404425e-05, + "grad_norm": 29.41645622253418, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8756254315376282, + "num_tokens": 577689530.0, + "step": 15146 + }, + { + "epoch": 1.9268540898104567, + "ewc_loss": 0.045163728296756744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2581863959203474e-05, + "grad_norm": 29.339380264282227, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8617402911186218, + "num_tokens": 577732524.0, + "step": 15147 + }, + { + "epoch": 1.9269813000890472, + "ewc_loss": 0.04502318426966667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.251159276056569e-05, + "grad_norm": 29.3228702545166, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8712461590766907, + "num_tokens": 577767164.0, + "step": 15148 + }, + { + "epoch": 1.9271085103676378, + "ewc_loss": 0.04511890932917595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255945400975179e-05, + "grad_norm": 29.398591995239258, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8752453327178955, + "num_tokens": 577802347.0, + "step": 15149 + }, + { + "epoch": 
1.927235720646228, + "ewc_loss": 0.04513592645525932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2567963242181577e-05, + "grad_norm": 29.398866653442383, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8601094484329224, + "num_tokens": 577842899.0, + "step": 15150 + }, + { + "epoch": 1.9273629309248186, + "ewc_loss": 0.04509488865733147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254744504170958e-05, + "grad_norm": 29.464405059814453, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8666242361068726, + "num_tokens": 577885137.0, + "step": 15151 + }, + { + "epoch": 1.9274901412034091, + "ewc_loss": 0.04506566375494003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2532831280841492e-05, + "grad_norm": 29.337602615356445, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8806958198547363, + "num_tokens": 577925727.0, + "step": 15152 + }, + { + "epoch": 1.9276173514819996, + "ewc_loss": 0.045009784400463104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2504891603603028e-05, + "grad_norm": 29.38388442993164, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8709672689437866, + "num_tokens": 577965399.0, + "step": 15153 + }, + { + "epoch": 1.9277445617605902, + "ewc_loss": 0.04503261670470238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.251630758109968e-05, + "grad_norm": 29.256305694580078, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8670627474784851, + "num_tokens": 578007608.0, + "step": 15154 + }, + { + "epoch": 1.9278717720391807, + "ewc_loss": 0.04496384039521217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2481919586425647e-05, + "grad_norm": 29.390541076660156, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8782647848129272, + "num_tokens": 578043425.0, + "step": 15155 + }, + { + "epoch": 1.9279989823177712, + "ewc_loss": 0.0450633205473423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.253165985166561e-05, + "grad_norm": 29.3554630279541, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8653005361557007, + "num_tokens": 578075838.0, + "step": 15156 + }, + { + "epoch": 1.9281261925963618, + "ewc_loss": 0.044947411864995956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2473705030279234e-05, + "grad_norm": 29.31085968017578, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8622334003448486, + "num_tokens": 578110113.0, + "step": 15157 + }, + { + "epoch": 1.9282534028749523, + "ewc_loss": 0.044980790466070175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.249039607704617e-05, + "grad_norm": 29.34495735168457, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8684788346290588, + "num_tokens": 578145743.0, + "step": 15158 + }, + { + "epoch": 1.9283806131535428, + "ewc_loss": 0.04510899633169174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255449908261653e-05, + "grad_norm": 29.300613403320312, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8748645782470703, + "num_tokens": 578181588.0, + "step": 15159 + }, + { + "epoch": 1.9285078234321333, + "ewc_loss": 0.045002348721027374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.250117358926218e-05, + "grad_norm": 29.559560775756836, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.865595281124115, + "num_tokens": 578222048.0, + "step": 15160 + }, + { + "epoch": 1.9286350337107239, + "ewc_loss": 0.04510699212551117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2553496819455177e-05, + "grad_norm": 29.343181610107422, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8700510263442993, + "num_tokens": 578262664.0, + "step": 15161 + }, + { + "epoch": 1.9287622439893144, + "ewc_loss": 0.044927146285772324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2463573259301484e-05, + "grad_norm": 29.392955780029297, + "learning_rate": 1e-06, + "loss": 0.4707, + 
"mean_token_accuracy": 0.8560535907745361, + "num_tokens": 578303431.0, + "step": 15162 + }, + { + "epoch": 1.928889454267905, + "ewc_loss": 0.04493333399295807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2466667360276915e-05, + "grad_norm": 29.391725540161133, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8579503893852234, + "num_tokens": 578338978.0, + "step": 15163 + }, + { + "epoch": 1.9290166645464955, + "ewc_loss": 0.04499012604355812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.249506360385567e-05, + "grad_norm": 29.50702667236328, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8638738989830017, + "num_tokens": 578376340.0, + "step": 15164 + }, + { + "epoch": 1.929143874825086, + "ewc_loss": 0.04501684382557869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.250842226203531e-05, + "grad_norm": 29.35024070739746, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8696548342704773, + "num_tokens": 578414714.0, + "step": 15165 + }, + { + "epoch": 1.9292710851036765, + "ewc_loss": 0.04484187439084053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2420937966671772e-05, + "grad_norm": 29.301483154296875, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8528305888175964, + "num_tokens": 578453668.0, + "step": 15166 + }, + { + "epoch": 1.929398295382267, + "ewc_loss": 0.04505326598882675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2526632164954208e-05, + "grad_norm": 29.379884719848633, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8599377870559692, + "num_tokens": 578494148.0, + "step": 15167 + }, + { + "epoch": 1.9295255056608573, + "ewc_loss": 0.044874463230371475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2437232473748736e-05, + "grad_norm": 29.193326950073242, + "learning_rate": 1e-06, + "loss": 0.495, + "mean_token_accuracy": 0.8467491865158081, + "num_tokens": 578530743.0, + "step": 15168 + }, + { + "epoch": 
1.9296527159394479, + "ewc_loss": 0.04511997103691101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2559985154657625e-05, + "grad_norm": 29.414236068725586, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8891687393188477, + "num_tokens": 578567613.0, + "step": 15169 + }, + { + "epoch": 1.9297799262180384, + "ewc_loss": 0.04505991190671921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2529955458594486e-05, + "grad_norm": 29.29671859741211, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8663944602012634, + "num_tokens": 578602981.0, + "step": 15170 + }, + { + "epoch": 1.929907136496629, + "ewc_loss": 0.04507814720273018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253907405247446e-05, + "grad_norm": 29.425132751464844, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8608324527740479, + "num_tokens": 578641027.0, + "step": 15171 + }, + { + "epoch": 1.9300343467752195, + "ewc_loss": 0.04501871019601822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.250935540359933e-05, + "grad_norm": 29.27832794189453, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8804309368133545, + "num_tokens": 578683006.0, + "step": 15172 + }, + { + "epoch": 1.93016155705381, + "ewc_loss": 0.04495703801512718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.247851989523042e-05, + "grad_norm": 29.29440689086914, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8713527917861938, + "num_tokens": 578716722.0, + "step": 15173 + }, + { + "epoch": 1.9302887673324005, + "ewc_loss": 0.04507466033101082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253732964163646e-05, + "grad_norm": 29.367637634277344, + "learning_rate": 1e-06, + "loss": 0.4872, + "mean_token_accuracy": 0.847373366355896, + "num_tokens": 578750740.0, + "step": 15174 + }, + { + "epoch": 1.9304159776109908, + "ewc_loss": 0.04503229260444641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2516145691042766e-05, + "grad_norm": 29.283153533935547, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8699724674224854, + "num_tokens": 578791358.0, + "step": 15175 + }, + { + "epoch": 1.9305431878895813, + "ewc_loss": 0.04504316672682762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2521582650369965e-05, + "grad_norm": 29.314285278320312, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8510987758636475, + "num_tokens": 578834514.0, + "step": 15176 + }, + { + "epoch": 1.9306703981681719, + "ewc_loss": 0.04516982659697533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2584912585443817e-05, + "grad_norm": 29.268798828125, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8599028587341309, + "num_tokens": 578875869.0, + "step": 15177 + }, + { + "epoch": 1.9307976084467624, + "ewc_loss": 0.045063622295856476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2531810827786103e-05, + "grad_norm": 29.360244750976562, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8707183599472046, + "num_tokens": 578921079.0, + "step": 15178 + }, + { + "epoch": 1.930924818725353, + "ewc_loss": 0.04521377757191658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2606889615417458e-05, + "grad_norm": 29.335620880126953, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8657928705215454, + "num_tokens": 578959837.0, + "step": 15179 + }, + { + "epoch": 1.9310520290039435, + "ewc_loss": 0.045011620968580246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.250581019325182e-05, + "grad_norm": 29.374574661254883, + "learning_rate": 1e-06, + "loss": 0.5335, + "mean_token_accuracy": 0.8391181230545044, + "num_tokens": 579001629.0, + "step": 15180 + }, + { + "epoch": 1.931179239282534, + "ewc_loss": 0.04517968371510506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2589842046727426e-05, + "grad_norm": 29.366769790649414, + "learning_rate": 1e-06, + "loss": 0.3635, + 
"mean_token_accuracy": 0.8899536728858948, + "num_tokens": 579036762.0, + "step": 15181 + }, + { + "epoch": 1.9313064495611245, + "ewc_loss": 0.045021869242191315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2510934286401607e-05, + "grad_norm": 29.341394424438477, + "learning_rate": 1e-06, + "loss": 0.4934, + "mean_token_accuracy": 0.8448419570922852, + "num_tokens": 579078455.0, + "step": 15182 + }, + { + "epoch": 1.931433659839715, + "ewc_loss": 0.0450950562953949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2547528715222143e-05, + "grad_norm": 29.383012771606445, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8699629902839661, + "num_tokens": 579116942.0, + "step": 15183 + }, + { + "epoch": 1.9315608701183056, + "ewc_loss": 0.04512716084718704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2563581296708435e-05, + "grad_norm": 29.477468490600586, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8791816830635071, + "num_tokens": 579155180.0, + "step": 15184 + }, + { + "epoch": 1.931688080396896, + "ewc_loss": 0.04507341980934143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2536709366249852e-05, + "grad_norm": 29.490327835083008, + "learning_rate": 1e-06, + "loss": 0.4762, + "mean_token_accuracy": 0.8597630858421326, + "num_tokens": 579194336.0, + "step": 15185 + }, + { + "epoch": 1.9318152906754866, + "ewc_loss": 0.04497082903981209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.248541386506986e-05, + "grad_norm": 29.297183990478516, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8631627559661865, + "num_tokens": 579226609.0, + "step": 15186 + }, + { + "epoch": 1.9319425009540772, + "ewc_loss": 0.04503297060728073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.251648584206123e-05, + "grad_norm": 29.455228805541992, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8714035749435425, + "num_tokens": 579259288.0, + "step": 15187 + }, + { + "epoch": 
1.9320697112326677, + "ewc_loss": 0.04505467787384987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2527339751832187e-05, + "grad_norm": 29.368385314941406, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8749786615371704, + "num_tokens": 579294797.0, + "step": 15188 + }, + { + "epoch": 1.9321969215112582, + "ewc_loss": 0.0450112447142601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2505622837343253e-05, + "grad_norm": 29.42575454711914, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.877255916595459, + "num_tokens": 579334352.0, + "step": 15189 + }, + { + "epoch": 1.9323241317898487, + "ewc_loss": 0.045101240277290344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255062099720817e-05, + "grad_norm": 29.354555130004883, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8560497164726257, + "num_tokens": 579379618.0, + "step": 15190 + }, + { + "epoch": 1.9324513420684393, + "ewc_loss": 0.044970445334911346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2485222871182486e-05, + "grad_norm": 29.35270881652832, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.866094708442688, + "num_tokens": 579420230.0, + "step": 15191 + }, + { + "epoch": 1.9325785523470298, + "ewc_loss": 0.045109011232852936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2554506358574145e-05, + "grad_norm": 29.387733459472656, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8671174645423889, + "num_tokens": 579458236.0, + "step": 15192 + }, + { + "epoch": 1.93270576262562, + "ewc_loss": 0.04506615176796913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2533075025421567e-05, + "grad_norm": 29.435426712036133, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8659865856170654, + "num_tokens": 579495561.0, + "step": 15193 + }, + { + "epoch": 1.9328329729042106, + "ewc_loss": 0.045051153749227524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.252557715110015e-05, + "grad_norm": 29.327062606811523, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8653432130813599, + "num_tokens": 579530225.0, + "step": 15194 + }, + { + "epoch": 1.9329601831828012, + "ewc_loss": 0.045147720724344254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2573860405827872e-05, + "grad_norm": 29.48863983154297, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8580299615859985, + "num_tokens": 579570143.0, + "step": 15195 + }, + { + "epoch": 1.9330873934613917, + "ewc_loss": 0.04515116289258003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2575581169803627e-05, + "grad_norm": 29.400236129760742, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8807676434516907, + "num_tokens": 579607105.0, + "step": 15196 + }, + { + "epoch": 1.9332146037399822, + "ewc_loss": 0.04503432288765907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2517160687129945e-05, + "grad_norm": 29.2659912109375, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8815443515777588, + "num_tokens": 579640018.0, + "step": 15197 + }, + { + "epoch": 1.9333418140185727, + "ewc_loss": 0.045067138969898224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2533569790539332e-05, + "grad_norm": 29.368188858032227, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.861766517162323, + "num_tokens": 579681250.0, + "step": 15198 + }, + { + "epoch": 1.933469024297163, + "ewc_loss": 0.0451958030462265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2597901988774538e-05, + "grad_norm": 29.505647659301758, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8565859794616699, + "num_tokens": 579718535.0, + "step": 15199 + }, + { + "epoch": 1.9335962345757536, + "ewc_loss": 0.0451011098921299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2550555513589643e-05, + "grad_norm": 29.35257911682129, + "learning_rate": 1e-06, + "loss": 0.4702, + 
"mean_token_accuracy": 0.855009913444519, + "num_tokens": 579756959.0, + "step": 15200 + }, + { + "epoch": 1.933723444854344, + "ewc_loss": 0.04504294693470001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2521473511005752e-05, + "grad_norm": 29.49080467224121, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8572310209274292, + "num_tokens": 579798266.0, + "step": 15201 + }, + { + "epoch": 1.9338506551329346, + "ewc_loss": 0.045179810374975204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.258990571135655e-05, + "grad_norm": 29.43828773498535, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.885472297668457, + "num_tokens": 579831869.0, + "step": 15202 + }, + { + "epoch": 1.9339778654115252, + "ewc_loss": 0.04504389688372612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2521948267240077e-05, + "grad_norm": 29.47791290283203, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.868552565574646, + "num_tokens": 579874168.0, + "step": 15203 + }, + { + "epoch": 1.9341050756901157, + "ewc_loss": 0.04513555020093918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.256777588627301e-05, + "grad_norm": 29.379745483398438, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8737912178039551, + "num_tokens": 579916313.0, + "step": 15204 + }, + { + "epoch": 1.9342322859687062, + "ewc_loss": 0.045097652822732925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254882565466687e-05, + "grad_norm": 29.373876571655273, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8777298927307129, + "num_tokens": 579959022.0, + "step": 15205 + }, + { + "epoch": 1.9343594962472968, + "ewc_loss": 0.04506920650601387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2534602976520546e-05, + "grad_norm": 29.424169540405273, + "learning_rate": 1e-06, + "loss": 0.4747, + "mean_token_accuracy": 0.8525656461715698, + "num_tokens": 580003675.0, + "step": 15206 + }, + { + "epoch": 
1.9344867065258873, + "ewc_loss": 0.045097921043634415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2548960259882733e-05, + "grad_norm": 29.381717681884766, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.876562237739563, + "num_tokens": 580044384.0, + "step": 15207 + }, + { + "epoch": 1.9346139168044778, + "ewc_loss": 0.045084401965141296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2542200895259157e-05, + "grad_norm": 29.4758243560791, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8564543724060059, + "num_tokens": 580078955.0, + "step": 15208 + }, + { + "epoch": 1.9347411270830683, + "ewc_loss": 0.04501776397228241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2508882466354407e-05, + "grad_norm": 29.440780639648438, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8638765215873718, + "num_tokens": 580116793.0, + "step": 15209 + }, + { + "epoch": 1.9348683373616589, + "ewc_loss": 0.044951822608709335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2475911464425735e-05, + "grad_norm": 29.38068389892578, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.866034209728241, + "num_tokens": 580153622.0, + "step": 15210 + }, + { + "epoch": 1.9349955476402494, + "ewc_loss": 0.04502199962735176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2510999770020135e-05, + "grad_norm": 29.3210506439209, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8655633926391602, + "num_tokens": 580195287.0, + "step": 15211 + }, + { + "epoch": 1.93512275791884, + "ewc_loss": 0.045051686465740204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.252584272355307e-05, + "grad_norm": 29.380544662475586, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8556604385375977, + "num_tokens": 580229601.0, + "step": 15212 + }, + { + "epoch": 1.9352499681974304, + "ewc_loss": 0.0450672022998333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2533600713359192e-05, + "grad_norm": 29.405778884887695, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8697986006736755, + "num_tokens": 580268079.0, + "step": 15213 + }, + { + "epoch": 1.935377178476021, + "ewc_loss": 0.04506629332900047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2533145966008306e-05, + "grad_norm": 29.323318481445312, + "learning_rate": 1e-06, + "loss": 0.5012, + "mean_token_accuracy": 0.8490056991577148, + "num_tokens": 580305379.0, + "step": 15214 + }, + { + "epoch": 1.9355043887546115, + "ewc_loss": 0.045128677040338516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.256433799630031e-05, + "grad_norm": 29.4918270111084, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8660478591918945, + "num_tokens": 580338438.0, + "step": 15215 + }, + { + "epoch": 1.935631599033202, + "ewc_loss": 0.0450834296643734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254171522508841e-05, + "grad_norm": 29.302095413208008, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8670260906219482, + "num_tokens": 580370017.0, + "step": 15216 + }, + { + "epoch": 1.9357588093117923, + "ewc_loss": 0.04508891701698303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2544458261108957e-05, + "grad_norm": 29.354902267456055, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8774140477180481, + "num_tokens": 580409572.0, + "step": 15217 + }, + { + "epoch": 1.9358860195903829, + "ewc_loss": 0.04514826461672783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2574131435249e-05, + "grad_norm": 29.36076545715332, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8721612095832825, + "num_tokens": 580448481.0, + "step": 15218 + }, + { + "epoch": 1.9360132298689734, + "ewc_loss": 0.04510999098420143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.25549956667237e-05, + "grad_norm": 29.24201202392578, + "learning_rate": 1e-06, + "loss": 0.3956, + 
"mean_token_accuracy": 0.8790417909622192, + "num_tokens": 580481919.0, + "step": 15219 + }, + { + "epoch": 1.936140440147564, + "ewc_loss": 0.04523075371980667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2615377019974403e-05, + "grad_norm": 29.51218605041504, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8619070053100586, + "num_tokens": 580520758.0, + "step": 15220 + }, + { + "epoch": 1.9362676504261545, + "ewc_loss": 0.045166656374931335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.258332824567333e-05, + "grad_norm": 29.307964324951172, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.858360230922699, + "num_tokens": 580559839.0, + "step": 15221 + }, + { + "epoch": 1.936394860704745, + "ewc_loss": 0.04507746547460556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2538732082466595e-05, + "grad_norm": 29.452272415161133, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8930724263191223, + "num_tokens": 580591767.0, + "step": 15222 + }, + { + "epoch": 1.9365220709833355, + "ewc_loss": 0.04521017149090767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260508517792914e-05, + "grad_norm": 29.310407638549805, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8751965761184692, + "num_tokens": 580633180.0, + "step": 15223 + }, + { + "epoch": 1.9366492812619258, + "ewc_loss": 0.045153189450502396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2576594346901402e-05, + "grad_norm": 29.415849685668945, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8687506914138794, + "num_tokens": 580677045.0, + "step": 15224 + }, + { + "epoch": 1.9367764915405163, + "ewc_loss": 0.04521994665265083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260997280245647e-05, + "grad_norm": 29.419239044189453, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8802116513252258, + "num_tokens": 580717423.0, + "step": 15225 + }, + { + "epoch": 
1.9369037018191069, + "ewc_loss": 0.045071836560964584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253591810585931e-05, + "grad_norm": 29.252506256103516, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8718717694282532, + "num_tokens": 580756677.0, + "step": 15226 + }, + { + "epoch": 1.9370309120976974, + "ewc_loss": 0.04514307528734207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2571537556359544e-05, + "grad_norm": 29.462438583374023, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8722434043884277, + "num_tokens": 580796993.0, + "step": 15227 + }, + { + "epoch": 1.937158122376288, + "ewc_loss": 0.045195404440164566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2597701899940148e-05, + "grad_norm": 29.405872344970703, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8697494268417358, + "num_tokens": 580834616.0, + "step": 15228 + }, + { + "epoch": 1.9372853326548785, + "ewc_loss": 0.04508039355278015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254019636893645e-05, + "grad_norm": 29.381145477294922, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8703970313072205, + "num_tokens": 580873718.0, + "step": 15229 + }, + { + "epoch": 1.937412542933469, + "ewc_loss": 0.045086149126291275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254307401017286e-05, + "grad_norm": 29.237998962402344, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8621469736099243, + "num_tokens": 580911678.0, + "step": 15230 + }, + { + "epoch": 1.9375397532120595, + "ewc_loss": 0.04515881463885307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2579406504519284e-05, + "grad_norm": 29.406282424926758, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8738580346107483, + "num_tokens": 580953282.0, + "step": 15231 + }, + { + "epoch": 1.93766696349065, + "ewc_loss": 0.04513690993189812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2568454369320534e-05, + "grad_norm": 29.34682846069336, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.873003363609314, + "num_tokens": 580997954.0, + "step": 15232 + }, + { + "epoch": 1.9377941737692406, + "ewc_loss": 0.04518439993262291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2592199456994422e-05, + "grad_norm": 29.450542449951172, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8687208890914917, + "num_tokens": 581035623.0, + "step": 15233 + }, + { + "epoch": 1.937921384047831, + "ewc_loss": 0.04517747834324837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2588739739148878e-05, + "grad_norm": 29.41981315612793, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8771932125091553, + "num_tokens": 581068727.0, + "step": 15234 + }, + { + "epoch": 1.9380485943264216, + "ewc_loss": 0.04515063017606735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.257531559735071e-05, + "grad_norm": 29.517045974731445, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8658939003944397, + "num_tokens": 581111180.0, + "step": 15235 + }, + { + "epoch": 1.9381758046050122, + "ewc_loss": 0.04511360079050064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2556800104212016e-05, + "grad_norm": 29.457544326782227, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8475785255432129, + "num_tokens": 581152373.0, + "step": 15236 + }, + { + "epoch": 1.9383030148836027, + "ewc_loss": 0.04499837011098862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2499185433844104e-05, + "grad_norm": 29.33467674255371, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8649019598960876, + "num_tokens": 581195854.0, + "step": 15237 + }, + { + "epoch": 1.9384302251621932, + "ewc_loss": 0.045132774859666824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2566387997358106e-05, + "grad_norm": 29.4766845703125, + "learning_rate": 1e-06, + "loss": 0.5235, + 
"mean_token_accuracy": 0.8400287628173828, + "num_tokens": 581236414.0, + "step": 15238 + }, + { + "epoch": 1.9385574354407837, + "ewc_loss": 0.04507095366716385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2535476091434248e-05, + "grad_norm": 29.427885055541992, + "learning_rate": 1e-06, + "loss": 0.472, + "mean_token_accuracy": 0.8504467010498047, + "num_tokens": 581275349.0, + "step": 15239 + }, + { + "epoch": 1.9386846457193743, + "ewc_loss": 0.04502083733677864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2510419512400404e-05, + "grad_norm": 29.464447021484375, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.87473464012146, + "num_tokens": 581313231.0, + "step": 15240 + }, + { + "epoch": 1.9388118559979648, + "ewc_loss": 0.04507819563150406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2539097699336708e-05, + "grad_norm": 29.325334548950195, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8680630922317505, + "num_tokens": 581355295.0, + "step": 15241 + }, + { + "epoch": 1.938939066276555, + "ewc_loss": 0.04502011463046074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2510057533509098e-05, + "grad_norm": 29.316864013671875, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8708181381225586, + "num_tokens": 581392633.0, + "step": 15242 + }, + { + "epoch": 1.9390662765551456, + "ewc_loss": 0.04512939974665642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2564699975191616e-05, + "grad_norm": 29.43331527709961, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8812423944473267, + "num_tokens": 581425552.0, + "step": 15243 + }, + { + "epoch": 1.9391934868337362, + "ewc_loss": 0.045111991465091705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255599611089565e-05, + "grad_norm": 29.351810455322266, + "learning_rate": 1e-06, + "loss": 0.4727, + "mean_token_accuracy": 0.8553024530410767, + "num_tokens": 581467707.0, + "step": 15244 + }, + { + "epoch": 
1.9393206971123267, + "ewc_loss": 0.04512058570981026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2560292563866824e-05, + "grad_norm": 29.400711059570312, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8731576204299927, + "num_tokens": 581504888.0, + "step": 15245 + }, + { + "epoch": 1.9394479073909172, + "ewc_loss": 0.04518916830420494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.259458415210247e-05, + "grad_norm": 29.392662048339844, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8670706748962402, + "num_tokens": 581537800.0, + "step": 15246 + }, + { + "epoch": 1.9395751176695077, + "ewc_loss": 0.045091696083545685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2545847969013266e-05, + "grad_norm": 29.324182510375977, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8882744908332825, + "num_tokens": 581570594.0, + "step": 15247 + }, + { + "epoch": 1.939702327948098, + "ewc_loss": 0.04514167830348015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2570839064428583e-05, + "grad_norm": 29.4527587890625, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8605721592903137, + "num_tokens": 581610764.0, + "step": 15248 + }, + { + "epoch": 1.9398295382266886, + "ewc_loss": 0.04507794603705406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2538972189067863e-05, + "grad_norm": 29.29530906677246, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8674403429031372, + "num_tokens": 581652341.0, + "step": 15249 + }, + { + "epoch": 1.939956748505279, + "ewc_loss": 0.04517121985554695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2585609258385375e-05, + "grad_norm": 29.50008201599121, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8599694967269897, + "num_tokens": 581689978.0, + "step": 15250 + }, + { + "epoch": 1.9400839587838696, + "ewc_loss": 0.04515037313103676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2575186449103057e-05, + "grad_norm": 29.347702026367188, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8756124377250671, + "num_tokens": 581728811.0, + "step": 15251 + }, + { + "epoch": 1.9402111690624602, + "ewc_loss": 0.045141641050577164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2570820874534547e-05, + "grad_norm": 29.436126708984375, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8763298392295837, + "num_tokens": 581768016.0, + "step": 15252 + }, + { + "epoch": 1.9403383793410507, + "ewc_loss": 0.04518875479698181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2594376787310466e-05, + "grad_norm": 29.41672134399414, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8699652552604675, + "num_tokens": 581805405.0, + "step": 15253 + }, + { + "epoch": 1.9404655896196412, + "ewc_loss": 0.04509453848004341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2547268599737436e-05, + "grad_norm": 29.468868255615234, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8477067947387695, + "num_tokens": 581845302.0, + "step": 15254 + }, + { + "epoch": 1.9405927998982317, + "ewc_loss": 0.04511348530650139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2556741896551102e-05, + "grad_norm": 29.3289794921875, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.876073956489563, + "num_tokens": 581875391.0, + "step": 15255 + }, + { + "epoch": 1.9407200101768223, + "ewc_loss": 0.04518980532884598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.259490247524809e-05, + "grad_norm": 29.44365882873535, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8667691946029663, + "num_tokens": 581916580.0, + "step": 15256 + }, + { + "epoch": 1.9408472204554128, + "ewc_loss": 0.04512219503521919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2561098376172595e-05, + "grad_norm": 29.41304588317871, + "learning_rate": 1e-06, + "loss": 0.4269, + 
"mean_token_accuracy": 0.8679676055908203, + "num_tokens": 581958539.0, + "step": 15257 + }, + { + "epoch": 1.9409744307340033, + "ewc_loss": 0.045082464814186096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254123319289647e-05, + "grad_norm": 29.292091369628906, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8658087849617004, + "num_tokens": 581996981.0, + "step": 15258 + }, + { + "epoch": 1.9411016410125939, + "ewc_loss": 0.045074425637722015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2537213226314634e-05, + "grad_norm": 29.423355102539062, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8771450519561768, + "num_tokens": 582036068.0, + "step": 15259 + }, + { + "epoch": 1.9412288512911844, + "ewc_loss": 0.04520744830369949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260372457385529e-05, + "grad_norm": 29.371461868286133, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8702685832977295, + "num_tokens": 582073322.0, + "step": 15260 + }, + { + "epoch": 1.941356061569775, + "ewc_loss": 0.04512852057814598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2564259779755957e-05, + "grad_norm": 29.495805740356445, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8707336187362671, + "num_tokens": 582110736.0, + "step": 15261 + }, + { + "epoch": 1.9414832718483654, + "ewc_loss": 0.04513421282172203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2567106498172507e-05, + "grad_norm": 29.318201065063477, + "learning_rate": 1e-06, + "loss": 0.4783, + "mean_token_accuracy": 0.8508555889129639, + "num_tokens": 582153431.0, + "step": 15262 + }, + { + "epoch": 1.941610482126956, + "ewc_loss": 0.045107658952474594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2553829694516025e-05, + "grad_norm": 29.380455017089844, + "learning_rate": 1e-06, + "loss": 0.4888, + "mean_token_accuracy": 0.8493357300758362, + "num_tokens": 582191793.0, + "step": 15263 + }, + { + "epoch": 
1.9417376924055465, + "ewc_loss": 0.045193810015916824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2596905182581395e-05, + "grad_norm": 29.455772399902344, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8580551743507385, + "num_tokens": 582227923.0, + "step": 15264 + }, + { + "epoch": 1.941864902684137, + "ewc_loss": 0.04510178416967392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.25508920266293e-05, + "grad_norm": 29.324420928955078, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.8526266813278198, + "num_tokens": 582268029.0, + "step": 15265 + }, + { + "epoch": 1.9419921129627273, + "ewc_loss": 0.045141272246837616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2570635337615386e-05, + "grad_norm": 29.414106369018555, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8619790077209473, + "num_tokens": 582302173.0, + "step": 15266 + }, + { + "epoch": 1.9421193232413179, + "ewc_loss": 0.04513709247112274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2568547137780115e-05, + "grad_norm": 29.312665939331055, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8824977874755859, + "num_tokens": 582341642.0, + "step": 15267 + }, + { + "epoch": 1.9422465335199084, + "ewc_loss": 0.04519783705472946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.259891880385112e-05, + "grad_norm": 29.46207046508789, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8695761561393738, + "num_tokens": 582378960.0, + "step": 15268 + }, + { + "epoch": 1.942373743798499, + "ewc_loss": 0.04522458836436272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2612293832935393e-05, + "grad_norm": 29.365314483642578, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8748436570167542, + "num_tokens": 582422149.0, + "step": 15269 + }, + { + "epoch": 1.9425009540770894, + "ewc_loss": 0.045207783579826355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2603891920880415e-05, + "grad_norm": 29.552059173583984, + "learning_rate": 1e-06, + "loss": 0.4754, + "mean_token_accuracy": 0.8515391945838928, + "num_tokens": 582462657.0, + "step": 15270 + }, + { + "epoch": 1.94262816435568, + "ewc_loss": 0.04520213603973389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260106884932611e-05, + "grad_norm": 29.348758697509766, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8796796798706055, + "num_tokens": 582499264.0, + "step": 15271 + }, + { + "epoch": 1.9427553746342705, + "ewc_loss": 0.0450856052339077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.254280298075173e-05, + "grad_norm": 29.303434371948242, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8704210519790649, + "num_tokens": 582534744.0, + "step": 15272 + }, + { + "epoch": 1.9428825849128608, + "ewc_loss": 0.04523921757936478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.261960798932705e-05, + "grad_norm": 29.53093910217285, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8637259602546692, + "num_tokens": 582577010.0, + "step": 15273 + }, + { + "epoch": 1.9430097951914513, + "ewc_loss": 0.04519030451774597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2595151676796377e-05, + "grad_norm": 29.27983856201172, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8626266717910767, + "num_tokens": 582617103.0, + "step": 15274 + }, + { + "epoch": 1.9431370054700419, + "ewc_loss": 0.04507588967680931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253794446005486e-05, + "grad_norm": 29.416152954101562, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8612056970596313, + "num_tokens": 582661754.0, + "step": 15275 + }, + { + "epoch": 1.9432642157486324, + "ewc_loss": 0.04520035907626152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2600179363507777e-05, + "grad_norm": 29.296741485595703, + "learning_rate": 1e-06, + "loss": 0.4277, + 
"mean_token_accuracy": 0.8687279224395752, + "num_tokens": 582701701.0, + "step": 15276 + }, + { + "epoch": 1.943391426027223, + "ewc_loss": 0.04516519978642464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.258260064991191e-05, + "grad_norm": 29.4689998626709, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8593780994415283, + "num_tokens": 582741977.0, + "step": 15277 + }, + { + "epoch": 1.9435186363058135, + "ewc_loss": 0.04518253356218338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2591266315430403e-05, + "grad_norm": 29.37363624572754, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8764085173606873, + "num_tokens": 582783420.0, + "step": 15278 + }, + { + "epoch": 1.943645846584404, + "ewc_loss": 0.04515437036752701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2577185518457554e-05, + "grad_norm": 29.42043113708496, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8735542297363281, + "num_tokens": 582819464.0, + "step": 15279 + }, + { + "epoch": 1.9437730568629945, + "ewc_loss": 0.04511865973472595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2559330318472348e-05, + "grad_norm": 29.4113826751709, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8729435205459595, + "num_tokens": 582860757.0, + "step": 15280 + }, + { + "epoch": 1.943900267141585, + "ewc_loss": 0.0451631136238575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2581556549994275e-05, + "grad_norm": 29.352806091308594, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8621010780334473, + "num_tokens": 582910300.0, + "step": 15281 + }, + { + "epoch": 1.9440274774201756, + "ewc_loss": 0.04505682736635208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.252841295558028e-05, + "grad_norm": 29.31020164489746, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8600603342056274, + "num_tokens": 582956333.0, + "step": 15282 + }, + { + "epoch": 
1.944154687698766, + "ewc_loss": 0.0452432744204998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2621637981501408e-05, + "grad_norm": 29.474063873291016, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8757312893867493, + "num_tokens": 582994068.0, + "step": 15283 + }, + { + "epoch": 1.9442818979773566, + "ewc_loss": 0.04515567794442177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2577838535653427e-05, + "grad_norm": 29.458984375, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8653733730316162, + "num_tokens": 583027829.0, + "step": 15284 + }, + { + "epoch": 1.9444091082559471, + "ewc_loss": 0.04505447298288345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2527236069436185e-05, + "grad_norm": 29.378347396850586, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8655725717544556, + "num_tokens": 583068881.0, + "step": 15285 + }, + { + "epoch": 1.9445363185345377, + "ewc_loss": 0.04506910219788551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2534550225827843e-05, + "grad_norm": 29.44611167907715, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8715740442276001, + "num_tokens": 583100097.0, + "step": 15286 + }, + { + "epoch": 1.9446635288131282, + "ewc_loss": 0.045118916779756546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2559457647730596e-05, + "grad_norm": 29.443981170654297, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8667017817497253, + "num_tokens": 583135340.0, + "step": 15287 + }, + { + "epoch": 1.9447907390917187, + "ewc_loss": 0.045073702931404114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.253685124742333e-05, + "grad_norm": 29.465150833129883, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8659793138504028, + "num_tokens": 583172032.0, + "step": 15288 + }, + { + "epoch": 1.9449179493703093, + "ewc_loss": 0.04502222314476967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.251111072837375e-05, + "grad_norm": 29.27345085144043, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8712340593338013, + "num_tokens": 583205244.0, + "step": 15289 + }, + { + "epoch": 1.9450451596488998, + "ewc_loss": 0.04510384052991867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2551919755642302e-05, + "grad_norm": 29.41469383239746, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8632625341415405, + "num_tokens": 583247537.0, + "step": 15290 + }, + { + "epoch": 1.94517236992749, + "ewc_loss": 0.045100852847099304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255042636534199e-05, + "grad_norm": 29.280942916870117, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8605542778968811, + "num_tokens": 583285349.0, + "step": 15291 + }, + { + "epoch": 1.9452995802060806, + "ewc_loss": 0.045213762670755386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260688052047044e-05, + "grad_norm": 29.65323829650879, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8691966533660889, + "num_tokens": 583323810.0, + "step": 15292 + }, + { + "epoch": 1.9454267904846712, + "ewc_loss": 0.04519781842827797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2598909708904102e-05, + "grad_norm": 29.289920806884766, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8802753686904907, + "num_tokens": 583354244.0, + "step": 15293 + }, + { + "epoch": 1.9455540007632617, + "ewc_loss": 0.045104894787073135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255244726256933e-05, + "grad_norm": 29.510671615600586, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8809850215911865, + "num_tokens": 583391764.0, + "step": 15294 + }, + { + "epoch": 1.9456812110418522, + "ewc_loss": 0.045225780457258224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2612890461459756e-05, + "grad_norm": 29.34840202331543, + "learning_rate": 1e-06, + "loss": 0.4684, + 
"mean_token_accuracy": 0.8549484014511108, + "num_tokens": 583426305.0, + "step": 15295 + }, + { + "epoch": 1.9458084213204427, + "ewc_loss": 0.045116592198610306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255829531350173e-05, + "grad_norm": 29.41551971435547, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8636319637298584, + "num_tokens": 583465252.0, + "step": 15296 + }, + { + "epoch": 1.945935631599033, + "ewc_loss": 0.04524598643183708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262299312860705e-05, + "grad_norm": 29.272266387939453, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8586888313293457, + "num_tokens": 583510001.0, + "step": 15297 + }, + { + "epoch": 1.9460628418776236, + "ewc_loss": 0.045136794447898865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2568397980649024e-05, + "grad_norm": 29.364273071289062, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8701033592224121, + "num_tokens": 583547380.0, + "step": 15298 + }, + { + "epoch": 1.946190052156214, + "ewc_loss": 0.04534469544887543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.267234776809346e-05, + "grad_norm": 29.46624755859375, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8686582446098328, + "num_tokens": 583588852.0, + "step": 15299 + }, + { + "epoch": 1.9463172624348046, + "ewc_loss": 0.04520409554243088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260204746562522e-05, + "grad_norm": 29.505598068237305, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8743784427642822, + "num_tokens": 583634006.0, + "step": 15300 + }, + { + "epoch": 1.9464444727133952, + "ewc_loss": 0.04518992826342583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.259496432088781e-05, + "grad_norm": 29.303163528442383, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8540383577346802, + "num_tokens": 583679430.0, + "step": 15301 + }, + { + "epoch": 
1.9465716829919857, + "ewc_loss": 0.04522136598825455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.261068220832385e-05, + "grad_norm": 29.432147979736328, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8735634088516235, + "num_tokens": 583708856.0, + "step": 15302 + }, + { + "epoch": 1.9466988932705762, + "ewc_loss": 0.04529507830739021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2647538571618497e-05, + "grad_norm": 29.45487403869629, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8659436106681824, + "num_tokens": 583750737.0, + "step": 15303 + }, + { + "epoch": 1.9468261035491667, + "ewc_loss": 0.04516775906085968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2583879399462603e-05, + "grad_norm": 29.565082550048828, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8731968402862549, + "num_tokens": 583786342.0, + "step": 15304 + }, + { + "epoch": 1.9469533138277573, + "ewc_loss": 0.045187514275312424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2593756511923857e-05, + "grad_norm": 29.425203323364258, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8730298280715942, + "num_tokens": 583821681.0, + "step": 15305 + }, + { + "epoch": 1.9470805241063478, + "ewc_loss": 0.04509298875927925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2546493710251525e-05, + "grad_norm": 29.54621124267578, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.84974604845047, + "num_tokens": 583855640.0, + "step": 15306 + }, + { + "epoch": 1.9472077343849383, + "ewc_loss": 0.045221541076898575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2610771338804625e-05, + "grad_norm": 29.451059341430664, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8780688643455505, + "num_tokens": 583888047.0, + "step": 15307 + }, + { + "epoch": 1.9473349446635289, + "ewc_loss": 0.045101530849933624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.255076469737105e-05, + "grad_norm": 29.42059898376465, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8658432960510254, + "num_tokens": 583923220.0, + "step": 15308 + }, + { + "epoch": 1.9474621549421194, + "ewc_loss": 0.045169949531555176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2584974431083538e-05, + "grad_norm": 29.38203239440918, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.862649142742157, + "num_tokens": 583965718.0, + "step": 15309 + }, + { + "epoch": 1.94758936522071, + "ewc_loss": 0.04511186107993126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.255593062727712e-05, + "grad_norm": 29.421377182006836, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8723726868629456, + "num_tokens": 584014027.0, + "step": 15310 + }, + { + "epoch": 1.9477165754993004, + "ewc_loss": 0.04517684131860733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2588421416003257e-05, + "grad_norm": 29.361129760742188, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8714824914932251, + "num_tokens": 584045019.0, + "step": 15311 + }, + { + "epoch": 1.947843785777891, + "ewc_loss": 0.045167475938797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2583737518289126e-05, + "grad_norm": 29.401731491088867, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8515708446502686, + "num_tokens": 584086839.0, + "step": 15312 + }, + { + "epoch": 1.9479709960564815, + "ewc_loss": 0.04525875300168991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2629375962424092e-05, + "grad_norm": 29.3656005859375, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.882445752620697, + "num_tokens": 584123785.0, + "step": 15313 + }, + { + "epoch": 1.948098206335072, + "ewc_loss": 0.045256152749061584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262807720398996e-05, + "grad_norm": 29.388803482055664, + "learning_rate": 1e-06, + "loss": 0.4485, + 
"mean_token_accuracy": 0.8586773872375488, + "num_tokens": 584163768.0, + "step": 15314 + }, + { + "epoch": 1.9482254166136623, + "ewc_loss": 0.04520672187209129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260336077597458e-05, + "grad_norm": 29.3832950592041, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8646371364593506, + "num_tokens": 584201668.0, + "step": 15315 + }, + { + "epoch": 1.9483526268922529, + "ewc_loss": 0.04519476741552353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2597383576794527e-05, + "grad_norm": 29.338293075561523, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8756692409515381, + "num_tokens": 584236626.0, + "step": 15316 + }, + { + "epoch": 1.9484798371708434, + "ewc_loss": 0.04533407464623451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266703813802451e-05, + "grad_norm": 29.422019958496094, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8517200946807861, + "num_tokens": 584279811.0, + "step": 15317 + }, + { + "epoch": 1.948607047449434, + "ewc_loss": 0.04528217762708664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2641088435193524e-05, + "grad_norm": 29.44013786315918, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8678750395774841, + "num_tokens": 584317314.0, + "step": 15318 + }, + { + "epoch": 1.9487342577280244, + "ewc_loss": 0.04520244151353836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2601219825446606e-05, + "grad_norm": 29.455278396606445, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8730608224868774, + "num_tokens": 584355768.0, + "step": 15319 + }, + { + "epoch": 1.948861468006615, + "ewc_loss": 0.045255858451128006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2627929865848273e-05, + "grad_norm": 29.466064453125, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8649424910545349, + "num_tokens": 584386205.0, + "step": 15320 + }, + { + "epoch": 
1.9489886782852053, + "ewc_loss": 0.04517906531691551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2589532818528824e-05, + "grad_norm": 29.43726921081543, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8800761699676514, + "num_tokens": 584419721.0, + "step": 15321 + }, + { + "epoch": 1.9491158885637958, + "ewc_loss": 0.045206815004348755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260340806969907e-05, + "grad_norm": 29.511810302734375, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8657301664352417, + "num_tokens": 584451029.0, + "step": 15322 + }, + { + "epoch": 1.9492430988423863, + "ewc_loss": 0.045240845531225204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262042289657984e-05, + "grad_norm": 29.490196228027344, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8815903663635254, + "num_tokens": 584483991.0, + "step": 15323 + }, + { + "epoch": 1.9493703091209769, + "ewc_loss": 0.04523121938109398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2615609850618057e-05, + "grad_norm": 29.51761245727539, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8660765290260315, + "num_tokens": 584517652.0, + "step": 15324 + }, + { + "epoch": 1.9494975193995674, + "ewc_loss": 0.04523817077279091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2619085939368233e-05, + "grad_norm": 29.435680389404297, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8762632012367249, + "num_tokens": 584561058.0, + "step": 15325 + }, + { + "epoch": 1.949624729678158, + "ewc_loss": 0.04516902565956116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2584512407775037e-05, + "grad_norm": 29.455015182495117, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8710408806800842, + "num_tokens": 584595097.0, + "step": 15326 + }, + { + "epoch": 1.9497519399567484, + "ewc_loss": 0.04525909572839737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.262954876641743e-05, + "grad_norm": 29.40445899963379, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8744678497314453, + "num_tokens": 584639073.0, + "step": 15327 + }, + { + "epoch": 1.949879150235339, + "ewc_loss": 0.04522804915904999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2614023691858165e-05, + "grad_norm": 29.582494735717773, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8580037355422974, + "num_tokens": 584680517.0, + "step": 15328 + }, + { + "epoch": 1.9500063605139295, + "ewc_loss": 0.04522073268890381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2610365704167634e-05, + "grad_norm": 29.40932846069336, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8669676780700684, + "num_tokens": 584717796.0, + "step": 15329 + }, + { + "epoch": 1.95013357079252, + "ewc_loss": 0.04518468677997589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2592343157157302e-05, + "grad_norm": 29.36321258544922, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.881126880645752, + "num_tokens": 584755595.0, + "step": 15330 + }, + { + "epoch": 1.9502607810711106, + "ewc_loss": 0.045299969613552094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2649985112366267e-05, + "grad_norm": 29.48622703552246, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8571979999542236, + "num_tokens": 584798681.0, + "step": 15331 + }, + { + "epoch": 1.950387991349701, + "ewc_loss": 0.04523074999451637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2615375200985e-05, + "grad_norm": 29.379213333129883, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8706775307655334, + "num_tokens": 584835424.0, + "step": 15332 + }, + { + "epoch": 1.9505152016282916, + "ewc_loss": 0.04527595639228821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.263797796331346e-05, + "grad_norm": 29.4141902923584, + "learning_rate": 1e-06, + "loss": 0.3879, + 
"mean_token_accuracy": 0.8812845945358276, + "num_tokens": 584872063.0, + "step": 15333 + }, + { + "epoch": 1.9506424119068821, + "ewc_loss": 0.04529069364070892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2645346689387225e-05, + "grad_norm": 29.45812225341797, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8829949498176575, + "num_tokens": 584905850.0, + "step": 15334 + }, + { + "epoch": 1.9507696221854727, + "ewc_loss": 0.04524441435933113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2622207325184718e-05, + "grad_norm": 29.483613967895508, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8771222233772278, + "num_tokens": 584946480.0, + "step": 15335 + }, + { + "epoch": 1.9508968324640632, + "ewc_loss": 0.04534781351685524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2673906642012298e-05, + "grad_norm": 29.535507202148438, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8748906254768372, + "num_tokens": 584977382.0, + "step": 15336 + }, + { + "epoch": 1.9510240427426537, + "ewc_loss": 0.045249078422784805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2624539269600064e-05, + "grad_norm": 29.4399356842041, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8703405857086182, + "num_tokens": 585010885.0, + "step": 15337 + }, + { + "epoch": 1.9511512530212443, + "ewc_loss": 0.04520546644926071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2602733224630356e-05, + "grad_norm": 29.39957046508789, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8658077716827393, + "num_tokens": 585048208.0, + "step": 15338 + }, + { + "epoch": 1.9512784632998348, + "ewc_loss": 0.04523269459605217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2616346541326493e-05, + "grad_norm": 29.444244384765625, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8638361692428589, + "num_tokens": 585087477.0, + "step": 15339 + }, + { + "epoch": 
1.951405673578425, + "ewc_loss": 0.04524775221943855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262387533846777e-05, + "grad_norm": 29.454875946044922, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8671092391014099, + "num_tokens": 585120860.0, + "step": 15340 + }, + { + "epoch": 1.9515328838570156, + "ewc_loss": 0.0452410951256752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2620548406848684e-05, + "grad_norm": 29.488479614257812, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8700660467147827, + "num_tokens": 585161887.0, + "step": 15341 + }, + { + "epoch": 1.9516600941356061, + "ewc_loss": 0.04521225392818451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2606127458857372e-05, + "grad_norm": 29.478670120239258, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8765323162078857, + "num_tokens": 585200496.0, + "step": 15342 + }, + { + "epoch": 1.9517873044141967, + "ewc_loss": 0.045190244913101196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.259512257296592e-05, + "grad_norm": 29.307126998901367, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8684553503990173, + "num_tokens": 585243976.0, + "step": 15343 + }, + { + "epoch": 1.9519145146927872, + "ewc_loss": 0.04526063799858093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.263031819893513e-05, + "grad_norm": 29.5003604888916, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8629378080368042, + "num_tokens": 585282366.0, + "step": 15344 + }, + { + "epoch": 1.9520417249713777, + "ewc_loss": 0.045283686369657516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2641843315795995e-05, + "grad_norm": 29.39949607849121, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8674488067626953, + "num_tokens": 585325224.0, + "step": 15345 + }, + { + "epoch": 1.952168935249968, + "ewc_loss": 0.045184846967458725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2592423192691058e-05, + "grad_norm": 29.40072250366211, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8767195343971252, + "num_tokens": 585361954.0, + "step": 15346 + }, + { + "epoch": 1.9522961455285586, + "ewc_loss": 0.045269157737493515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2634578272118233e-05, + "grad_norm": 29.471357345581055, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8648877143859863, + "num_tokens": 585406033.0, + "step": 15347 + }, + { + "epoch": 1.952423355807149, + "ewc_loss": 0.04527789726853371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2638949303654954e-05, + "grad_norm": 29.435672760009766, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8670951724052429, + "num_tokens": 585440457.0, + "step": 15348 + }, + { + "epoch": 1.9525505660857396, + "ewc_loss": 0.045240260660648346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262013003928587e-05, + "grad_norm": 29.468921661376953, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8903887867927551, + "num_tokens": 585473451.0, + "step": 15349 + }, + { + "epoch": 1.9526777763643302, + "ewc_loss": 0.04516778513789177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2583892132388428e-05, + "grad_norm": 29.3160457611084, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8717304468154907, + "num_tokens": 585509652.0, + "step": 15350 + }, + { + "epoch": 1.9528049866429207, + "ewc_loss": 0.04524453729391098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262226917082444e-05, + "grad_norm": 29.490188598632812, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8727882504463196, + "num_tokens": 585545591.0, + "step": 15351 + }, + { + "epoch": 1.9529321969215112, + "ewc_loss": 0.04529249668121338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2646248908131383e-05, + "grad_norm": 29.537755966186523, + "learning_rate": 1e-06, + "loss": 0.4654, + 
"mean_token_accuracy": 0.8560280203819275, + "num_tokens": 585583757.0, + "step": 15352 + }, + { + "epoch": 1.9530594072001017, + "ewc_loss": 0.0452093631029129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2604681362281553e-05, + "grad_norm": 29.38220977783203, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8570495843887329, + "num_tokens": 585623897.0, + "step": 15353 + }, + { + "epoch": 1.9531866174786923, + "ewc_loss": 0.04529068246483803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2645341232419014e-05, + "grad_norm": 29.59636116027832, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8702601194381714, + "num_tokens": 585661043.0, + "step": 15354 + }, + { + "epoch": 1.9533138277572828, + "ewc_loss": 0.045244812965393066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2622407414019108e-05, + "grad_norm": 29.529983520507812, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8413280248641968, + "num_tokens": 585699201.0, + "step": 15355 + }, + { + "epoch": 1.9534410380358733, + "ewc_loss": 0.04516709968447685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.258355016238056e-05, + "grad_norm": 29.484102249145508, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8604453206062317, + "num_tokens": 585735325.0, + "step": 15356 + }, + { + "epoch": 1.9535682483144639, + "ewc_loss": 0.04522330313920975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.261165172967594e-05, + "grad_norm": 29.51500129699707, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8755151033401489, + "num_tokens": 585769909.0, + "step": 15357 + }, + { + "epoch": 1.9536954585930544, + "ewc_loss": 0.045235902070999146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.261795088998042e-05, + "grad_norm": 29.53956413269043, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8573206067085266, + "num_tokens": 585804590.0, + "step": 15358 + }, + { + "epoch": 
1.953822668871645, + "ewc_loss": 0.045207634568214417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260381734231487e-05, + "grad_norm": 29.567386627197266, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8636362552642822, + "num_tokens": 585843349.0, + "step": 15359 + }, + { + "epoch": 1.9539498791502354, + "ewc_loss": 0.04514310508966446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2571552108274773e-05, + "grad_norm": 29.444543838500977, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8806273341178894, + "num_tokens": 585881846.0, + "step": 15360 + }, + { + "epoch": 1.954077089428826, + "ewc_loss": 0.04523930326104164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2619651645072736e-05, + "grad_norm": 29.572128295898438, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8658868074417114, + "num_tokens": 585917848.0, + "step": 15361 + }, + { + "epoch": 1.9542042997074165, + "ewc_loss": 0.04511965438723564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2559826902579516e-05, + "grad_norm": 29.3630313873291, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8738304376602173, + "num_tokens": 585961249.0, + "step": 15362 + }, + { + "epoch": 1.954331509986007, + "ewc_loss": 0.04518227279186249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.259113716718275e-05, + "grad_norm": 29.459083557128906, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.876611590385437, + "num_tokens": 586005795.0, + "step": 15363 + }, + { + "epoch": 1.9544587202645973, + "ewc_loss": 0.04521476477384567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260738256154582e-05, + "grad_norm": 29.353918075561523, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8769152164459229, + "num_tokens": 586041523.0, + "step": 15364 + }, + { + "epoch": 1.9545859305431879, + "ewc_loss": 0.04521757364273071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2608786821365356e-05, + "grad_norm": 29.422740936279297, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8601099252700806, + "num_tokens": 586077800.0, + "step": 15365 + }, + { + "epoch": 1.9547131408217784, + "ewc_loss": 0.04520353674888611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2601769160246477e-05, + "grad_norm": 29.39261245727539, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8630199432373047, + "num_tokens": 586116010.0, + "step": 15366 + }, + { + "epoch": 1.954840351100369, + "ewc_loss": 0.04522263631224632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2611318854615092e-05, + "grad_norm": 29.4320125579834, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8802698850631714, + "num_tokens": 586155359.0, + "step": 15367 + }, + { + "epoch": 1.9549675613789594, + "ewc_loss": 0.04527590423822403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.263795249746181e-05, + "grad_norm": 29.487516403198242, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8729669451713562, + "num_tokens": 586188190.0, + "step": 15368 + }, + { + "epoch": 1.95509477165755, + "ewc_loss": 0.045178014785051346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.25890071305912e-05, + "grad_norm": 29.36876106262207, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8680764436721802, + "num_tokens": 586225971.0, + "step": 15369 + }, + { + "epoch": 1.9552219819361403, + "ewc_loss": 0.04525960981845856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262980524392333e-05, + "grad_norm": 29.490514755249023, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8647110462188721, + "num_tokens": 586263295.0, + "step": 15370 + }, + { + "epoch": 1.9553491922147308, + "ewc_loss": 0.04518033564090729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.259016764583066e-05, + "grad_norm": 29.461505889892578, + "learning_rate": 1e-06, + "loss": 0.3593, + 
"mean_token_accuracy": 0.890297532081604, + "num_tokens": 586293547.0, + "step": 15371 + }, + { + "epoch": 1.9554764024933213, + "ewc_loss": 0.045143973082304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.257198684674222e-05, + "grad_norm": 29.446168899536133, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8640279769897461, + "num_tokens": 586333180.0, + "step": 15372 + }, + { + "epoch": 1.9556036127719119, + "ewc_loss": 0.045276496559381485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.263824899273459e-05, + "grad_norm": 29.54144859313965, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8694652318954468, + "num_tokens": 586368780.0, + "step": 15373 + }, + { + "epoch": 1.9557308230505024, + "ewc_loss": 0.045153722167015076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2576861738343723e-05, + "grad_norm": 29.42841339111328, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8531101942062378, + "num_tokens": 586404177.0, + "step": 15374 + }, + { + "epoch": 1.955858033329093, + "ewc_loss": 0.04525475203990936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2627376893069595e-05, + "grad_norm": 29.42989730834961, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8535524606704712, + "num_tokens": 586446917.0, + "step": 15375 + }, + { + "epoch": 1.9559852436076834, + "ewc_loss": 0.045285727828741074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2642863768851385e-05, + "grad_norm": 29.451026916503906, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.86920166015625, + "num_tokens": 586485832.0, + "step": 15376 + }, + { + "epoch": 1.956112453886274, + "ewc_loss": 0.04532775655388832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266387855343055e-05, + "grad_norm": 29.39203643798828, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8785215020179749, + "num_tokens": 586524089.0, + "step": 15377 + }, + { + "epoch": 
1.9562396641648645, + "ewc_loss": 0.045277826488018036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2638912923866883e-05, + "grad_norm": 29.452880859375, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8726688027381897, + "num_tokens": 586564097.0, + "step": 15378 + }, + { + "epoch": 1.956366874443455, + "ewc_loss": 0.04527856782078743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2639283997705206e-05, + "grad_norm": 29.41057586669922, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.881457507610321, + "num_tokens": 586600653.0, + "step": 15379 + }, + { + "epoch": 1.9564940847220456, + "ewc_loss": 0.04533912241458893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2669561076327227e-05, + "grad_norm": 29.43414878845215, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8750499486923218, + "num_tokens": 586639527.0, + "step": 15380 + }, + { + "epoch": 1.956621295000636, + "ewc_loss": 0.04532909393310547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266454612254165e-05, + "grad_norm": 29.368825912475586, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8743876218795776, + "num_tokens": 586670885.0, + "step": 15381 + }, + { + "epoch": 1.9567485052792266, + "ewc_loss": 0.04537561908364296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.26878091780236e-05, + "grad_norm": 29.530824661254883, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8813719749450684, + "num_tokens": 586709875.0, + "step": 15382 + }, + { + "epoch": 1.9568757155578171, + "ewc_loss": 0.04536906257271767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.268453135911841e-05, + "grad_norm": 29.256433486938477, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8618278503417969, + "num_tokens": 586753164.0, + "step": 15383 + }, + { + "epoch": 1.9570029258364077, + "ewc_loss": 0.04526705667376518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2633528715232387e-05, + "grad_norm": 29.536392211914062, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8753523230552673, + "num_tokens": 586793737.0, + "step": 15384 + }, + { + "epoch": 1.9571301361149982, + "ewc_loss": 0.045369621366262436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2684811483486556e-05, + "grad_norm": 29.334545135498047, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.863205075263977, + "num_tokens": 586833428.0, + "step": 15385 + }, + { + "epoch": 1.9572573463935887, + "ewc_loss": 0.045280229300260544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2640115275862627e-05, + "grad_norm": 29.411548614501953, + "learning_rate": 1e-06, + "loss": 0.5083, + "mean_token_accuracy": 0.8459815979003906, + "num_tokens": 586871202.0, + "step": 15386 + }, + { + "epoch": 1.9573845566721793, + "ewc_loss": 0.045393191277980804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2696594896842726e-05, + "grad_norm": 29.486892700195312, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8788580298423767, + "num_tokens": 586905537.0, + "step": 15387 + }, + { + "epoch": 1.9575117669507698, + "ewc_loss": 0.04542398080229759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2711990823154338e-05, + "grad_norm": 29.350465774536133, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8578433394432068, + "num_tokens": 586940378.0, + "step": 15388 + }, + { + "epoch": 1.95763897722936, + "ewc_loss": 0.045269355177879333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2634678316535428e-05, + "grad_norm": 29.4631404876709, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8789471983909607, + "num_tokens": 586978806.0, + "step": 15389 + }, + { + "epoch": 1.9577661875079506, + "ewc_loss": 0.04536205530166626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.268102798552718e-05, + "grad_norm": 29.345073699951172, + "learning_rate": 1e-06, + "loss": 0.4442, + 
"mean_token_accuracy": 0.8612468838691711, + "num_tokens": 587007367.0, + "step": 15390 + }, + { + "epoch": 1.9578933977865411, + "ewc_loss": 0.045340392738580704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2670195903629065e-05, + "grad_norm": 29.457002639770508, + "learning_rate": 1e-06, + "loss": 0.362, + "mean_token_accuracy": 0.8912474513053894, + "num_tokens": 587040422.0, + "step": 15391 + }, + { + "epoch": 1.9580206080651317, + "ewc_loss": 0.045380886644124985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2690443074679933e-05, + "grad_norm": 29.489253997802734, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8861770629882812, + "num_tokens": 587070238.0, + "step": 15392 + }, + { + "epoch": 1.9581478183437222, + "ewc_loss": 0.04533540457487106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2667702069156803e-05, + "grad_norm": 29.384206771850586, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8748371601104736, + "num_tokens": 587105973.0, + "step": 15393 + }, + { + "epoch": 1.9582750286223127, + "ewc_loss": 0.0454435870051384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2721793357050046e-05, + "grad_norm": 29.530569076538086, + "learning_rate": 1e-06, + "loss": 0.3803, + "mean_token_accuracy": 0.881574273109436, + "num_tokens": 587137975.0, + "step": 15394 + }, + { + "epoch": 1.958402238900903, + "ewc_loss": 0.045401278883218765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270064032927621e-05, + "grad_norm": 29.491003036499023, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8643436431884766, + "num_tokens": 587169176.0, + "step": 15395 + }, + { + "epoch": 1.9585294491794936, + "ewc_loss": 0.04540148004889488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2700740373693407e-05, + "grad_norm": 29.490631103515625, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8667404055595398, + "num_tokens": 587205570.0, + "step": 15396 + }, + { + "epoch": 
1.958656659458084, + "ewc_loss": 0.045424479991197586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2712240024702623e-05, + "grad_norm": 29.50483512878418, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8606515526771545, + "num_tokens": 587246729.0, + "step": 15397 + }, + { + "epoch": 1.9587838697366746, + "ewc_loss": 0.04549860581755638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2749303752789274e-05, + "grad_norm": 29.394424438476562, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8729713559150696, + "num_tokens": 587284537.0, + "step": 15398 + }, + { + "epoch": 1.9589110800152651, + "ewc_loss": 0.04546860232949257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2734300728188828e-05, + "grad_norm": 29.56772804260254, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8732078671455383, + "num_tokens": 587324394.0, + "step": 15399 + }, + { + "epoch": 1.9590382902938557, + "ewc_loss": 0.04550756886601448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2753783923690207e-05, + "grad_norm": 29.420381546020508, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8761084675788879, + "num_tokens": 587359935.0, + "step": 15400 + }, + { + "epoch": 1.9591655005724462, + "ewc_loss": 0.04536214843392372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.268107346026227e-05, + "grad_norm": 29.483558654785156, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8806883692741394, + "num_tokens": 587402081.0, + "step": 15401 + }, + { + "epoch": 1.9592927108510367, + "ewc_loss": 0.04543896019458771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2719479602528736e-05, + "grad_norm": 29.33170509338379, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8657064437866211, + "num_tokens": 587442939.0, + "step": 15402 + }, + { + "epoch": 1.9594199211296273, + "ewc_loss": 0.04537727311253548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2688636818202212e-05, + "grad_norm": 29.51346778869629, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8607336282730103, + "num_tokens": 587479155.0, + "step": 15403 + }, + { + "epoch": 1.9595471314082178, + "ewc_loss": 0.04544760659337044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2723803340340964e-05, + "grad_norm": 29.419513702392578, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8581631183624268, + "num_tokens": 587519285.0, + "step": 15404 + }, + { + "epoch": 1.9596743416868083, + "ewc_loss": 0.04529242962598801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2646214347332716e-05, + "grad_norm": 29.38119888305664, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8510416746139526, + "num_tokens": 587563978.0, + "step": 15405 + }, + { + "epoch": 1.9598015519653988, + "ewc_loss": 0.04548664763569832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2743322915630415e-05, + "grad_norm": 29.50931739807129, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.862671434879303, + "num_tokens": 587599052.0, + "step": 15406 + }, + { + "epoch": 1.9599287622439894, + "ewc_loss": 0.045464251190423965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2732125216862187e-05, + "grad_norm": 29.303607940673828, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8723409175872803, + "num_tokens": 587639980.0, + "step": 15407 + }, + { + "epoch": 1.96005597252258, + "ewc_loss": 0.04540787637233734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270393815706484e-05, + "grad_norm": 29.65562629699707, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8645662069320679, + "num_tokens": 587673836.0, + "step": 15408 + }, + { + "epoch": 1.9601831828011704, + "ewc_loss": 0.04549124091863632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2745620299247093e-05, + "grad_norm": 29.392658233642578, + "learning_rate": 1e-06, + "loss": 0.4173, + 
"mean_token_accuracy": 0.8685036301612854, + "num_tokens": 587712896.0, + "step": 15409 + }, + { + "epoch": 1.960310393079761, + "ewc_loss": 0.04531867057085037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2659334717900492e-05, + "grad_norm": 29.568096160888672, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8622168898582458, + "num_tokens": 587747352.0, + "step": 15410 + }, + { + "epoch": 1.9604376033583515, + "ewc_loss": 0.04541301727294922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270650838909205e-05, + "grad_norm": 29.407075881958008, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8674039840698242, + "num_tokens": 587790234.0, + "step": 15411 + }, + { + "epoch": 1.960564813636942, + "ewc_loss": 0.045306626707315445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2653313862974755e-05, + "grad_norm": 29.42951202392578, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8822071552276611, + "num_tokens": 587830185.0, + "step": 15412 + }, + { + "epoch": 1.9606920239155323, + "ewc_loss": 0.04540351405739784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270175718876999e-05, + "grad_norm": 29.365386962890625, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8700065612792969, + "num_tokens": 587866265.0, + "step": 15413 + }, + { + "epoch": 1.9608192341941229, + "ewc_loss": 0.04531354084610939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2656769942841493e-05, + "grad_norm": 29.42218589782715, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8802334666252136, + "num_tokens": 587904956.0, + "step": 15414 + }, + { + "epoch": 1.9609464444727134, + "ewc_loss": 0.045410364866256714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2705182345816866e-05, + "grad_norm": 29.382070541381836, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8748825788497925, + "num_tokens": 587942369.0, + "step": 15415 + }, + { + "epoch": 
1.961073654751304, + "ewc_loss": 0.04539784416556358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.269892138428986e-05, + "grad_norm": 29.471817016601562, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8745484352111816, + "num_tokens": 587979340.0, + "step": 15416 + }, + { + "epoch": 1.9612008650298944, + "ewc_loss": 0.04551294073462486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2756470571039245e-05, + "grad_norm": 29.443866729736328, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8643237352371216, + "num_tokens": 588021812.0, + "step": 15417 + }, + { + "epoch": 1.961328075308485, + "ewc_loss": 0.04538808390498161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2694041035720147e-05, + "grad_norm": 29.51082992553711, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8648219108581543, + "num_tokens": 588064978.0, + "step": 15418 + }, + { + "epoch": 1.9614552855870753, + "ewc_loss": 0.04537928104400635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.268964090035297e-05, + "grad_norm": 29.46649932861328, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8645980358123779, + "num_tokens": 588098142.0, + "step": 15419 + }, + { + "epoch": 1.9615824958656658, + "ewc_loss": 0.04534190148115158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2670950784231536e-05, + "grad_norm": 29.432470321655273, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8766361474990845, + "num_tokens": 588141808.0, + "step": 15420 + }, + { + "epoch": 1.9617097061442563, + "ewc_loss": 0.04543354734778404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271677294629626e-05, + "grad_norm": 29.638837814331055, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8755814433097839, + "num_tokens": 588178137.0, + "step": 15421 + }, + { + "epoch": 1.9618369164228469, + "ewc_loss": 0.04530497267842293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2652486222796142e-05, + "grad_norm": 29.442480087280273, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8624606728553772, + "num_tokens": 588215646.0, + "step": 15422 + }, + { + "epoch": 1.9619641267014374, + "ewc_loss": 0.04525146633386612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262573252664879e-05, + "grad_norm": 29.416881561279297, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8614786267280579, + "num_tokens": 588251345.0, + "step": 15423 + }, + { + "epoch": 1.962091336980028, + "ewc_loss": 0.04532436281442642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266218143631704e-05, + "grad_norm": 29.457395553588867, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8746273517608643, + "num_tokens": 588291222.0, + "step": 15424 + }, + { + "epoch": 1.9622185472586184, + "ewc_loss": 0.04530175030231476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.26508745981846e-05, + "grad_norm": 29.457754135131836, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8600075244903564, + "num_tokens": 588329480.0, + "step": 15425 + }, + { + "epoch": 1.962345757537209, + "ewc_loss": 0.045349087566137314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.267454328830354e-05, + "grad_norm": 29.52775001525879, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8599885702133179, + "num_tokens": 588367143.0, + "step": 15426 + }, + { + "epoch": 1.9624729678157995, + "ewc_loss": 0.04535234719514847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2676173102809116e-05, + "grad_norm": 29.43877410888672, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8770459890365601, + "num_tokens": 588408363.0, + "step": 15427 + }, + { + "epoch": 1.96260017809439, + "ewc_loss": 0.04531725496053696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2658627131022513e-05, + "grad_norm": 29.554264068603516, + "learning_rate": 1e-06, + "loss": 0.4193, + 
"mean_token_accuracy": 0.875696063041687, + "num_tokens": 588445397.0, + "step": 15428 + }, + { + "epoch": 1.9627273883729806, + "ewc_loss": 0.04530801251530647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2654006897937506e-05, + "grad_norm": 29.556177139282227, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8747979402542114, + "num_tokens": 588480436.0, + "step": 15429 + }, + { + "epoch": 1.962854598651571, + "ewc_loss": 0.04525339603424072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2626698410022072e-05, + "grad_norm": 29.39731788635254, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8676901459693909, + "num_tokens": 588526918.0, + "step": 15430 + }, + { + "epoch": 1.9629818089301616, + "ewc_loss": 0.04535197094082832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.267598574690055e-05, + "grad_norm": 29.63002586364746, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8687995076179504, + "num_tokens": 588561300.0, + "step": 15431 + }, + { + "epoch": 1.9631090192087521, + "ewc_loss": 0.04536937549710274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2684687792207114e-05, + "grad_norm": 29.482728958129883, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.858539879322052, + "num_tokens": 588603393.0, + "step": 15432 + }, + { + "epoch": 1.9632362294873427, + "ewc_loss": 0.04520942643284798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2604714104090817e-05, + "grad_norm": 29.478302001953125, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8761301040649414, + "num_tokens": 588643806.0, + "step": 15433 + }, + { + "epoch": 1.9633634397659332, + "ewc_loss": 0.04528268054127693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2641339455731213e-05, + "grad_norm": 29.4737606048584, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8681648969650269, + "num_tokens": 588685090.0, + "step": 15434 + }, + { + "epoch": 
1.9634906500445237, + "ewc_loss": 0.045231763273477554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2615880880039185e-05, + "grad_norm": 29.47809600830078, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8566986322402954, + "num_tokens": 588720983.0, + "step": 15435 + }, + { + "epoch": 1.9636178603231143, + "ewc_loss": 0.04528573900461197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2642869225819595e-05, + "grad_norm": 29.443769454956055, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8614118099212646, + "num_tokens": 588760143.0, + "step": 15436 + }, + { + "epoch": 1.9637450706017048, + "ewc_loss": 0.045297443866729736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2648722733720206e-05, + "grad_norm": 29.45777702331543, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8708534240722656, + "num_tokens": 588796636.0, + "step": 15437 + }, + { + "epoch": 1.963872280880295, + "ewc_loss": 0.0453217588365078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2660879039904103e-05, + "grad_norm": 29.570480346679688, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8651912212371826, + "num_tokens": 588831681.0, + "step": 15438 + }, + { + "epoch": 1.9639994911588856, + "ewc_loss": 0.0454118587076664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2705929950461723e-05, + "grad_norm": 29.5739688873291, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8786903023719788, + "num_tokens": 588875408.0, + "step": 15439 + }, + { + "epoch": 1.9641267014374761, + "ewc_loss": 0.04521966725587845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2609832740272395e-05, + "grad_norm": 29.352294921875, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8725788593292236, + "num_tokens": 588913583.0, + "step": 15440 + }, + { + "epoch": 1.9642539117160667, + "ewc_loss": 0.04527553915977478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2637768779532053e-05, + "grad_norm": 29.561357498168945, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8787176012992859, + "num_tokens": 588949833.0, + "step": 15441 + }, + { + "epoch": 1.9643811219946572, + "ewc_loss": 0.04532766342163086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2663831259706058e-05, + "grad_norm": 29.45581817626953, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8740496635437012, + "num_tokens": 588985678.0, + "step": 15442 + }, + { + "epoch": 1.9645083322732477, + "ewc_loss": 0.04522859677672386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.26142983592581e-05, + "grad_norm": 29.537261962890625, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8679576516151428, + "num_tokens": 589018276.0, + "step": 15443 + }, + { + "epoch": 1.964635542551838, + "ewc_loss": 0.0452948696911335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2647434889222495e-05, + "grad_norm": 29.52647590637207, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8676174879074097, + "num_tokens": 589061900.0, + "step": 15444 + }, + { + "epoch": 1.9647627528304286, + "ewc_loss": 0.0453009232878685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2650461687589996e-05, + "grad_norm": 29.42058753967285, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8708792924880981, + "num_tokens": 589102740.0, + "step": 15445 + }, + { + "epoch": 1.964889963109019, + "ewc_loss": 0.04521366208791733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2606831407756545e-05, + "grad_norm": 29.46107292175293, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8779731392860413, + "num_tokens": 589135276.0, + "step": 15446 + }, + { + "epoch": 1.9650171733876096, + "ewc_loss": 0.04529919847846031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2649599486612715e-05, + "grad_norm": 29.43364143371582, + "learning_rate": 1e-06, + "loss": 0.4294, + 
"mean_token_accuracy": 0.868486225605011, + "num_tokens": 589173390.0, + "step": 15447 + }, + { + "epoch": 1.9651443836662001, + "ewc_loss": 0.0452427864074707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262139241793193e-05, + "grad_norm": 29.50356101989746, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8621492981910706, + "num_tokens": 589214592.0, + "step": 15448 + }, + { + "epoch": 1.9652715939447907, + "ewc_loss": 0.04532887414097786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266443698317744e-05, + "grad_norm": 29.41788101196289, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8690973520278931, + "num_tokens": 589250682.0, + "step": 15449 + }, + { + "epoch": 1.9653988042233812, + "ewc_loss": 0.04531162604689598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2655813154415227e-05, + "grad_norm": 29.549734115600586, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8629977703094482, + "num_tokens": 589288326.0, + "step": 15450 + }, + { + "epoch": 1.9655260145019717, + "ewc_loss": 0.045307498425245285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2653748601442203e-05, + "grad_norm": 29.454885482788086, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8649508357048035, + "num_tokens": 589321632.0, + "step": 15451 + }, + { + "epoch": 1.9656532247805623, + "ewc_loss": 0.04531272500753403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.26563624892151e-05, + "grad_norm": 29.50780487060547, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8596945405006409, + "num_tokens": 589361134.0, + "step": 15452 + }, + { + "epoch": 1.9657804350591528, + "ewc_loss": 0.04537946358323097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2689731849823147e-05, + "grad_norm": 29.489261627197266, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8716980218887329, + "num_tokens": 589402338.0, + "step": 15453 + }, + { + "epoch": 
1.9659076453377433, + "ewc_loss": 0.045324064791202545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266203227918595e-05, + "grad_norm": 29.376956939697266, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.86211097240448, + "num_tokens": 589441080.0, + "step": 15454 + }, + { + "epoch": 1.9660348556163338, + "ewc_loss": 0.045390091836452484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2695045117870905e-05, + "grad_norm": 29.543014526367188, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8657824397087097, + "num_tokens": 589484019.0, + "step": 15455 + }, + { + "epoch": 1.9661620658949244, + "ewc_loss": 0.045444440096616745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.272222081955988e-05, + "grad_norm": 29.427656173706055, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8637726306915283, + "num_tokens": 589525890.0, + "step": 15456 + }, + { + "epoch": 1.966289276173515, + "ewc_loss": 0.04533567279577255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2667836674372666e-05, + "grad_norm": 29.509706497192383, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8739632368087769, + "num_tokens": 589560644.0, + "step": 15457 + }, + { + "epoch": 1.9664164864521054, + "ewc_loss": 0.04538072645664215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2690363039146177e-05, + "grad_norm": 29.481666564941406, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8884240388870239, + "num_tokens": 589600863.0, + "step": 15458 + }, + { + "epoch": 1.966543696730696, + "ewc_loss": 0.045311421155929565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.265571129100863e-05, + "grad_norm": 29.49394416809082, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8674194812774658, + "num_tokens": 589637948.0, + "step": 15459 + }, + { + "epoch": 1.9666709070092865, + "ewc_loss": 0.04539753869175911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2698768589179963e-05, + "grad_norm": 29.44927406311035, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8819390535354614, + "num_tokens": 589678061.0, + "step": 15460 + }, + { + "epoch": 1.966798117287877, + "ewc_loss": 0.04532941058278084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266470437461976e-05, + "grad_norm": 29.467735290527344, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8585803508758545, + "num_tokens": 589718050.0, + "step": 15461 + }, + { + "epoch": 1.9669253275664673, + "ewc_loss": 0.04538393393158913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2691967387800105e-05, + "grad_norm": 29.520057678222656, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8723616600036621, + "num_tokens": 589759206.0, + "step": 15462 + }, + { + "epoch": 1.9670525378450578, + "ewc_loss": 0.04530596733093262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.265298280690331e-05, + "grad_norm": 29.429292678833008, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8573722243309021, + "num_tokens": 589800218.0, + "step": 15463 + }, + { + "epoch": 1.9671797481236484, + "ewc_loss": 0.045326098799705505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2663049094262533e-05, + "grad_norm": 29.712472915649414, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8694450259208679, + "num_tokens": 589843105.0, + "step": 15464 + }, + { + "epoch": 1.967306958402239, + "ewc_loss": 0.04534338042140007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.267169111291878e-05, + "grad_norm": 29.363061904907227, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8595044612884521, + "num_tokens": 589881560.0, + "step": 15465 + }, + { + "epoch": 1.9674341686808294, + "ewc_loss": 0.0452408567070961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.262042835354805e-05, + "grad_norm": 29.61453628540039, + "learning_rate": 1e-06, + "loss": 0.4652, + 
"mean_token_accuracy": 0.8572803139686584, + "num_tokens": 589922755.0, + "step": 15466 + }, + { + "epoch": 1.96756137895942, + "ewc_loss": 0.04540024697780609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2700123736285605e-05, + "grad_norm": 29.445812225341797, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.850723922252655, + "num_tokens": 589964982.0, + "step": 15467 + }, + { + "epoch": 1.9676885892380103, + "ewc_loss": 0.04527297988533974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.263649002998136e-05, + "grad_norm": 29.599205017089844, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8550772070884705, + "num_tokens": 590003371.0, + "step": 15468 + }, + { + "epoch": 1.9678157995166008, + "ewc_loss": 0.04538601636886597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2693007849738933e-05, + "grad_norm": 29.473892211914062, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8656294345855713, + "num_tokens": 590048361.0, + "step": 15469 + }, + { + "epoch": 1.9679430097951913, + "ewc_loss": 0.04527191072702408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2635955247096717e-05, + "grad_norm": 29.423124313354492, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8653020858764648, + "num_tokens": 590083141.0, + "step": 15470 + }, + { + "epoch": 1.9680702200737819, + "ewc_loss": 0.04533028230071068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266514093207661e-05, + "grad_norm": 29.376239776611328, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8623220920562744, + "num_tokens": 590128161.0, + "step": 15471 + }, + { + "epoch": 1.9681974303523724, + "ewc_loss": 0.045260846614837646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2630423700320534e-05, + "grad_norm": 29.50770378112793, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8838901519775391, + "num_tokens": 590166101.0, + "step": 15472 + }, + { + "epoch": 
1.968324640630963, + "ewc_loss": 0.04535327106714249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2676635126117617e-05, + "grad_norm": 29.492834091186523, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8675159215927124, + "num_tokens": 590207643.0, + "step": 15473 + }, + { + "epoch": 1.9684518509095534, + "ewc_loss": 0.045221347361803055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2610673113376833e-05, + "grad_norm": 29.410968780517578, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8614908456802368, + "num_tokens": 590247567.0, + "step": 15474 + }, + { + "epoch": 1.968579061188144, + "ewc_loss": 0.04530193656682968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.265096736664418e-05, + "grad_norm": 29.341896057128906, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8639233112335205, + "num_tokens": 590287940.0, + "step": 15475 + }, + { + "epoch": 1.9687062714667345, + "ewc_loss": 0.045397039502859116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2698519387631677e-05, + "grad_norm": 29.66566276550293, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8625760078430176, + "num_tokens": 590322558.0, + "step": 15476 + }, + { + "epoch": 1.968833481745325, + "ewc_loss": 0.04543852433562279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271926132380031e-05, + "grad_norm": 29.52610206604004, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8672665953636169, + "num_tokens": 590362913.0, + "step": 15477 + }, + { + "epoch": 1.9689606920239155, + "ewc_loss": 0.04516596719622612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.258298445667606e-05, + "grad_norm": 29.4354190826416, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8595170974731445, + "num_tokens": 590405877.0, + "step": 15478 + }, + { + "epoch": 1.969087902302506, + "ewc_loss": 0.04536334425210953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2681671907776035e-05, + "grad_norm": 29.49056625366211, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8596764802932739, + "num_tokens": 590445373.0, + "step": 15479 + }, + { + "epoch": 1.9692151125810966, + "ewc_loss": 0.04526973515748978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2634867491433397e-05, + "grad_norm": 29.576976776123047, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8573310971260071, + "num_tokens": 590483319.0, + "step": 15480 + }, + { + "epoch": 1.9693423228596871, + "ewc_loss": 0.04526188597083092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2630942112300545e-05, + "grad_norm": 29.431961059570312, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8548029661178589, + "num_tokens": 590522433.0, + "step": 15481 + }, + { + "epoch": 1.9694695331382777, + "ewc_loss": 0.045210547745227814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.260527435282711e-05, + "grad_norm": 29.544139862060547, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8724004030227661, + "num_tokens": 590564342.0, + "step": 15482 + }, + { + "epoch": 1.9695967434168682, + "ewc_loss": 0.04525537043809891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2627684302278794e-05, + "grad_norm": 29.539939880371094, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8697711825370789, + "num_tokens": 590594527.0, + "step": 15483 + }, + { + "epoch": 1.9697239536954587, + "ewc_loss": 0.04529939964413643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.264969953102991e-05, + "grad_norm": 29.552688598632812, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8793326616287231, + "num_tokens": 590630573.0, + "step": 15484 + }, + { + "epoch": 1.9698511639740492, + "ewc_loss": 0.04527228698134422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2636142603005283e-05, + "grad_norm": 29.57213020324707, + "learning_rate": 1e-06, + "loss": 0.4299, + 
"mean_token_accuracy": 0.868962824344635, + "num_tokens": 590669333.0, + "step": 15485 + }, + { + "epoch": 1.9699783742526398, + "ewc_loss": 0.04528077319264412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2640386305283755e-05, + "grad_norm": 29.48373031616211, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8828368186950684, + "num_tokens": 590709884.0, + "step": 15486 + }, + { + "epoch": 1.97010558453123, + "ewc_loss": 0.045276906341314316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2638452719547786e-05, + "grad_norm": 29.50731086730957, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8738261461257935, + "num_tokens": 590746109.0, + "step": 15487 + }, + { + "epoch": 1.9702327948098206, + "ewc_loss": 0.045289989560842514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.264499562443234e-05, + "grad_norm": 29.459829330444336, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8525722622871399, + "num_tokens": 590787881.0, + "step": 15488 + }, + { + "epoch": 1.9703600050884111, + "ewc_loss": 0.04529998451471329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.264999238832388e-05, + "grad_norm": 29.50017738342285, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8537135720252991, + "num_tokens": 590826861.0, + "step": 15489 + }, + { + "epoch": 1.9704872153670017, + "ewc_loss": 0.04533541947603226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2667709345114417e-05, + "grad_norm": 29.456907272338867, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8752564191818237, + "num_tokens": 590863104.0, + "step": 15490 + }, + { + "epoch": 1.9706144256455922, + "ewc_loss": 0.04529501125216484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2647505829809234e-05, + "grad_norm": 29.520509719848633, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8745895624160767, + "num_tokens": 590900582.0, + "step": 15491 + }, + { + "epoch": 
1.9707416359241827, + "ewc_loss": 0.0453193336725235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.265966759296134e-05, + "grad_norm": 29.445642471313477, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8747577667236328, + "num_tokens": 590937735.0, + "step": 15492 + }, + { + "epoch": 1.970868846202773, + "ewc_loss": 0.04531402140855789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.265701004944276e-05, + "grad_norm": 29.49053382873535, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.88238525390625, + "num_tokens": 590973089.0, + "step": 15493 + }, + { + "epoch": 1.9709960564813636, + "ewc_loss": 0.0453542098402977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2677104425383732e-05, + "grad_norm": 29.489234924316406, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8545560836791992, + "num_tokens": 591012117.0, + "step": 15494 + }, + { + "epoch": 1.971123266759954, + "ewc_loss": 0.04535355418920517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2676777007291093e-05, + "grad_norm": 29.506011962890625, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8489824533462524, + "num_tokens": 591044978.0, + "step": 15495 + }, + { + "epoch": 1.9712504770385446, + "ewc_loss": 0.04536762833595276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2683814677293412e-05, + "grad_norm": 29.46918296813965, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.864776611328125, + "num_tokens": 591085562.0, + "step": 15496 + }, + { + "epoch": 1.9713776873171351, + "ewc_loss": 0.04539303854107857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2696518499287777e-05, + "grad_norm": 29.620845794677734, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8590567111968994, + "num_tokens": 591120140.0, + "step": 15497 + }, + { + "epoch": 1.9715048975957257, + "ewc_loss": 0.04535889998078346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2679449102724902e-05, + "grad_norm": 29.501163482666016, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8584529161453247, + "num_tokens": 591153952.0, + "step": 15498 + }, + { + "epoch": 1.9716321078743162, + "ewc_loss": 0.045298293232917786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2649146558251232e-05, + "grad_norm": 29.4437255859375, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8842333555221558, + "num_tokens": 591192983.0, + "step": 15499 + }, + { + "epoch": 1.9717593181529067, + "ewc_loss": 0.04541505128145218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2707525204168633e-05, + "grad_norm": 29.522939682006836, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8690539598464966, + "num_tokens": 591226260.0, + "step": 15500 + }, + { + "epoch": 1.9718865284314973, + "ewc_loss": 0.04536455124616623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2682275812258013e-05, + "grad_norm": 29.438629150390625, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8553775548934937, + "num_tokens": 591259361.0, + "step": 15501 + }, + { + "epoch": 1.9720137387100878, + "ewc_loss": 0.04549683630466461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2748417904949747e-05, + "grad_norm": 29.536794662475586, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8681116104125977, + "num_tokens": 591299753.0, + "step": 15502 + }, + { + "epoch": 1.9721409489886783, + "ewc_loss": 0.045491740107536316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274586950079538e-05, + "grad_norm": 29.468395233154297, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8695278167724609, + "num_tokens": 591335008.0, + "step": 15503 + }, + { + "epoch": 1.9722681592672688, + "ewc_loss": 0.045451391488313675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2725695089320652e-05, + "grad_norm": 29.563098907470703, + "learning_rate": 1e-06, + "loss": 0.4182, + 
"mean_token_accuracy": 0.8743857145309448, + "num_tokens": 591373001.0, + "step": 15504 + }, + { + "epoch": 1.9723953695458594, + "ewc_loss": 0.04548904299736023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2744521629647352e-05, + "grad_norm": 29.539854049682617, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8701847791671753, + "num_tokens": 591408637.0, + "step": 15505 + }, + { + "epoch": 1.97252257982445, + "ewc_loss": 0.045447807759046555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.272390338475816e-05, + "grad_norm": 29.54669189453125, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8648403286933899, + "num_tokens": 591444908.0, + "step": 15506 + }, + { + "epoch": 1.9726497901030404, + "ewc_loss": 0.045433372259140015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271668563480489e-05, + "grad_norm": 29.49159049987793, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8702470660209656, + "num_tokens": 591484641.0, + "step": 15507 + }, + { + "epoch": 1.972777000381631, + "ewc_loss": 0.045506760478019714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.275338010804262e-05, + "grad_norm": 29.522050857543945, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8658307790756226, + "num_tokens": 591521859.0, + "step": 15508 + }, + { + "epoch": 1.9729042106602215, + "ewc_loss": 0.045499496161937714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274974758620374e-05, + "grad_norm": 29.487154006958008, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8669378757476807, + "num_tokens": 591559124.0, + "step": 15509 + }, + { + "epoch": 1.973031420938812, + "ewc_loss": 0.04543197154998779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2715985323884524e-05, + "grad_norm": 29.4282283782959, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8659417629241943, + "num_tokens": 591597170.0, + "step": 15510 + }, + { + "epoch": 
1.9731586312174023, + "ewc_loss": 0.04538433253765106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.269216565764509e-05, + "grad_norm": 29.388891220092773, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8736486434936523, + "num_tokens": 591634784.0, + "step": 15511 + }, + { + "epoch": 1.9732858414959928, + "ewc_loss": 0.0454782210290432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2739110136171803e-05, + "grad_norm": 29.506715774536133, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8732436895370483, + "num_tokens": 591671904.0, + "step": 15512 + }, + { + "epoch": 1.9734130517745834, + "ewc_loss": 0.04546699672937393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2733498553861864e-05, + "grad_norm": 29.42607307434082, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8701845407485962, + "num_tokens": 591712990.0, + "step": 15513 + }, + { + "epoch": 1.973540262053174, + "ewc_loss": 0.045412540435791016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2706270101480186e-05, + "grad_norm": 29.383113861083984, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8649821281433105, + "num_tokens": 591748579.0, + "step": 15514 + }, + { + "epoch": 1.9736674723317644, + "ewc_loss": 0.04551399499177933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2756998077966273e-05, + "grad_norm": 29.468616485595703, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8620824813842773, + "num_tokens": 591785865.0, + "step": 15515 + }, + { + "epoch": 1.973794682610355, + "ewc_loss": 0.045499399304389954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2749700292479247e-05, + "grad_norm": 29.460798263549805, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.877822995185852, + "num_tokens": 591820087.0, + "step": 15516 + }, + { + "epoch": 1.9739218928889453, + "ewc_loss": 0.04552706331014633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.276353188790381e-05, + "grad_norm": 29.50345802307129, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8668966889381409, + "num_tokens": 591859713.0, + "step": 15517 + }, + { + "epoch": 1.9740491031675358, + "ewc_loss": 0.045500338077545166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.275016959174536e-05, + "grad_norm": 29.490406036376953, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8753257393836975, + "num_tokens": 591893142.0, + "step": 15518 + }, + { + "epoch": 1.9741763134461263, + "ewc_loss": 0.04540173336863518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270086588396225e-05, + "grad_norm": 29.441408157348633, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.862542986869812, + "num_tokens": 591931529.0, + "step": 15519 + }, + { + "epoch": 1.9743035237247168, + "ewc_loss": 0.04555511102080345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2777556296205148e-05, + "grad_norm": 29.483766555786133, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8724797368049622, + "num_tokens": 591976248.0, + "step": 15520 + }, + { + "epoch": 1.9744307340033074, + "ewc_loss": 0.045452240854501724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2726120732841082e-05, + "grad_norm": 29.460128784179688, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8656895160675049, + "num_tokens": 592018008.0, + "step": 15521 + }, + { + "epoch": 1.974557944281898, + "ewc_loss": 0.04548840597271919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274420330650173e-05, + "grad_norm": 29.387741088867188, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8544749617576599, + "num_tokens": 592062934.0, + "step": 15522 + }, + { + "epoch": 1.9746851545604884, + "ewc_loss": 0.04547147452831268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.273573772981763e-05, + "grad_norm": 29.524181365966797, + "learning_rate": 1e-06, + "loss": 0.4481, + 
"mean_token_accuracy": 0.8664652109146118, + "num_tokens": 592103129.0, + "step": 15523 + }, + { + "epoch": 1.974812364839079, + "ewc_loss": 0.04552827775478363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2764139430364594e-05, + "grad_norm": 29.410118103027344, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8628965616226196, + "num_tokens": 592142116.0, + "step": 15524 + }, + { + "epoch": 1.9749395751176695, + "ewc_loss": 0.04546686261892319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2733431251253933e-05, + "grad_norm": 29.50516700744629, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8683516979217529, + "num_tokens": 592182507.0, + "step": 15525 + }, + { + "epoch": 1.97506678539626, + "ewc_loss": 0.04551197215914726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.27559867198579e-05, + "grad_norm": 29.42323112487793, + "learning_rate": 1e-06, + "loss": 0.4949, + "mean_token_accuracy": 0.8458797335624695, + "num_tokens": 592225593.0, + "step": 15526 + }, + { + "epoch": 1.9751939956748505, + "ewc_loss": 0.04555518180131912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2777590857003815e-05, + "grad_norm": 29.622997283935547, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8922752737998962, + "num_tokens": 592270676.0, + "step": 15527 + }, + { + "epoch": 1.975321205953441, + "ewc_loss": 0.045431945472955704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.27159725909587e-05, + "grad_norm": 29.360450744628906, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8870760202407837, + "num_tokens": 592305537.0, + "step": 15528 + }, + { + "epoch": 1.9754484162320316, + "ewc_loss": 0.04541785642504692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2708927644998766e-05, + "grad_norm": 29.503273010253906, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.874985933303833, + "num_tokens": 592338289.0, + "step": 15529 + }, + { + "epoch": 
1.9755756265106221, + "ewc_loss": 0.0454162061214447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270810364279896e-05, + "grad_norm": 29.46340560913086, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8702248334884644, + "num_tokens": 592379189.0, + "step": 15530 + }, + { + "epoch": 1.9757028367892127, + "ewc_loss": 0.04542124643921852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271062294312287e-05, + "grad_norm": 29.439956665039062, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8819912672042847, + "num_tokens": 592421875.0, + "step": 15531 + }, + { + "epoch": 1.9758300470678032, + "ewc_loss": 0.045407794415950775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2703898139297962e-05, + "grad_norm": 29.53805923461914, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8505123853683472, + "num_tokens": 592457438.0, + "step": 15532 + }, + { + "epoch": 1.9759572573463937, + "ewc_loss": 0.0454132966697216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2706648451276124e-05, + "grad_norm": 29.416847229003906, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8787745237350464, + "num_tokens": 592493230.0, + "step": 15533 + }, + { + "epoch": 1.9760844676249842, + "ewc_loss": 0.0453362837433815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.266814226459246e-05, + "grad_norm": 29.601085662841797, + "learning_rate": 1e-06, + "loss": 0.4839, + "mean_token_accuracy": 0.8516241312026978, + "num_tokens": 592533656.0, + "step": 15534 + }, + { + "epoch": 1.9762116779035748, + "ewc_loss": 0.04549095779657364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2745478418073617e-05, + "grad_norm": 29.42305564880371, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8821800947189331, + "num_tokens": 592571622.0, + "step": 15535 + }, + { + "epoch": 1.976338888182165, + "ewc_loss": 0.04533669352531433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.266834599140566e-05, + "grad_norm": 29.405454635620117, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8634617328643799, + "num_tokens": 592613260.0, + "step": 15536 + }, + { + "epoch": 1.9764660984607556, + "ewc_loss": 0.04550112411379814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2750562493456528e-05, + "grad_norm": 29.5726318359375, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8654085993766785, + "num_tokens": 592648872.0, + "step": 15537 + }, + { + "epoch": 1.9765933087393461, + "ewc_loss": 0.04537579417228699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.268789648951497e-05, + "grad_norm": 29.528711318969727, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.868749737739563, + "num_tokens": 592682336.0, + "step": 15538 + }, + { + "epoch": 1.9767205190179367, + "ewc_loss": 0.04533208906650543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2666044969810173e-05, + "grad_norm": 29.424806594848633, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8822318315505981, + "num_tokens": 592716412.0, + "step": 15539 + }, + { + "epoch": 1.9768477292965272, + "ewc_loss": 0.045424684882164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2712341888109222e-05, + "grad_norm": 29.52170753479004, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8708477020263672, + "num_tokens": 592753936.0, + "step": 15540 + }, + { + "epoch": 1.9769749395751177, + "ewc_loss": 0.04539601504802704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.269800825160928e-05, + "grad_norm": 29.532472610473633, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8692002296447754, + "num_tokens": 592792731.0, + "step": 15541 + }, + { + "epoch": 1.977102149853708, + "ewc_loss": 0.045443639159202576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2721818822901696e-05, + "grad_norm": 29.51283073425293, + "learning_rate": 1e-06, + "loss": 0.4077, + 
"mean_token_accuracy": 0.8714269995689392, + "num_tokens": 592832035.0, + "step": 15542 + }, + { + "epoch": 1.9772293601322986, + "ewc_loss": 0.04535549134016037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2677746528643183e-05, + "grad_norm": 29.457250595092773, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8862583637237549, + "num_tokens": 592869693.0, + "step": 15543 + }, + { + "epoch": 1.977356570410889, + "ewc_loss": 0.04546242579817772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2731212084181607e-05, + "grad_norm": 29.426847457885742, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8718371987342834, + "num_tokens": 592909648.0, + "step": 15544 + }, + { + "epoch": 1.9774837806894796, + "ewc_loss": 0.04544922336935997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.272461097163614e-05, + "grad_norm": 29.551025390625, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.871518611907959, + "num_tokens": 592942539.0, + "step": 15545 + }, + { + "epoch": 1.9776109909680701, + "ewc_loss": 0.04544100537896156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.272050187457353e-05, + "grad_norm": 29.416261672973633, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8672506213188171, + "num_tokens": 592983680.0, + "step": 15546 + }, + { + "epoch": 1.9777382012466607, + "ewc_loss": 0.045482829213142395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2741414795746095e-05, + "grad_norm": 29.545726776123047, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8571906685829163, + "num_tokens": 593023161.0, + "step": 15547 + }, + { + "epoch": 1.9778654115252512, + "ewc_loss": 0.04550112783908844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.275056431244593e-05, + "grad_norm": 29.471527099609375, + "learning_rate": 1e-06, + "loss": 0.4821, + "mean_token_accuracy": 0.8498385548591614, + "num_tokens": 593066612.0, + "step": 15548 + }, + { + "epoch": 
1.9779926218038417, + "ewc_loss": 0.04542592540383339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271296216349583e-05, + "grad_norm": 29.492395401000977, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8561561107635498, + "num_tokens": 593105224.0, + "step": 15549 + }, + { + "epoch": 1.9781198320824323, + "ewc_loss": 0.04549095779657364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2745478418073617e-05, + "grad_norm": 29.59981346130371, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8695297241210938, + "num_tokens": 593143683.0, + "step": 15550 + }, + { + "epoch": 1.9782470423610228, + "ewc_loss": 0.04546651616692543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2733258447260596e-05, + "grad_norm": 29.60116958618164, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8535361289978027, + "num_tokens": 593179827.0, + "step": 15551 + }, + { + "epoch": 1.9783742526396133, + "ewc_loss": 0.04543699696660042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2718499167240225e-05, + "grad_norm": 29.514535903930664, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8684619069099426, + "num_tokens": 593220936.0, + "step": 15552 + }, + { + "epoch": 1.9785014629182038, + "ewc_loss": 0.04539887607097626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2699437977280468e-05, + "grad_norm": 29.595659255981445, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8637037873268127, + "num_tokens": 593258503.0, + "step": 15553 + }, + { + "epoch": 1.9786286731967944, + "ewc_loss": 0.04542117565870285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2710588382324204e-05, + "grad_norm": 29.52631378173828, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8487172722816467, + "num_tokens": 593293738.0, + "step": 15554 + }, + { + "epoch": 1.978755883475385, + "ewc_loss": 0.04537862166762352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2689311663270928e-05, + "grad_norm": 29.430036544799805, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8584340810775757, + "num_tokens": 593338037.0, + "step": 15555 + }, + { + "epoch": 1.9788830937539754, + "ewc_loss": 0.04549650847911835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2748254195903428e-05, + "grad_norm": 29.624685287475586, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8528156280517578, + "num_tokens": 593383509.0, + "step": 15556 + }, + { + "epoch": 1.979010304032566, + "ewc_loss": 0.045437853783369064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2718926629750058e-05, + "grad_norm": 29.454837799072266, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8719633221626282, + "num_tokens": 593417734.0, + "step": 15557 + }, + { + "epoch": 1.9791375143111565, + "ewc_loss": 0.04541525989770889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2707630705554038e-05, + "grad_norm": 29.636014938354492, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8801486492156982, + "num_tokens": 593454502.0, + "step": 15558 + }, + { + "epoch": 1.979264724589747, + "ewc_loss": 0.04547186195850372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2735930542694405e-05, + "grad_norm": 29.581518173217773, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8634586930274963, + "num_tokens": 593498864.0, + "step": 15559 + }, + { + "epoch": 1.9793919348683373, + "ewc_loss": 0.0454002246260643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2700112822349183e-05, + "grad_norm": 29.52391242980957, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8642889857292175, + "num_tokens": 593540125.0, + "step": 15560 + }, + { + "epoch": 1.9795191451469278, + "ewc_loss": 0.04541889205574989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2709446056978777e-05, + "grad_norm": 29.542339324951172, + "learning_rate": 1e-06, + "loss": 0.3889, + 
"mean_token_accuracy": 0.8781744241714478, + "num_tokens": 593566937.0, + "step": 15561 + }, + { + "epoch": 1.9796463554255184, + "ewc_loss": 0.04542514309287071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271257108077407e-05, + "grad_norm": 29.60895347595215, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8632531762123108, + "num_tokens": 593600109.0, + "step": 15562 + }, + { + "epoch": 1.979773565704109, + "ewc_loss": 0.04548400640487671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274200232932344e-05, + "grad_norm": 29.553466796875, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8668429851531982, + "num_tokens": 593638982.0, + "step": 15563 + }, + { + "epoch": 1.9799007759826994, + "ewc_loss": 0.04547688737511635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2738444386050105e-05, + "grad_norm": 29.59029197692871, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8580124378204346, + "num_tokens": 593677270.0, + "step": 15564 + }, + { + "epoch": 1.98002798626129, + "ewc_loss": 0.0454222671687603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2711134079145268e-05, + "grad_norm": 29.6269588470459, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8759362697601318, + "num_tokens": 593714056.0, + "step": 15565 + }, + { + "epoch": 1.9801551965398803, + "ewc_loss": 0.045398663729429245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2699332475895062e-05, + "grad_norm": 29.5861873626709, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8572814464569092, + "num_tokens": 593753516.0, + "step": 15566 + }, + { + "epoch": 1.9802824068184708, + "ewc_loss": 0.04538847133517265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2694235667586327e-05, + "grad_norm": 29.53006935119629, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8696824312210083, + "num_tokens": 593789276.0, + "step": 15567 + }, + { + "epoch": 
1.9804096170970613, + "ewc_loss": 0.04540007933974266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270004006277304e-05, + "grad_norm": 29.540237426757812, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8705644607543945, + "num_tokens": 593828112.0, + "step": 15568 + }, + { + "epoch": 1.9805368273756518, + "ewc_loss": 0.04544764757156372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2723823349224404e-05, + "grad_norm": 29.577529907226562, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.856059730052948, + "num_tokens": 593866370.0, + "step": 15569 + }, + { + "epoch": 1.9806640376542424, + "ewc_loss": 0.04543018341064453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2715092200087383e-05, + "grad_norm": 29.457807540893555, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8723289370536804, + "num_tokens": 593907863.0, + "step": 15570 + }, + { + "epoch": 1.980791247932833, + "ewc_loss": 0.045430950820446014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271547600685153e-05, + "grad_norm": 29.589990615844727, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8791117668151855, + "num_tokens": 593945254.0, + "step": 15571 + }, + { + "epoch": 1.9809184582114234, + "ewc_loss": 0.04540855064988136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2704274670104496e-05, + "grad_norm": 29.485416412353516, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8720116019248962, + "num_tokens": 593987105.0, + "step": 15572 + }, + { + "epoch": 1.981045668490014, + "ewc_loss": 0.04535704478621483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2678523237118497e-05, + "grad_norm": 29.632009506225586, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.878859281539917, + "num_tokens": 594024091.0, + "step": 15573 + }, + { + "epoch": 1.9811728787686045, + "ewc_loss": 0.045457568019628525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2728783733327873e-05, + "grad_norm": 29.51741600036621, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8787501454353333, + "num_tokens": 594056018.0, + "step": 15574 + }, + { + "epoch": 1.981300089047195, + "ewc_loss": 0.045403026044368744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2701513444189914e-05, + "grad_norm": 29.579893112182617, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8710027933120728, + "num_tokens": 594102297.0, + "step": 15575 + }, + { + "epoch": 1.9814272993257855, + "ewc_loss": 0.04547450691461563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2737252947990783e-05, + "grad_norm": 29.59011459350586, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8700643181800842, + "num_tokens": 594140278.0, + "step": 15576 + }, + { + "epoch": 1.981554509604376, + "ewc_loss": 0.04536697641015053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2683489078190178e-05, + "grad_norm": 29.591106414794922, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8717211484909058, + "num_tokens": 594175025.0, + "step": 15577 + }, + { + "epoch": 1.9816817198829666, + "ewc_loss": 0.04539831355214119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2699156033922918e-05, + "grad_norm": 29.595123291015625, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8671845197677612, + "num_tokens": 594217906.0, + "step": 15578 + }, + { + "epoch": 1.9818089301615571, + "ewc_loss": 0.04544302448630333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2721511413692497e-05, + "grad_norm": 29.595701217651367, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8502446413040161, + "num_tokens": 594264331.0, + "step": 15579 + }, + { + "epoch": 1.9819361404401477, + "ewc_loss": 0.04542003199458122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2710015400662087e-05, + "grad_norm": 29.589155197143555, + "learning_rate": 1e-06, + "loss": 0.377, + 
"mean_token_accuracy": 0.8834424614906311, + "num_tokens": 594300220.0, + "step": 15580 + }, + { + "epoch": 1.9820633507187382, + "ewc_loss": 0.04526367783546448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2631838874076493e-05, + "grad_norm": 29.529190063476562, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8738703727722168, + "num_tokens": 594337849.0, + "step": 15581 + }, + { + "epoch": 1.9821905609973287, + "ewc_loss": 0.045358575880527496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2679287212667987e-05, + "grad_norm": 29.567859649658203, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.861783504486084, + "num_tokens": 594379295.0, + "step": 15582 + }, + { + "epoch": 1.9823177712759192, + "ewc_loss": 0.045415908098220825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270795448566787e-05, + "grad_norm": 29.651315689086914, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8649537563323975, + "num_tokens": 594408350.0, + "step": 15583 + }, + { + "epoch": 1.9824449815545098, + "ewc_loss": 0.045402828603982925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270141339977272e-05, + "grad_norm": 29.373947143554688, + "learning_rate": 1e-06, + "loss": 0.4654, + "mean_token_accuracy": 0.8577104806900024, + "num_tokens": 594455110.0, + "step": 15584 + }, + { + "epoch": 1.9825721918331, + "ewc_loss": 0.04534269496798515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.267134732392151e-05, + "grad_norm": 29.52911376953125, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8721996545791626, + "num_tokens": 594499231.0, + "step": 15585 + }, + { + "epoch": 1.9826994021116906, + "ewc_loss": 0.04554547742009163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277273961226456e-05, + "grad_norm": 29.49427032470703, + "learning_rate": 1e-06, + "loss": 0.4968, + "mean_token_accuracy": 0.8503286242485046, + "num_tokens": 594541478.0, + "step": 15586 + }, + { + "epoch": 
1.9828266123902811, + "ewc_loss": 0.04540303349494934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270151708216872e-05, + "grad_norm": 29.531658172607422, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8717164993286133, + "num_tokens": 594579075.0, + "step": 15587 + }, + { + "epoch": 1.9829538226688717, + "ewc_loss": 0.04541414976119995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2707074094796553e-05, + "grad_norm": 29.481142044067383, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8842559456825256, + "num_tokens": 594616189.0, + "step": 15588 + }, + { + "epoch": 1.9830810329474622, + "ewc_loss": 0.04541047289967537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2705236915498972e-05, + "grad_norm": 29.602767944335938, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8750747442245483, + "num_tokens": 594656633.0, + "step": 15589 + }, + { + "epoch": 1.9832082432260527, + "ewc_loss": 0.04551079496741295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2755397367291152e-05, + "grad_norm": 29.61093521118164, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8489586114883423, + "num_tokens": 594696833.0, + "step": 15590 + }, + { + "epoch": 1.983335453504643, + "ewc_loss": 0.04543213173747063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271606535941828e-05, + "grad_norm": 29.587507247924805, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8557947278022766, + "num_tokens": 594735138.0, + "step": 15591 + }, + { + "epoch": 1.9834626637832335, + "ewc_loss": 0.04537922888994217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2689613615511917e-05, + "grad_norm": 29.61582374572754, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8622041940689087, + "num_tokens": 594769587.0, + "step": 15592 + }, + { + "epoch": 1.983589874061824, + "ewc_loss": 0.04532932490110397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2664662537863478e-05, + "grad_norm": 29.474069595336914, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8735532760620117, + "num_tokens": 594803635.0, + "step": 15593 + }, + { + "epoch": 1.9837170843404146, + "ewc_loss": 0.04538825899362564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.269413016620092e-05, + "grad_norm": 29.593828201293945, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8836437463760376, + "num_tokens": 594839872.0, + "step": 15594 + }, + { + "epoch": 1.9838442946190051, + "ewc_loss": 0.04544077441096306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2720387278241105e-05, + "grad_norm": 29.540130615234375, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8568768501281738, + "num_tokens": 594879208.0, + "step": 15595 + }, + { + "epoch": 1.9839715048975957, + "ewc_loss": 0.04543691501021385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2718457330483943e-05, + "grad_norm": 29.491849899291992, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8747307658195496, + "num_tokens": 594910377.0, + "step": 15596 + }, + { + "epoch": 1.9840987151761862, + "ewc_loss": 0.04539848864078522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2699245164403692e-05, + "grad_norm": 29.454633712768555, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8701629042625427, + "num_tokens": 594956574.0, + "step": 15597 + }, + { + "epoch": 1.9842259254547767, + "ewc_loss": 0.04540364444255829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2701822672388516e-05, + "grad_norm": 29.441537857055664, + "learning_rate": 1e-06, + "loss": 0.4778, + "mean_token_accuracy": 0.852431058883667, + "num_tokens": 594998036.0, + "step": 15598 + }, + { + "epoch": 1.9843531357333672, + "ewc_loss": 0.045493774116039276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274688631587196e-05, + "grad_norm": 29.571104049682617, + "learning_rate": 1e-06, + "loss": 0.4391, + 
"mean_token_accuracy": 0.8630651235580444, + "num_tokens": 595031404.0, + "step": 15599 + }, + { + "epoch": 1.9844803460119578, + "ewc_loss": 0.04550303518772125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.275151746289339e-05, + "grad_norm": 29.550823211669922, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8769831657409668, + "num_tokens": 595069800.0, + "step": 15600 + }, + { + "epoch": 1.9846075562905483, + "ewc_loss": 0.045418281108140945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270914046675898e-05, + "grad_norm": 29.47040367126465, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8740407228469849, + "num_tokens": 595110987.0, + "step": 15601 + }, + { + "epoch": 1.9847347665691388, + "ewc_loss": 0.045532673597335815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2766336769564077e-05, + "grad_norm": 29.515928268432617, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8711899518966675, + "num_tokens": 595149192.0, + "step": 15602 + }, + { + "epoch": 1.9848619768477294, + "ewc_loss": 0.0454985648393631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274928192491643e-05, + "grad_norm": 29.493764877319336, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8665902614593506, + "num_tokens": 595182993.0, + "step": 15603 + }, + { + "epoch": 1.9849891871263199, + "ewc_loss": 0.04551167041063309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2755835743737407e-05, + "grad_norm": 29.449466705322266, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8653486371040344, + "num_tokens": 595216177.0, + "step": 15604 + }, + { + "epoch": 1.9851163974049104, + "ewc_loss": 0.04551954194903374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2759770217817277e-05, + "grad_norm": 29.537874221801758, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8684329986572266, + "num_tokens": 595253379.0, + "step": 15605 + }, + { + "epoch": 
1.985243607683501, + "ewc_loss": 0.04552618786692619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2763093511457555e-05, + "grad_norm": 29.393280029296875, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8726446628570557, + "num_tokens": 595292878.0, + "step": 15606 + }, + { + "epoch": 1.9853708179620915, + "ewc_loss": 0.045531246811151505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2765623725717887e-05, + "grad_norm": 29.5180606842041, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8697882890701294, + "num_tokens": 595329082.0, + "step": 15607 + }, + { + "epoch": 1.985498028240682, + "ewc_loss": 0.045582693070173264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279134605487343e-05, + "grad_norm": 29.557113647460938, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8608987331390381, + "num_tokens": 595367246.0, + "step": 15608 + }, + { + "epoch": 1.9856252385192723, + "ewc_loss": 0.04547424614429474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.273712379974313e-05, + "grad_norm": 29.46591567993164, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8754263520240784, + "num_tokens": 595404450.0, + "step": 15609 + }, + { + "epoch": 1.9857524487978628, + "ewc_loss": 0.04557126387953758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2785632609156892e-05, + "grad_norm": 29.58791732788086, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8809568881988525, + "num_tokens": 595439718.0, + "step": 15610 + }, + { + "epoch": 1.9858796590764534, + "ewc_loss": 0.0455341599881649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2767080736230128e-05, + "grad_norm": 29.48219871520996, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8693920373916626, + "num_tokens": 595473617.0, + "step": 15611 + }, + { + "epoch": 1.986006869355044, + "ewc_loss": 0.045569226145744324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2784612156101502e-05, + "grad_norm": 29.591041564941406, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8658198118209839, + "num_tokens": 595516432.0, + "step": 15612 + }, + { + "epoch": 1.9861340796336344, + "ewc_loss": 0.0455779954791069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.278899773955345e-05, + "grad_norm": 29.520492553710938, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8697073459625244, + "num_tokens": 595556278.0, + "step": 15613 + }, + { + "epoch": 1.986261289912225, + "ewc_loss": 0.04547468200325966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2737340259482153e-05, + "grad_norm": 29.543928146362305, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8745074272155762, + "num_tokens": 595598713.0, + "step": 15614 + }, + { + "epoch": 1.9863885001908153, + "ewc_loss": 0.045563459396362305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2781729057896882e-05, + "grad_norm": 29.59173583984375, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8712875843048096, + "num_tokens": 595630597.0, + "step": 15615 + }, + { + "epoch": 1.9865157104694058, + "ewc_loss": 0.04551323503255844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2756617909180932e-05, + "grad_norm": 29.55022430419922, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8868330121040344, + "num_tokens": 595664163.0, + "step": 15616 + }, + { + "epoch": 1.9866429207479963, + "ewc_loss": 0.045447275042533875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2723637812305242e-05, + "grad_norm": 29.379671096801758, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8783579468727112, + "num_tokens": 595698809.0, + "step": 15617 + }, + { + "epoch": 1.9867701310265868, + "ewc_loss": 0.04559716954827309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2798585632699542e-05, + "grad_norm": 29.636892318725586, + "learning_rate": 1e-06, + "loss": 0.4488, + 
"mean_token_accuracy": 0.8632240891456604, + "num_tokens": 595741166.0, + "step": 15618 + }, + { + "epoch": 1.9868973413051774, + "ewc_loss": 0.045488473027944565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2744236048310995e-05, + "grad_norm": 29.52703285217285, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8634623885154724, + "num_tokens": 595787079.0, + "step": 15619 + }, + { + "epoch": 1.987024551583768, + "ewc_loss": 0.04548868536949158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2744343368685804e-05, + "grad_norm": 29.658414840698242, + "learning_rate": 1e-06, + "loss": 0.4995, + "mean_token_accuracy": 0.8437414169311523, + "num_tokens": 595829085.0, + "step": 15620 + }, + { + "epoch": 1.9871517618623584, + "ewc_loss": 0.045498576015233994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2749287381884642e-05, + "grad_norm": 29.63686180114746, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.866687536239624, + "num_tokens": 595870646.0, + "step": 15621 + }, + { + "epoch": 1.987278972140949, + "ewc_loss": 0.04545963928103447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.272981873829849e-05, + "grad_norm": 29.607948303222656, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.858359694480896, + "num_tokens": 595918473.0, + "step": 15622 + }, + { + "epoch": 1.9874061824195395, + "ewc_loss": 0.04542743042111397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.27137152251089e-05, + "grad_norm": 29.581462860107422, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8571527600288391, + "num_tokens": 595954097.0, + "step": 15623 + }, + { + "epoch": 1.98753339269813, + "ewc_loss": 0.04553507640957832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.276753912155982e-05, + "grad_norm": 29.713768005371094, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8498848080635071, + "num_tokens": 595990191.0, + "step": 15624 + }, + { + "epoch": 
1.9876606029767205, + "ewc_loss": 0.045479822903871536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2739912310498767e-05, + "grad_norm": 29.549522399902344, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8612092733383179, + "num_tokens": 596027109.0, + "step": 15625 + }, + { + "epoch": 1.987787813255311, + "ewc_loss": 0.0454082153737545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270410732307937e-05, + "grad_norm": 29.687498092651367, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8765157461166382, + "num_tokens": 596067930.0, + "step": 15626 + }, + { + "epoch": 1.9879150235339016, + "ewc_loss": 0.04547680914402008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2738404368283227e-05, + "grad_norm": 29.605466842651367, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8787055015563965, + "num_tokens": 596105809.0, + "step": 15627 + }, + { + "epoch": 1.9880422338124921, + "ewc_loss": 0.04538700357079506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2693502614856698e-05, + "grad_norm": 29.542123794555664, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8725264668464661, + "num_tokens": 596144220.0, + "step": 15628 + }, + { + "epoch": 1.9881694440910826, + "ewc_loss": 0.04546146094799042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2730730051989667e-05, + "grad_norm": 29.594501495361328, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8460927605628967, + "num_tokens": 596183825.0, + "step": 15629 + }, + { + "epoch": 1.9882966543696732, + "ewc_loss": 0.04545067623257637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2725338567397557e-05, + "grad_norm": 29.598363876342773, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8773901462554932, + "num_tokens": 596232333.0, + "step": 15630 + }, + { + "epoch": 1.9884238646482637, + "ewc_loss": 0.04541167616844177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2705837182002142e-05, + "grad_norm": 29.58392333984375, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8767719864845276, + "num_tokens": 596272138.0, + "step": 15631 + }, + { + "epoch": 1.9885510749268542, + "ewc_loss": 0.045420270413160324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271013545396272e-05, + "grad_norm": 29.582698822021484, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8778183460235596, + "num_tokens": 596310254.0, + "step": 15632 + }, + { + "epoch": 1.9886782852054448, + "ewc_loss": 0.04543129727244377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.271564881084487e-05, + "grad_norm": 29.500085830688477, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8648703098297119, + "num_tokens": 596350396.0, + "step": 15633 + }, + { + "epoch": 1.988805495484035, + "ewc_loss": 0.045375555753707886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.268777825520374e-05, + "grad_norm": 29.54083251953125, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8712406158447266, + "num_tokens": 596388809.0, + "step": 15634 + }, + { + "epoch": 1.9889327057626256, + "ewc_loss": 0.045525044202804565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.276252234878484e-05, + "grad_norm": 29.625856399536133, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.87635338306427, + "num_tokens": 596427366.0, + "step": 15635 + }, + { + "epoch": 1.9890599160412161, + "ewc_loss": 0.0454026460647583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270132245030254e-05, + "grad_norm": 29.593427658081055, + "learning_rate": 1e-06, + "loss": 0.4637, + "mean_token_accuracy": 0.8594671487808228, + "num_tokens": 596469999.0, + "step": 15636 + }, + { + "epoch": 1.9891871263198067, + "ewc_loss": 0.0453973151743412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2698657630826347e-05, + "grad_norm": 29.569141387939453, + "learning_rate": 1e-06, + "loss": 0.4719, + 
"mean_token_accuracy": 0.85300213098526, + "num_tokens": 596506773.0, + "step": 15637 + }, + { + "epoch": 1.9893143365983972, + "ewc_loss": 0.045415256172418594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2707628886564635e-05, + "grad_norm": 29.545896530151367, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8693258166313171, + "num_tokens": 596547440.0, + "step": 15638 + }, + { + "epoch": 1.9894415468769877, + "ewc_loss": 0.04534502699971199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2672513296129182e-05, + "grad_norm": 29.567829132080078, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8741388916969299, + "num_tokens": 596580212.0, + "step": 15639 + }, + { + "epoch": 1.989568757155578, + "ewc_loss": 0.04539059102535248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2695296138408594e-05, + "grad_norm": 29.585350036621094, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8892762064933777, + "num_tokens": 596611790.0, + "step": 15640 + }, + { + "epoch": 1.9896959674341685, + "ewc_loss": 0.04549591988325119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2747959519620053e-05, + "grad_norm": 29.539283752441406, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8838282823562622, + "num_tokens": 596646984.0, + "step": 15641 + }, + { + "epoch": 1.989823177712759, + "ewc_loss": 0.045410726219415665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2705362425767817e-05, + "grad_norm": 29.650415420532227, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8648942708969116, + "num_tokens": 596688362.0, + "step": 15642 + }, + { + "epoch": 1.9899503879913496, + "ewc_loss": 0.04546712338924408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2733562218490988e-05, + "grad_norm": 29.5939884185791, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8704721927642822, + "num_tokens": 596725298.0, + "step": 15643 + }, + { + "epoch": 
1.9900775982699401, + "ewc_loss": 0.04536965489387512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2684827854391187e-05, + "grad_norm": 29.48841667175293, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8749068975448608, + "num_tokens": 596757400.0, + "step": 15644 + }, + { + "epoch": 1.9902048085485307, + "ewc_loss": 0.045476704835891724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2738351617590524e-05, + "grad_norm": 29.564125061035156, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8807638883590698, + "num_tokens": 596790533.0, + "step": 15645 + }, + { + "epoch": 1.9903320188271212, + "ewc_loss": 0.04545406997203827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2727035684511065e-05, + "grad_norm": 29.545385360717773, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8625599145889282, + "num_tokens": 596834467.0, + "step": 15646 + }, + { + "epoch": 1.9904592291057117, + "ewc_loss": 0.04546654224395752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.273327118018642e-05, + "grad_norm": 29.465656280517578, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8563470840454102, + "num_tokens": 596873262.0, + "step": 15647 + }, + { + "epoch": 1.9905864393843022, + "ewc_loss": 0.04554359242320061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277179555676412e-05, + "grad_norm": 29.640438079833984, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8696616291999817, + "num_tokens": 596912833.0, + "step": 15648 + }, + { + "epoch": 1.9907136496628928, + "ewc_loss": 0.04555368050932884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277683961438015e-05, + "grad_norm": 29.563066482543945, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8764986395835876, + "num_tokens": 596953554.0, + "step": 15649 + }, + { + "epoch": 1.9908408599414833, + "ewc_loss": 0.04539747163653374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.26987358473707e-05, + "grad_norm": 29.484474182128906, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8641643524169922, + "num_tokens": 596991206.0, + "step": 15650 + }, + { + "epoch": 1.9909680702200738, + "ewc_loss": 0.04555036500096321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277518251503352e-05, + "grad_norm": 29.622751235961914, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8709708452224731, + "num_tokens": 597027222.0, + "step": 15651 + }, + { + "epoch": 1.9910952804986644, + "ewc_loss": 0.04547114297747612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2735572201781906e-05, + "grad_norm": 29.473217010498047, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8777303695678711, + "num_tokens": 597069040.0, + "step": 15652 + }, + { + "epoch": 1.9912224907772549, + "ewc_loss": 0.045494791120290756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2747395632904954e-05, + "grad_norm": 29.647602081298828, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8612955808639526, + "num_tokens": 597112529.0, + "step": 15653 + }, + { + "epoch": 1.9913497010558454, + "ewc_loss": 0.045508973300457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2754486053599976e-05, + "grad_norm": 29.522695541381836, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8706202507019043, + "num_tokens": 597148304.0, + "step": 15654 + }, + { + "epoch": 1.991476911334436, + "ewc_loss": 0.045418478548526764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2709238692186773e-05, + "grad_norm": 29.528894424438477, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8781229257583618, + "num_tokens": 597184244.0, + "step": 15655 + }, + { + "epoch": 1.9916041216130265, + "ewc_loss": 0.04556930065155029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2784650354878977e-05, + "grad_norm": 29.606340408325195, + "learning_rate": 1e-06, + "loss": 0.4474, + 
"mean_token_accuracy": 0.8607504367828369, + "num_tokens": 597218178.0, + "step": 15656 + }, + { + "epoch": 1.991731331891617, + "ewc_loss": 0.045504067093133926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2752034055883996e-05, + "grad_norm": 29.534440994262695, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8662936091423035, + "num_tokens": 597259256.0, + "step": 15657 + }, + { + "epoch": 1.9918585421702073, + "ewc_loss": 0.045511823147535324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2755912141292356e-05, + "grad_norm": 29.533504486083984, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8794808983802795, + "num_tokens": 597296584.0, + "step": 15658 + }, + { + "epoch": 1.9919857524487978, + "ewc_loss": 0.045495156198740005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274757753184531e-05, + "grad_norm": 29.522544860839844, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8538352251052856, + "num_tokens": 597341116.0, + "step": 15659 + }, + { + "epoch": 1.9921129627273884, + "ewc_loss": 0.04553788900375366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2768945200368762e-05, + "grad_norm": 29.660032272338867, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8662378191947937, + "num_tokens": 597375595.0, + "step": 15660 + }, + { + "epoch": 1.9922401730059789, + "ewc_loss": 0.04556334391236305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2781672669225372e-05, + "grad_norm": 29.515453338623047, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8554332256317139, + "num_tokens": 597419226.0, + "step": 15661 + }, + { + "epoch": 1.9923673832845694, + "ewc_loss": 0.04542388394474983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2711941710440442e-05, + "grad_norm": 29.609352111816406, + "learning_rate": 1e-06, + "loss": 0.4886, + "mean_token_accuracy": 0.8490997552871704, + "num_tokens": 597461242.0, + "step": 15662 + }, + { + "epoch": 
1.99249459356316, + "ewc_loss": 0.045600052922964096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.280002627230715e-05, + "grad_norm": 29.640390396118164, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8681809902191162, + "num_tokens": 597500030.0, + "step": 15663 + }, + { + "epoch": 1.9926218038417502, + "ewc_loss": 0.04546918347477913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2734591766493395e-05, + "grad_norm": 29.57567024230957, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8531782627105713, + "num_tokens": 597540428.0, + "step": 15664 + }, + { + "epoch": 1.9927490141203408, + "ewc_loss": 0.04554416239261627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2772081138100475e-05, + "grad_norm": 29.733896255493164, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8795610666275024, + "num_tokens": 597575070.0, + "step": 15665 + }, + { + "epoch": 1.9928762243989313, + "ewc_loss": 0.045453161001205444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.272658093716018e-05, + "grad_norm": 29.47509765625, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8667755126953125, + "num_tokens": 597617425.0, + "step": 15666 + }, + { + "epoch": 1.9930034346775218, + "ewc_loss": 0.045357026159763336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2678512323182076e-05, + "grad_norm": 29.58032989501953, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8748966455459595, + "num_tokens": 597660489.0, + "step": 15667 + }, + { + "epoch": 1.9931306449561124, + "ewc_loss": 0.04553551226854324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2767755581298843e-05, + "grad_norm": 29.604671478271484, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8742748498916626, + "num_tokens": 597703626.0, + "step": 15668 + }, + { + "epoch": 1.993257855234703, + "ewc_loss": 0.04544250667095184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2721253117197193e-05, + "grad_norm": 29.490020751953125, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8634294867515564, + "num_tokens": 597741176.0, + "step": 15669 + }, + { + "epoch": 1.9933850655132934, + "ewc_loss": 0.04550519958138466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.27525997615885e-05, + "grad_norm": 29.71906852722168, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8571437001228333, + "num_tokens": 597779763.0, + "step": 15670 + }, + { + "epoch": 1.993512275791884, + "ewc_loss": 0.0455336831510067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2766842448618263e-05, + "grad_norm": 29.608261108398438, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8726835250854492, + "num_tokens": 597813300.0, + "step": 15671 + }, + { + "epoch": 1.9936394860704745, + "ewc_loss": 0.045454785227775574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.272739220643416e-05, + "grad_norm": 29.626480102539062, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8579688668251038, + "num_tokens": 597849062.0, + "step": 15672 + }, + { + "epoch": 1.993766696349065, + "ewc_loss": 0.04542284458875656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2711421479471028e-05, + "grad_norm": 29.66667938232422, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8699644804000854, + "num_tokens": 597886565.0, + "step": 15673 + }, + { + "epoch": 1.9938939066276555, + "ewc_loss": 0.045395929366350174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2697964595863596e-05, + "grad_norm": 29.633216857910156, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8708562850952148, + "num_tokens": 597925097.0, + "step": 15674 + }, + { + "epoch": 1.994021116906246, + "ewc_loss": 0.04543431103229523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2717154934071004e-05, + "grad_norm": 29.611053466796875, + "learning_rate": 1e-06, + "loss": 0.3924, + 
"mean_token_accuracy": 0.8811195492744446, + "num_tokens": 597960786.0, + "step": 15675 + }, + { + "epoch": 1.9941483271848366, + "ewc_loss": 0.04538491368293762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.269245669594966e-05, + "grad_norm": 29.58005714416504, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8505431413650513, + "num_tokens": 598000442.0, + "step": 15676 + }, + { + "epoch": 1.9942755374634271, + "ewc_loss": 0.045396823436021805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2698412067256868e-05, + "grad_norm": 29.66037368774414, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8661447763442993, + "num_tokens": 598042946.0, + "step": 15677 + }, + { + "epoch": 1.9944027477420176, + "ewc_loss": 0.045405998826026917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270299955853261e-05, + "grad_norm": 29.59160614013672, + "learning_rate": 1e-06, + "loss": 0.5008, + "mean_token_accuracy": 0.8485732078552246, + "num_tokens": 598081659.0, + "step": 15678 + }, + { + "epoch": 1.9945299580206082, + "ewc_loss": 0.04543478041887283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2717389583704062e-05, + "grad_norm": 29.582271575927734, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8625073432922363, + "num_tokens": 598120203.0, + "step": 15679 + }, + { + "epoch": 1.9946571682991987, + "ewc_loss": 0.04540974274277687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.270487129862886e-05, + "grad_norm": 29.51850700378418, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8554927110671997, + "num_tokens": 598154779.0, + "step": 15680 + }, + { + "epoch": 1.9947843785777892, + "ewc_loss": 0.045364800840616226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2682401322526857e-05, + "grad_norm": 29.526042938232422, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8672252893447876, + "num_tokens": 598197075.0, + "step": 15681 + }, + { + "epoch": 
1.9949115888563798, + "ewc_loss": 0.045503612607717514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2751806682208553e-05, + "grad_norm": 29.51923179626465, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8609129190444946, + "num_tokens": 598240539.0, + "step": 15682 + }, + { + "epoch": 1.99503879913497, + "ewc_loss": 0.045444004237651825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2722002540831454e-05, + "grad_norm": 29.534223556518555, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.869785726070404, + "num_tokens": 598281776.0, + "step": 15683 + }, + { + "epoch": 1.9951660094135606, + "ewc_loss": 0.04549410194158554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2747051843907684e-05, + "grad_norm": 29.471839904785156, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.874730110168457, + "num_tokens": 598322715.0, + "step": 15684 + }, + { + "epoch": 1.9952932196921511, + "ewc_loss": 0.04547112062573433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.273555946885608e-05, + "grad_norm": 29.663951873779297, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8623323440551758, + "num_tokens": 598364495.0, + "step": 15685 + }, + { + "epoch": 1.9954204299707416, + "ewc_loss": 0.04558694362640381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2793472453486174e-05, + "grad_norm": 29.55365562438965, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8646122813224792, + "num_tokens": 598407102.0, + "step": 15686 + }, + { + "epoch": 1.9955476402493322, + "ewc_loss": 0.04540986940264702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2704934963257983e-05, + "grad_norm": 29.50467300415039, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8551799058914185, + "num_tokens": 598448600.0, + "step": 15687 + }, + { + "epoch": 1.9956748505279227, + "ewc_loss": 0.0455918051302433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.279590262332931e-05, + "grad_norm": 29.81197166442871, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8578487038612366, + "num_tokens": 598491140.0, + "step": 15688 + }, + { + "epoch": 1.995802060806513, + "ewc_loss": 0.04548519477248192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.27425971388584e-05, + "grad_norm": 29.454265594482422, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8635236024856567, + "num_tokens": 598527831.0, + "step": 15689 + }, + { + "epoch": 1.9959292710851035, + "ewc_loss": 0.04532929137349129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2664646166958846e-05, + "grad_norm": 29.59432029724121, + "learning_rate": 1e-06, + "loss": 0.475, + "mean_token_accuracy": 0.8550639152526855, + "num_tokens": 598568680.0, + "step": 15690 + }, + { + "epoch": 1.996056481363694, + "ewc_loss": 0.0454784594476223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2739230189472437e-05, + "grad_norm": 29.434810638427734, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8567487001419067, + "num_tokens": 598606254.0, + "step": 15691 + }, + { + "epoch": 1.9961836916422846, + "ewc_loss": 0.0455271452665329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2763571905670688e-05, + "grad_norm": 29.66748046875, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8699020743370056, + "num_tokens": 598642400.0, + "step": 15692 + }, + { + "epoch": 1.9963109019208751, + "ewc_loss": 0.045587316155433655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2793657990405336e-05, + "grad_norm": 29.45417594909668, + "learning_rate": 1e-06, + "loss": 0.4922, + "mean_token_accuracy": 0.8463038802146912, + "num_tokens": 598678379.0, + "step": 15693 + }, + { + "epoch": 1.9964381121994657, + "ewc_loss": 0.04543311148881912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2716556486557238e-05, + "grad_norm": 29.478500366210938, + "learning_rate": 1e-06, + "loss": 0.4332, + 
"mean_token_accuracy": 0.8658870458602905, + "num_tokens": 598718234.0, + "step": 15694 + }, + { + "epoch": 1.9965653224780562, + "ewc_loss": 0.045598141849040985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279907130287029e-05, + "grad_norm": 29.618343353271484, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8706536293029785, + "num_tokens": 598754897.0, + "step": 15695 + }, + { + "epoch": 1.9966925327566467, + "ewc_loss": 0.045542746782302856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2771373551222496e-05, + "grad_norm": 29.587560653686523, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8616814613342285, + "num_tokens": 598785268.0, + "step": 15696 + }, + { + "epoch": 1.9968197430352372, + "ewc_loss": 0.045557308942079544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277865496580489e-05, + "grad_norm": 29.597246170043945, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8737007975578308, + "num_tokens": 598827131.0, + "step": 15697 + }, + { + "epoch": 1.9969469533138278, + "ewc_loss": 0.045451752841472626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2725876988261007e-05, + "grad_norm": 29.560670852661133, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8688820600509644, + "num_tokens": 598859507.0, + "step": 15698 + }, + { + "epoch": 1.9970741635924183, + "ewc_loss": 0.0455612950026989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2780646759201773e-05, + "grad_norm": 29.612224578857422, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8578747510910034, + "num_tokens": 598893280.0, + "step": 15699 + }, + { + "epoch": 1.9972013738710088, + "ewc_loss": 0.045549314469099045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2774656827095896e-05, + "grad_norm": 29.507858276367188, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8817332983016968, + "num_tokens": 598932759.0, + "step": 15700 + }, + { + "epoch": 
1.9973285841495994, + "ewc_loss": 0.045567356050014496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.278367719554808e-05, + "grad_norm": 29.558643341064453, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8774706125259399, + "num_tokens": 598972899.0, + "step": 15701 + }, + { + "epoch": 1.9974557944281899, + "ewc_loss": 0.04562105983495712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2810529117123224e-05, + "grad_norm": 29.563901901245117, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8690071105957031, + "num_tokens": 599012811.0, + "step": 15702 + }, + { + "epoch": 1.9975830047067804, + "ewc_loss": 0.045607585459947586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2803793399361894e-05, + "grad_norm": 29.62799835205078, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8584897518157959, + "num_tokens": 599055632.0, + "step": 15703 + }, + { + "epoch": 1.997710214985371, + "ewc_loss": 0.045624446123838425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2812222596257925e-05, + "grad_norm": 29.660430908203125, + "learning_rate": 1e-06, + "loss": 0.4744, + "mean_token_accuracy": 0.8496723175048828, + "num_tokens": 599090924.0, + "step": 15704 + }, + { + "epoch": 1.9978374252639615, + "ewc_loss": 0.04558412730693817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279206455568783e-05, + "grad_norm": 29.46360969543457, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8815675377845764, + "num_tokens": 599130434.0, + "step": 15705 + }, + { + "epoch": 1.997964635542552, + "ewc_loss": 0.045574039220809937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2787018679082394e-05, + "grad_norm": 29.582300186157227, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8668529987335205, + "num_tokens": 599171018.0, + "step": 15706 + }, + { + "epoch": 1.9980918458211423, + "ewc_loss": 0.04566193372011185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2830967282061465e-05, + "grad_norm": 29.50846290588379, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8591006398200989, + "num_tokens": 599212445.0, + "step": 15707 + }, + { + "epoch": 1.9982190560997328, + "ewc_loss": 0.045643098652362823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2821548554929905e-05, + "grad_norm": 29.69059181213379, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8724240064620972, + "num_tokens": 599250591.0, + "step": 15708 + }, + { + "epoch": 1.9983462663783234, + "ewc_loss": 0.045686982572078705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.284349102410488e-05, + "grad_norm": 29.566036224365234, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.862762451171875, + "num_tokens": 599284987.0, + "step": 15709 + }, + { + "epoch": 1.9984734766569139, + "ewc_loss": 0.04559925198554993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279962609463837e-05, + "grad_norm": 29.60777473449707, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8631146550178528, + "num_tokens": 599324656.0, + "step": 15710 + }, + { + "epoch": 1.9986006869355044, + "ewc_loss": 0.045690249651670456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2845124476589262e-05, + "grad_norm": 29.673240661621094, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8786638975143433, + "num_tokens": 599364014.0, + "step": 15711 + }, + { + "epoch": 1.998727897214095, + "ewc_loss": 0.04555385932326317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2776928744860925e-05, + "grad_norm": 29.599855422973633, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8594944477081299, + "num_tokens": 599396781.0, + "step": 15712 + }, + { + "epoch": 1.9988551074926852, + "ewc_loss": 0.045561883598566055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2780941435485147e-05, + "grad_norm": 29.644245147705078, + "learning_rate": 1e-06, + "loss": 0.3981, + 
"mean_token_accuracy": 0.8772252202033997, + "num_tokens": 599433240.0, + "step": 15713 + }, + { + "epoch": 1.9989823177712758, + "ewc_loss": 0.045535024255514145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2767511836718768e-05, + "grad_norm": 29.48680877685547, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8768396377563477, + "num_tokens": 599474883.0, + "step": 15714 + }, + { + "epoch": 1.9991095280498663, + "ewc_loss": 0.04548695310950279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2743475710740313e-05, + "grad_norm": 29.624483108520508, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.868354320526123, + "num_tokens": 599508492.0, + "step": 15715 + }, + { + "epoch": 1.9992367383284568, + "ewc_loss": 0.045619215816259384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.280960870848503e-05, + "grad_norm": 29.575138092041016, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8700995445251465, + "num_tokens": 599545946.0, + "step": 15716 + }, + { + "epoch": 1.9993639486070474, + "ewc_loss": 0.04551674425601959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.275837141496595e-05, + "grad_norm": 29.617416381835938, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8804893493652344, + "num_tokens": 599586217.0, + "step": 15717 + }, + { + "epoch": 1.9994911588856379, + "ewc_loss": 0.045604489743709564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2802245439379476e-05, + "grad_norm": 29.617372512817383, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8752106428146362, + "num_tokens": 599620431.0, + "step": 15718 + }, + { + "epoch": 1.9996183691642284, + "ewc_loss": 0.045496225357055664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.274811231472995e-05, + "grad_norm": 29.569499969482422, + "learning_rate": 1e-06, + "loss": 0.4775, + "mean_token_accuracy": 0.8540172576904297, + "num_tokens": 599659959.0, + "step": 15719 + }, + { + "epoch": 
1.999745579442819, + "ewc_loss": 0.04550159350037575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2750797143089585e-05, + "grad_norm": 29.627782821655273, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8655328750610352, + "num_tokens": 599696558.0, + "step": 15720 + }, + { + "epoch": 1.9998727897214095, + "ewc_loss": 0.04558994248509407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2794971300754696e-05, + "grad_norm": 29.586181640625, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8566203713417053, + "num_tokens": 599734925.0, + "step": 15721 + }, + { + "epoch": 2.0, + "ewc_loss": 0.04556824639439583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.278412284795195e-05, + "grad_norm": 29.641592025756836, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8712549805641174, + "num_tokens": 599772613.0, + "step": 15722 + }, + { + "epoch": 2.0001272102785905, + "ewc_loss": 0.045623477548360825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2811738745076582e-05, + "grad_norm": 29.535322189331055, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8720159530639648, + "num_tokens": 599813892.0, + "step": 15723 + }, + { + "epoch": 2.000254420557181, + "ewc_loss": 0.04555998370051384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2779991923016496e-05, + "grad_norm": 29.68985366821289, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8695939779281616, + "num_tokens": 599852807.0, + "step": 15724 + }, + { + "epoch": 2.0003816308357716, + "ewc_loss": 0.04557913541793823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.278956708323676e-05, + "grad_norm": 29.571521759033203, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8753855228424072, + "num_tokens": 599893712.0, + "step": 15725 + }, + { + "epoch": 2.000508841114362, + "ewc_loss": 0.04551512748003006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2757563783670776e-05, + 
"grad_norm": 29.580333709716797, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8708558082580566, + "num_tokens": 599929954.0, + "step": 15726 + }, + { + "epoch": 2.0006360513929526, + "ewc_loss": 0.045630671083927155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2815334887127392e-05, + "grad_norm": 29.535478591918945, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8669871687889099, + "num_tokens": 599967763.0, + "step": 15727 + }, + { + "epoch": 2.000763261671543, + "ewc_loss": 0.045562658458948135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2781328880228102e-05, + "grad_norm": 29.657899856567383, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.877728283405304, + "num_tokens": 600008938.0, + "step": 15728 + }, + { + "epoch": 2.0008904719501337, + "ewc_loss": 0.04563271254301071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2816357159172185e-05, + "grad_norm": 29.474594116210938, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8651175498962402, + "num_tokens": 600051849.0, + "step": 15729 + }, + { + "epoch": 2.0010176822287242, + "ewc_loss": 0.0455058328807354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2752916265744716e-05, + "grad_norm": 29.5837459564209, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.874911367893219, + "num_tokens": 600088568.0, + "step": 15730 + }, + { + "epoch": 2.0011448925073148, + "ewc_loss": 0.04561930522322655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2809652364230715e-05, + "grad_norm": 29.457876205444336, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8732945322990417, + "num_tokens": 600131553.0, + "step": 15731 + }, + { + "epoch": 2.0012721027859053, + "ewc_loss": 0.045603375881910324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.280168882862199e-05, + "grad_norm": 29.621349334716797, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 
0.8698773384094238, + "num_tokens": 600173198.0, + "step": 15732 + }, + { + "epoch": 2.001399313064496, + "ewc_loss": 0.045692168176174164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2846084902994335e-05, + "grad_norm": 29.580167770385742, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8741698265075684, + "num_tokens": 600211347.0, + "step": 15733 + }, + { + "epoch": 2.0015265233430863, + "ewc_loss": 0.04549912363290787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2749562049284577e-05, + "grad_norm": 29.502132415771484, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8640358448028564, + "num_tokens": 600249953.0, + "step": 15734 + }, + { + "epoch": 2.0016537336216764, + "ewc_loss": 0.04562683030962944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.281341585330665e-05, + "grad_norm": 29.633041381835938, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8617175221443176, + "num_tokens": 600291211.0, + "step": 15735 + }, + { + "epoch": 2.001780943900267, + "ewc_loss": 0.045623116195201874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.281155866512563e-05, + "grad_norm": 29.578807830810547, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8744145631790161, + "num_tokens": 600332906.0, + "step": 15736 + }, + { + "epoch": 2.0019081541788575, + "ewc_loss": 0.045565783977508545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2782891392125748e-05, + "grad_norm": 29.58155632019043, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8698197603225708, + "num_tokens": 600370570.0, + "step": 15737 + }, + { + "epoch": 2.002035364457448, + "ewc_loss": 0.04556368663907051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2781843654229306e-05, + "grad_norm": 29.56972885131836, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8603389263153076, + "num_tokens": 600409368.0, + "step": 15738 + }, + { + "epoch": 2.0021625747360385, + 
"ewc_loss": 0.04561026394367218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2805132175562903e-05, + "grad_norm": 29.6053524017334, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8726959824562073, + "num_tokens": 600442019.0, + "step": 15739 + }, + { + "epoch": 2.002289785014629, + "ewc_loss": 0.04557568207383156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2787840862292796e-05, + "grad_norm": 29.542579650878906, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8785036206245422, + "num_tokens": 600471665.0, + "step": 15740 + }, + { + "epoch": 2.0024169952932196, + "ewc_loss": 0.045631349086761475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2815675038145855e-05, + "grad_norm": 29.639413833618164, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8676844835281372, + "num_tokens": 600508804.0, + "step": 15741 + }, + { + "epoch": 2.00254420557181, + "ewc_loss": 0.04557701572775841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2788508431403898e-05, + "grad_norm": 29.51424789428711, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.8586194515228271, + "num_tokens": 600543895.0, + "step": 15742 + }, + { + "epoch": 2.0026714158504006, + "ewc_loss": 0.04561483487486839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2807416826253757e-05, + "grad_norm": 29.66813850402832, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8713427186012268, + "num_tokens": 600585818.0, + "step": 15743 + }, + { + "epoch": 2.002798626128991, + "ewc_loss": 0.04567550867795944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2837753931526095e-05, + "grad_norm": 29.616069793701172, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8734809160232544, + "num_tokens": 600623106.0, + "step": 15744 + }, + { + "epoch": 2.0029258364075817, + "ewc_loss": 0.045554276555776596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2777137928642333e-05, + 
"grad_norm": 29.580961227416992, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8911381959915161, + "num_tokens": 600662662.0, + "step": 15745 + }, + { + "epoch": 2.0030530466861722, + "ewc_loss": 0.04562641307711601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2813206669525243e-05, + "grad_norm": 29.653484344482422, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8565316796302795, + "num_tokens": 600699416.0, + "step": 15746 + }, + { + "epoch": 2.0031802569647628, + "ewc_loss": 0.04566172882914543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2830863599665463e-05, + "grad_norm": 29.62437629699707, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8625819683074951, + "num_tokens": 600739351.0, + "step": 15747 + }, + { + "epoch": 2.0033074672433533, + "ewc_loss": 0.04554547742009163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2772737793275155e-05, + "grad_norm": 29.510311126708984, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8712406158447266, + "num_tokens": 600781198.0, + "step": 15748 + }, + { + "epoch": 2.003434677521944, + "ewc_loss": 0.04559548199176788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2797741621616296e-05, + "grad_norm": 29.56344223022461, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8678702116012573, + "num_tokens": 600821743.0, + "step": 15749 + }, + { + "epoch": 2.0035618878005343, + "ewc_loss": 0.04567232355475426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2836162315797992e-05, + "grad_norm": 29.610198974609375, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8853999376296997, + "num_tokens": 600862463.0, + "step": 15750 + }, + { + "epoch": 2.003689098079125, + "ewc_loss": 0.04568200558423996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2841002646600828e-05, + "grad_norm": 29.598770141601562, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 
0.8600025773048401, + "num_tokens": 600902629.0, + "step": 15751 + }, + { + "epoch": 2.0038163083577154, + "ewc_loss": 0.04561585560441017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2807927962276153e-05, + "grad_norm": 29.565282821655273, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8675272464752197, + "num_tokens": 600941778.0, + "step": 15752 + }, + { + "epoch": 2.003943518636306, + "ewc_loss": 0.04564039781689644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2820198864792474e-05, + "grad_norm": 29.590240478515625, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8775548338890076, + "num_tokens": 600979425.0, + "step": 15753 + }, + { + "epoch": 2.0040707289148965, + "ewc_loss": 0.04566308483481407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2831542082712986e-05, + "grad_norm": 29.62664794921875, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8658231496810913, + "num_tokens": 601017882.0, + "step": 15754 + }, + { + "epoch": 2.004197939193487, + "ewc_loss": 0.045591969043016434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279598447785247e-05, + "grad_norm": 29.606611251831055, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8632993698120117, + "num_tokens": 601053394.0, + "step": 15755 + }, + { + "epoch": 2.0043251494720775, + "ewc_loss": 0.04562850296497345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.281425076944288e-05, + "grad_norm": 29.55677604675293, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8733494281768799, + "num_tokens": 601094940.0, + "step": 15756 + }, + { + "epoch": 2.004452359750668, + "ewc_loss": 0.04563182592391968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.281591332575772e-05, + "grad_norm": 29.71328353881836, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8710839748382568, + "num_tokens": 601132963.0, + "step": 15757 + }, + { + "epoch": 2.0045795700292586, + "ewc_loss": 
0.045640986412763596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2820493541075848e-05, + "grad_norm": 29.57185173034668, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8695693016052246, + "num_tokens": 601168511.0, + "step": 15758 + }, + { + "epoch": 2.0047067803078487, + "ewc_loss": 0.04555892199277878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277946077811066e-05, + "grad_norm": 29.53294563293457, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8693526387214661, + "num_tokens": 601208922.0, + "step": 15759 + }, + { + "epoch": 2.004833990586439, + "ewc_loss": 0.04572764411568642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.286382186866831e-05, + "grad_norm": 29.681795120239258, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8635753989219666, + "num_tokens": 601253814.0, + "step": 15760 + }, + { + "epoch": 2.0049612008650297, + "ewc_loss": 0.04563877731561661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2819389414507896e-05, + "grad_norm": 29.614288330078125, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8715810775756836, + "num_tokens": 601295584.0, + "step": 15761 + }, + { + "epoch": 2.0050884111436202, + "ewc_loss": 0.04559158906340599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2795795302954502e-05, + "grad_norm": 29.6082706451416, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8630327582359314, + "num_tokens": 601329134.0, + "step": 15762 + }, + { + "epoch": 2.0052156214222108, + "ewc_loss": 0.045560322701931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2780161089031026e-05, + "grad_norm": 29.534034729003906, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8788226246833801, + "num_tokens": 601369100.0, + "step": 15763 + }, + { + "epoch": 2.0053428317008013, + "ewc_loss": 0.04558837413787842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2794187316321768e-05, + "grad_norm": 
29.571590423583984, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.884273886680603, + "num_tokens": 601407876.0, + "step": 15764 + }, + { + "epoch": 2.005470041979392, + "ewc_loss": 0.04560376703739166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.280188346048817e-05, + "grad_norm": 29.548534393310547, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8832392692565918, + "num_tokens": 601446996.0, + "step": 15765 + }, + { + "epoch": 2.0055972522579824, + "ewc_loss": 0.04567861929535866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2839309167466126e-05, + "grad_norm": 29.550918579101562, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8619238138198853, + "num_tokens": 601482158.0, + "step": 15766 + }, + { + "epoch": 2.005724462536573, + "ewc_loss": 0.04561685398221016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2808426365372725e-05, + "grad_norm": 29.6141300201416, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8761154413223267, + "num_tokens": 601517940.0, + "step": 15767 + }, + { + "epoch": 2.0058516728151634, + "ewc_loss": 0.04566703736782074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.283351932419464e-05, + "grad_norm": 29.51234245300293, + "learning_rate": 1e-06, + "loss": 0.3515, + "mean_token_accuracy": 0.8904187679290771, + "num_tokens": 601553823.0, + "step": 15768 + }, + { + "epoch": 2.005978883093754, + "ewc_loss": 0.04562205448746681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2811027520219795e-05, + "grad_norm": 29.621437072753906, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8585739731788635, + "num_tokens": 601595482.0, + "step": 15769 + }, + { + "epoch": 2.0061060933723445, + "ewc_loss": 0.04563438892364502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2817193894297816e-05, + "grad_norm": 29.48443603515625, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8778172731399536, + 
"num_tokens": 601632451.0, + "step": 15770 + }, + { + "epoch": 2.006233303650935, + "ewc_loss": 0.04561861231923103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.280930675624404e-05, + "grad_norm": 29.6932315826416, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8673332929611206, + "num_tokens": 601667573.0, + "step": 15771 + }, + { + "epoch": 2.0063605139295255, + "ewc_loss": 0.04567406326532364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2837031792732887e-05, + "grad_norm": 29.631973266601562, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.876812219619751, + "num_tokens": 601704494.0, + "step": 15772 + }, + { + "epoch": 2.006487724208116, + "ewc_loss": 0.04563180357217789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2815902411821298e-05, + "grad_norm": 29.757217407226562, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8605690002441406, + "num_tokens": 601745307.0, + "step": 15773 + }, + { + "epoch": 2.0066149344867066, + "ewc_loss": 0.045637212693691254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2818605430074967e-05, + "grad_norm": 29.74347496032715, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8855172395706177, + "num_tokens": 601782551.0, + "step": 15774 + }, + { + "epoch": 2.006742144765297, + "ewc_loss": 0.04549971595406532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2749858544557355e-05, + "grad_norm": 29.50971221923828, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8716162443161011, + "num_tokens": 601823378.0, + "step": 15775 + }, + { + "epoch": 2.0068693550438876, + "ewc_loss": 0.04563402011990547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2817010176368058e-05, + "grad_norm": 29.7130126953125, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8708013296127319, + "num_tokens": 601860881.0, + "step": 15776 + }, + { + "epoch": 2.006996565322478, + "ewc_loss": 0.04566590115427971, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.283294998051133e-05, + "grad_norm": 29.660091400146484, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8687456846237183, + "num_tokens": 601903502.0, + "step": 15777 + }, + { + "epoch": 2.0071237756010687, + "ewc_loss": 0.04555654898285866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277827479701955e-05, + "grad_norm": 29.581512451171875, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8874629735946655, + "num_tokens": 601937097.0, + "step": 15778 + }, + { + "epoch": 2.007250985879659, + "ewc_loss": 0.04559078812599182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279539330629632e-05, + "grad_norm": 29.592866897583008, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8733446002006531, + "num_tokens": 601978050.0, + "step": 15779 + }, + { + "epoch": 2.0073781961582498, + "ewc_loss": 0.04563535377383232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.281767774547916e-05, + "grad_norm": 29.64409065246582, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8691244125366211, + "num_tokens": 602018167.0, + "step": 15780 + }, + { + "epoch": 2.0075054064368403, + "ewc_loss": 0.045575279742479324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2787640773458406e-05, + "grad_norm": 29.748151779174805, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8553656339645386, + "num_tokens": 602054485.0, + "step": 15781 + }, + { + "epoch": 2.007632616715431, + "ewc_loss": 0.04559338465332985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279669206473045e-05, + "grad_norm": 29.775043487548828, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8754715323448181, + "num_tokens": 602092881.0, + "step": 15782 + }, + { + "epoch": 2.0077598269940213, + "ewc_loss": 0.045555874705314636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.277793646499049e-05, + "grad_norm": 29.50225257873535, + 
"learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8722538352012634, + "num_tokens": 602127424.0, + "step": 15783 + }, + { + "epoch": 2.0078870372726114, + "ewc_loss": 0.04555364325642586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2776821424486116e-05, + "grad_norm": 29.80069351196289, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8774827122688293, + "num_tokens": 602168340.0, + "step": 15784 + }, + { + "epoch": 2.008014247551202, + "ewc_loss": 0.04562191292643547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2810956579633057e-05, + "grad_norm": 29.589841842651367, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8617370128631592, + "num_tokens": 602207894.0, + "step": 15785 + }, + { + "epoch": 2.0081414578297925, + "ewc_loss": 0.04550136253237724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2750680727767758e-05, + "grad_norm": 29.598711013793945, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8724459409713745, + "num_tokens": 602245924.0, + "step": 15786 + }, + { + "epoch": 2.008268668108383, + "ewc_loss": 0.04566040262579918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2830201487522572e-05, + "grad_norm": 29.804397583007812, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8691670894622803, + "num_tokens": 602278417.0, + "step": 15787 + }, + { + "epoch": 2.0083958783869735, + "ewc_loss": 0.04560447111725807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2802236344432458e-05, + "grad_norm": 29.59358787536621, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8735179901123047, + "num_tokens": 602318431.0, + "step": 15788 + }, + { + "epoch": 2.008523088665564, + "ewc_loss": 0.045503802597522736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2751901269657537e-05, + "grad_norm": 29.529193878173828, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.862488329410553, + "num_tokens": 602356315.0, 
+ "step": 15789 + }, + { + "epoch": 2.0086502989441546, + "ewc_loss": 0.04564400389790535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2822001483291388e-05, + "grad_norm": 29.66094398498535, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8845031261444092, + "num_tokens": 602398865.0, + "step": 15790 + }, + { + "epoch": 2.008777509222745, + "ewc_loss": 0.04557463899254799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.278731881233398e-05, + "grad_norm": 29.600086212158203, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8803537487983704, + "num_tokens": 602435835.0, + "step": 15791 + }, + { + "epoch": 2.0089047195013356, + "ewc_loss": 0.04561179131269455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2805896151112393e-05, + "grad_norm": 29.60618782043457, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8775342106819153, + "num_tokens": 602470359.0, + "step": 15792 + }, + { + "epoch": 2.009031929779926, + "ewc_loss": 0.045589543879032135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2794771211920306e-05, + "grad_norm": 29.710025787353516, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8745812177658081, + "num_tokens": 602512132.0, + "step": 15793 + }, + { + "epoch": 2.0091591400585167, + "ewc_loss": 0.045685648918151855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.284282527398318e-05, + "grad_norm": 29.580623626708984, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.868048369884491, + "num_tokens": 602552700.0, + "step": 15794 + }, + { + "epoch": 2.0092863503371072, + "ewc_loss": 0.0456402525305748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2820126105216332e-05, + "grad_norm": 29.65727996826172, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8696988821029663, + "num_tokens": 602587779.0, + "step": 15795 + }, + { + "epoch": 2.0094135606156978, + "ewc_loss": 0.045639485120773315, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 2.2819742298452184e-05, + "grad_norm": 29.56125831604004, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8603605628013611, + "num_tokens": 602623834.0, + "step": 15796 + }, + { + "epoch": 2.0095407708942883, + "ewc_loss": 0.04568862542510033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.284431320731528e-05, + "grad_norm": 29.742401123046875, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8814688920974731, + "num_tokens": 602660141.0, + "step": 15797 + }, + { + "epoch": 2.009667981172879, + "ewc_loss": 0.04567977786064148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2839889425085858e-05, + "grad_norm": 29.713138580322266, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8755831718444824, + "num_tokens": 602698990.0, + "step": 15798 + }, + { + "epoch": 2.0097951914514693, + "ewc_loss": 0.045557018369436264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2778509446652606e-05, + "grad_norm": 29.619518280029297, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8528223633766174, + "num_tokens": 602732053.0, + "step": 15799 + }, + { + "epoch": 2.00992240173006, + "ewc_loss": 0.04561454430222511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2807271307101473e-05, + "grad_norm": 29.677711486816406, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8720126152038574, + "num_tokens": 602770975.0, + "step": 15800 + }, + { + "epoch": 2.0100496120086504, + "ewc_loss": 0.04561108350753784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.28055414481787e-05, + "grad_norm": 29.58513069152832, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.861814022064209, + "num_tokens": 602813207.0, + "step": 15801 + }, + { + "epoch": 2.010176822287241, + "ewc_loss": 0.045656539499759674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2828269720776007e-05, + "grad_norm": 29.625526428222656, + "learning_rate": 1e-06, + "loss": 
0.4049, + "mean_token_accuracy": 0.8772914409637451, + "num_tokens": 602849951.0, + "step": 15802 + }, + { + "epoch": 2.0103040325658315, + "ewc_loss": 0.04563639312982559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.281819615745917e-05, + "grad_norm": 29.698633193969727, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8611010313034058, + "num_tokens": 602891418.0, + "step": 15803 + }, + { + "epoch": 2.010431242844422, + "ewc_loss": 0.04569470137357712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2847350919619203e-05, + "grad_norm": 29.64499855041504, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8768510222434998, + "num_tokens": 602933039.0, + "step": 15804 + }, + { + "epoch": 2.0105584531230125, + "ewc_loss": 0.04557310789823532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.278655483678449e-05, + "grad_norm": 29.638702392578125, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8854552507400513, + "num_tokens": 602965546.0, + "step": 15805 + }, + { + "epoch": 2.010685663401603, + "ewc_loss": 0.04563991725444794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2819958758191206e-05, + "grad_norm": 29.57274055480957, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8701210021972656, + "num_tokens": 603004473.0, + "step": 15806 + }, + { + "epoch": 2.0108128736801936, + "ewc_loss": 0.04565904662013054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.282952300447505e-05, + "grad_norm": 29.663570404052734, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.86295485496521, + "num_tokens": 603041644.0, + "step": 15807 + }, + { + "epoch": 2.0109400839587837, + "ewc_loss": 0.04559661075472832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2798305508331396e-05, + "grad_norm": 29.547414779663086, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8578430414199829, + "num_tokens": 603084698.0, + "step": 15808 + }, + { + "epoch": 
2.011067294237374, + "ewc_loss": 0.04568284749984741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2841422833153047e-05, + "grad_norm": 29.690343856811523, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 0.8548352718353271, + "num_tokens": 603121059.0, + "step": 15809 + }, + { + "epoch": 2.0111945045159647, + "ewc_loss": 0.04573820158839226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.28691005759174e-05, + "grad_norm": 29.593881607055664, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8679312467575073, + "num_tokens": 603156604.0, + "step": 15810 + }, + { + "epoch": 2.0113217147945552, + "ewc_loss": 0.045656315982341766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.282815876242239e-05, + "grad_norm": 29.571557998657227, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.870591402053833, + "num_tokens": 603183743.0, + "step": 15811 + }, + { + "epoch": 2.0114489250731458, + "ewc_loss": 0.04576300084590912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2881500626681373e-05, + "grad_norm": 29.654876708984375, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.8474842309951782, + "num_tokens": 603222099.0, + "step": 15812 + }, + { + "epoch": 2.0115761353517363, + "ewc_loss": 0.04567377641797066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2836888092570007e-05, + "grad_norm": 29.5543270111084, + "learning_rate": 1e-06, + "loss": 0.4818, + "mean_token_accuracy": 0.8511104583740234, + "num_tokens": 603263467.0, + "step": 15813 + }, + { + "epoch": 2.011703345630327, + "ewc_loss": 0.0457129143178463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2856456780573353e-05, + "grad_norm": 29.711030960083008, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8512901067733765, + "num_tokens": 603296726.0, + "step": 15814 + }, + { + "epoch": 2.0118305559089174, + "ewc_loss": 0.045771483331918716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.288574250997044e-05, + "grad_norm": 29.555055618286133, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.876070499420166, + "num_tokens": 603329691.0, + "step": 15815 + }, + { + "epoch": 2.011957766187508, + "ewc_loss": 0.04573413357138634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2867066945764236e-05, + "grad_norm": 29.65985870361328, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8775396347045898, + "num_tokens": 603371619.0, + "step": 15816 + }, + { + "epoch": 2.0120849764660984, + "ewc_loss": 0.045892711728811264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.29463566938648e-05, + "grad_norm": 29.7518253326416, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8729102611541748, + "num_tokens": 603400018.0, + "step": 15817 + }, + { + "epoch": 2.012212186744689, + "ewc_loss": 0.04565858095884323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2829290173831396e-05, + "grad_norm": 29.535743713378906, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8738781213760376, + "num_tokens": 603436971.0, + "step": 15818 + }, + { + "epoch": 2.0123393970232795, + "ewc_loss": 0.04574260860681534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2871305191074498e-05, + "grad_norm": 29.660167694091797, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8700474500656128, + "num_tokens": 603472802.0, + "step": 15819 + }, + { + "epoch": 2.01246660730187, + "ewc_loss": 0.04584752395749092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2923761207493953e-05, + "grad_norm": 29.720006942749023, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8750995397567749, + "num_tokens": 603506547.0, + "step": 15820 + }, + { + "epoch": 2.0125938175804605, + "ewc_loss": 0.04577483981847763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.288741961820051e-05, + "grad_norm": 29.6171875, + "learning_rate": 1e-06, + "loss": 0.4717, + "mean_token_accuracy": 
0.8529767990112305, + "num_tokens": 603544967.0, + "step": 15821 + }, + { + "epoch": 2.012721027859051, + "ewc_loss": 0.04576360806822777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2881804397911765e-05, + "grad_norm": 29.601469039916992, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8650531768798828, + "num_tokens": 603582522.0, + "step": 15822 + }, + { + "epoch": 2.0128482381376416, + "ewc_loss": 0.04581895470619202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2909476683707908e-05, + "grad_norm": 29.626338958740234, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8802050352096558, + "num_tokens": 603619584.0, + "step": 15823 + }, + { + "epoch": 2.012975448416232, + "ewc_loss": 0.04583865776658058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2919328330317512e-05, + "grad_norm": 29.577552795410156, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8587334156036377, + "num_tokens": 603660542.0, + "step": 15824 + }, + { + "epoch": 2.0131026586948226, + "ewc_loss": 0.04585028439760208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2925141820451245e-05, + "grad_norm": 29.776819229125977, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8723801374435425, + "num_tokens": 603698900.0, + "step": 15825 + }, + { + "epoch": 2.013229868973413, + "ewc_loss": 0.04590867832303047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.295433841936756e-05, + "grad_norm": 29.698177337646484, + "learning_rate": 1e-06, + "loss": 0.4779, + "mean_token_accuracy": 0.8526440858840942, + "num_tokens": 603733019.0, + "step": 15826 + }, + { + "epoch": 2.0133570792520037, + "ewc_loss": 0.045811526477336884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2905762307345867e-05, + "grad_norm": 29.66233253479004, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8786547183990479, + "num_tokens": 603772071.0, + "step": 15827 + }, + { + "epoch": 2.013484289530594, + 
"ewc_loss": 0.045824360102415085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2912179701961577e-05, + "grad_norm": 29.522216796875, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8624836802482605, + "num_tokens": 603811008.0, + "step": 15828 + }, + { + "epoch": 2.0136114998091847, + "ewc_loss": 0.045840997248888016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2920497940503992e-05, + "grad_norm": 29.788875579833984, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8601062297821045, + "num_tokens": 603852647.0, + "step": 15829 + }, + { + "epoch": 2.0137387100877753, + "ewc_loss": 0.045836083590984344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2918042304809205e-05, + "grad_norm": 29.490032196044922, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.868594765663147, + "num_tokens": 603894198.0, + "step": 15830 + }, + { + "epoch": 2.013865920366366, + "ewc_loss": 0.04580759257078171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2903795979800634e-05, + "grad_norm": 29.771059036254883, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8666460514068604, + "num_tokens": 603930168.0, + "step": 15831 + }, + { + "epoch": 2.0139931306449563, + "ewc_loss": 0.04584365338087082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292182580276858e-05, + "grad_norm": 29.54552459716797, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8795486688613892, + "num_tokens": 603966189.0, + "step": 15832 + }, + { + "epoch": 2.0141203409235464, + "ewc_loss": 0.04576065391302109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2880327378516085e-05, + "grad_norm": 29.66240692138672, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8668944835662842, + "num_tokens": 604011110.0, + "step": 15833 + }, + { + "epoch": 2.014247551202137, + "ewc_loss": 0.04588865116238594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.294432488270104e-05, + 
"grad_norm": 29.640222549438477, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8522392511367798, + "num_tokens": 604044256.0, + "step": 15834 + }, + { + "epoch": 2.0143747614807275, + "ewc_loss": 0.045754749327898026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287737515871413e-05, + "grad_norm": 29.830259323120117, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8781960010528564, + "num_tokens": 604078652.0, + "step": 15835 + }, + { + "epoch": 2.014501971759318, + "ewc_loss": 0.04587525501847267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2937627363717183e-05, + "grad_norm": 29.507112503051758, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8797498941421509, + "num_tokens": 604112302.0, + "step": 15836 + }, + { + "epoch": 2.0146291820379085, + "ewc_loss": 0.04572844132781029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2864220227347687e-05, + "grad_norm": 29.785850524902344, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.885587215423584, + "num_tokens": 604150011.0, + "step": 15837 + }, + { + "epoch": 2.014756392316499, + "ewc_loss": 0.045828599482774734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.291430064360611e-05, + "grad_norm": 29.526283264160156, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.870650053024292, + "num_tokens": 604188859.0, + "step": 15838 + }, + { + "epoch": 2.0148836025950896, + "ewc_loss": 0.045767974108457565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.288398718519602e-05, + "grad_norm": 29.76854133605957, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8645485043525696, + "num_tokens": 604232273.0, + "step": 15839 + }, + { + "epoch": 2.01501081287368, + "ewc_loss": 0.04585115611553192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2925578377908096e-05, + "grad_norm": 29.577617645263672, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 
0.8768725395202637, + "num_tokens": 604265236.0, + "step": 15840 + }, + { + "epoch": 2.0151380231522706, + "ewc_loss": 0.04573381319642067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2866906874696724e-05, + "grad_norm": 29.634309768676758, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8845343589782715, + "num_tokens": 604304784.0, + "step": 15841 + }, + { + "epoch": 2.015265233430861, + "ewc_loss": 0.04592449590563774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2962247385294177e-05, + "grad_norm": 29.64708137512207, + "learning_rate": 1e-06, + "loss": 0.3423, + "mean_token_accuracy": 0.895677924156189, + "num_tokens": 604341461.0, + "step": 15842 + }, + { + "epoch": 2.0153924437094517, + "ewc_loss": 0.04573262110352516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.286631024617236e-05, + "grad_norm": 29.647184371948242, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8758149743080139, + "num_tokens": 604378628.0, + "step": 15843 + }, + { + "epoch": 2.0155196539880422, + "ewc_loss": 0.04585167393088341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.29258366744034e-05, + "grad_norm": 29.566734313964844, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8802688717842102, + "num_tokens": 604418070.0, + "step": 15844 + }, + { + "epoch": 2.0156468642666328, + "ewc_loss": 0.0457543320953846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2877165974932723e-05, + "grad_norm": 29.65639877319336, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8829838037490845, + "num_tokens": 604454445.0, + "step": 15845 + }, + { + "epoch": 2.0157740745452233, + "ewc_loss": 0.04587467387318611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2937336325412616e-05, + "grad_norm": 29.720422744750977, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8735248446464539, + "num_tokens": 604488559.0, + "step": 15846 + }, + { + "epoch": 2.015901284823814, + "ewc_loss": 
0.04577774927020073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2888874809723347e-05, + "grad_norm": 29.643465042114258, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8631837964057922, + "num_tokens": 604522538.0, + "step": 15847 + }, + { + "epoch": 2.0160284951024043, + "ewc_loss": 0.04575413465499878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287706774950493e-05, + "grad_norm": 29.510175704956055, + "learning_rate": 1e-06, + "loss": 0.3505, + "mean_token_accuracy": 0.8893771171569824, + "num_tokens": 604561750.0, + "step": 15848 + }, + { + "epoch": 2.016155705380995, + "ewc_loss": 0.04584287852048874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292144017701503e-05, + "grad_norm": 29.726476669311523, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8841187953948975, + "num_tokens": 604594961.0, + "step": 15849 + }, + { + "epoch": 2.0162829156595854, + "ewc_loss": 0.04586614668369293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2933072614250705e-05, + "grad_norm": 29.550724029541016, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8617939949035645, + "num_tokens": 604631301.0, + "step": 15850 + }, + { + "epoch": 2.016410125938176, + "ewc_loss": 0.045802295207977295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.290114753122907e-05, + "grad_norm": 29.76557731628418, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8495818972587585, + "num_tokens": 604666049.0, + "step": 15851 + }, + { + "epoch": 2.0165373362167665, + "ewc_loss": 0.04584934189915657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2924670702195726e-05, + "grad_norm": 29.54904556274414, + "learning_rate": 1e-06, + "loss": 0.3567, + "mean_token_accuracy": 0.8916974067687988, + "num_tokens": 604701984.0, + "step": 15852 + }, + { + "epoch": 2.016664546495357, + "ewc_loss": 0.04573841392993927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2869206077302806e-05, + "grad_norm": 
29.625139236450195, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8792186379432678, + "num_tokens": 604738414.0, + "step": 15853 + }, + { + "epoch": 2.0167917567739475, + "ewc_loss": 0.0458446741104126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2922336938790977e-05, + "grad_norm": 29.616436004638672, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8731364011764526, + "num_tokens": 604779961.0, + "step": 15854 + }, + { + "epoch": 2.016918967052538, + "ewc_loss": 0.045780885964632034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2890442778589204e-05, + "grad_norm": 29.657758712768555, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8641384840011597, + "num_tokens": 604819729.0, + "step": 15855 + }, + { + "epoch": 2.0170461773311286, + "ewc_loss": 0.04585133120417595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2925665689399466e-05, + "grad_norm": 29.63085174560547, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8811891078948975, + "num_tokens": 604864108.0, + "step": 15856 + }, + { + "epoch": 2.0171733876097186, + "ewc_loss": 0.04578256234526634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.289128133270424e-05, + "grad_norm": 29.640731811523438, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8642287254333496, + "num_tokens": 604905582.0, + "step": 15857 + }, + { + "epoch": 2.017300597888309, + "ewc_loss": 0.045883338898420334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2941669158171862e-05, + "grad_norm": 29.65667152404785, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8780650496482849, + "num_tokens": 604943931.0, + "step": 15858 + }, + { + "epoch": 2.0174278081668997, + "ewc_loss": 0.045844171196222305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2922085918253288e-05, + "grad_norm": 29.633602142333984, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8667795658111572, + 
"num_tokens": 604982100.0, + "step": 15859 + }, + { + "epoch": 2.0175550184454902, + "ewc_loss": 0.04579074680805206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2895374058862217e-05, + "grad_norm": 29.493263244628906, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8601425886154175, + "num_tokens": 605026289.0, + "step": 15860 + }, + { + "epoch": 2.0176822287240808, + "ewc_loss": 0.045851923525333405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2925962184672244e-05, + "grad_norm": 29.64885902404785, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.875487208366394, + "num_tokens": 605070214.0, + "step": 15861 + }, + { + "epoch": 2.0178094390026713, + "ewc_loss": 0.04592728614807129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2963642550166696e-05, + "grad_norm": 29.70279884338379, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8662481307983398, + "num_tokens": 605113495.0, + "step": 15862 + }, + { + "epoch": 2.017936649281262, + "ewc_loss": 0.045816998928785324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2908499886398204e-05, + "grad_norm": 29.722620010375977, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8739798665046692, + "num_tokens": 605145717.0, + "step": 15863 + }, + { + "epoch": 2.0180638595598523, + "ewc_loss": 0.04577649012207985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.288824543938972e-05, + "grad_norm": 29.661108016967773, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8637229204177856, + "num_tokens": 605180930.0, + "step": 15864 + }, + { + "epoch": 2.018191069838443, + "ewc_loss": 0.04574340954422951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2871705368743278e-05, + "grad_norm": 29.563098907470703, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8770570755004883, + "num_tokens": 605214004.0, + "step": 15865 + }, + { + "epoch": 2.0183182801170334, + "ewc_loss": 
0.04584381356835365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292190765729174e-05, + "grad_norm": 29.738481521606445, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8713017702102661, + "num_tokens": 605253244.0, + "step": 15866 + }, + { + "epoch": 2.018445490395624, + "ewc_loss": 0.04578988999128342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.289494477736298e-05, + "grad_norm": 29.567232131958008, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8664743900299072, + "num_tokens": 605293832.0, + "step": 15867 + }, + { + "epoch": 2.0185727006742145, + "ewc_loss": 0.045767515897750854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2883757992531173e-05, + "grad_norm": 29.73323631286621, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8809903860092163, + "num_tokens": 605333811.0, + "step": 15868 + }, + { + "epoch": 2.018699910952805, + "ewc_loss": 0.045801322907209396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2900661861058325e-05, + "grad_norm": 29.591413497924805, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8709125518798828, + "num_tokens": 605371560.0, + "step": 15869 + }, + { + "epoch": 2.0188271212313955, + "ewc_loss": 0.045787081122398376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2893540517543443e-05, + "grad_norm": 29.828542709350586, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8673706650733948, + "num_tokens": 605410555.0, + "step": 15870 + }, + { + "epoch": 2.018954331509986, + "ewc_loss": 0.045880384743213654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2940192138776183e-05, + "grad_norm": 29.615398406982422, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8657662868499756, + "num_tokens": 605451851.0, + "step": 15871 + }, + { + "epoch": 2.0190815417885766, + "ewc_loss": 0.045743245631456375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287162351422012e-05, + "grad_norm": 
29.706832885742188, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8622151613235474, + "num_tokens": 605490442.0, + "step": 15872 + }, + { + "epoch": 2.019208752067167, + "ewc_loss": 0.04587460681796074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2937303583603352e-05, + "grad_norm": 29.726720809936523, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8690428733825684, + "num_tokens": 605525237.0, + "step": 15873 + }, + { + "epoch": 2.0193359623457576, + "ewc_loss": 0.04577203840017319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.288601899635978e-05, + "grad_norm": 29.731590270996094, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8757748603820801, + "num_tokens": 605563624.0, + "step": 15874 + }, + { + "epoch": 2.019463172624348, + "ewc_loss": 0.045762039721012115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2881020413478836e-05, + "grad_norm": 29.610820770263672, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8797403573989868, + "num_tokens": 605597419.0, + "step": 15875 + }, + { + "epoch": 2.0195903829029387, + "ewc_loss": 0.045833736658096313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2916869056643918e-05, + "grad_norm": 29.807092666625977, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8636429309844971, + "num_tokens": 605639954.0, + "step": 15876 + }, + { + "epoch": 2.019717593181529, + "ewc_loss": 0.04584125429391861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2920627088751644e-05, + "grad_norm": 29.697891235351562, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8712315559387207, + "num_tokens": 605678226.0, + "step": 15877 + }, + { + "epoch": 2.0198448034601197, + "ewc_loss": 0.045726802200078964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.286340168211609e-05, + "grad_norm": 29.741294860839844, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8787935376167297, + 
"num_tokens": 605719925.0, + "step": 15878 + }, + { + "epoch": 2.0199720137387103, + "ewc_loss": 0.04583713784813881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2918569811736234e-05, + "grad_norm": 29.724157333374023, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8524394631385803, + "num_tokens": 605761282.0, + "step": 15879 + }, + { + "epoch": 2.020099224017301, + "ewc_loss": 0.04571278765797615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.285639311594423e-05, + "grad_norm": 29.646615982055664, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8804630637168884, + "num_tokens": 605800681.0, + "step": 15880 + }, + { + "epoch": 2.0202264342958913, + "ewc_loss": 0.045754022896289825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287701136083342e-05, + "grad_norm": 29.66843605041504, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8536521792411804, + "num_tokens": 605836282.0, + "step": 15881 + }, + { + "epoch": 2.0203536445744814, + "ewc_loss": 0.045740220695734024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287011011503637e-05, + "grad_norm": 29.691984176635742, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8730487823486328, + "num_tokens": 605880360.0, + "step": 15882 + }, + { + "epoch": 2.020480854853072, + "ewc_loss": 0.04569745436310768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2848727894597687e-05, + "grad_norm": 29.546348571777344, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8811542987823486, + "num_tokens": 605918363.0, + "step": 15883 + }, + { + "epoch": 2.0206080651316625, + "ewc_loss": 0.04578292369842529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.289146141265519e-05, + "grad_norm": 29.722896575927734, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8840298652648926, + "num_tokens": 605953032.0, + "step": 15884 + }, + { + "epoch": 2.020735275410253, + "ewc_loss": 
0.045729104429483414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2864553102408536e-05, + "grad_norm": 29.6428279876709, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.879702627658844, + "num_tokens": 605992633.0, + "step": 15885 + }, + { + "epoch": 2.0208624856888435, + "ewc_loss": 0.04579073190689087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2895366782904603e-05, + "grad_norm": 29.776033401489258, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8730360865592957, + "num_tokens": 606031776.0, + "step": 15886 + }, + { + "epoch": 2.020989695967434, + "ewc_loss": 0.04577410966157913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2887054001330398e-05, + "grad_norm": 29.721920013427734, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8648331165313721, + "num_tokens": 606070614.0, + "step": 15887 + }, + { + "epoch": 2.0211169062460246, + "ewc_loss": 0.04574357345700264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2871787223266438e-05, + "grad_norm": 29.763813018798828, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8700683116912842, + "num_tokens": 606115898.0, + "step": 15888 + }, + { + "epoch": 2.021244116524615, + "ewc_loss": 0.0457320362329483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.286601738887839e-05, + "grad_norm": 29.71270179748535, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8752866387367249, + "num_tokens": 606150231.0, + "step": 15889 + }, + { + "epoch": 2.0213713268032056, + "ewc_loss": 0.04558321088552475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2791606170358136e-05, + "grad_norm": 29.724491119384766, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8628348112106323, + "num_tokens": 606191527.0, + "step": 15890 + }, + { + "epoch": 2.021498537081796, + "ewc_loss": 0.04572681710124016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2863408958073705e-05, + "grad_norm": 
29.870010375976562, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8629165291786194, + "num_tokens": 606229070.0, + "step": 15891 + }, + { + "epoch": 2.0216257473603867, + "ewc_loss": 0.04566671699285507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2833359253127128e-05, + "grad_norm": 29.722129821777344, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8720012307167053, + "num_tokens": 606269555.0, + "step": 15892 + }, + { + "epoch": 2.021752957638977, + "ewc_loss": 0.045626200735569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2813101168139838e-05, + "grad_norm": 29.72868537902832, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8667487502098083, + "num_tokens": 606302040.0, + "step": 15893 + }, + { + "epoch": 2.0218801679175677, + "ewc_loss": 0.04561347886919975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2806740162195638e-05, + "grad_norm": 29.81337547302246, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8807497620582581, + "num_tokens": 606341769.0, + "step": 15894 + }, + { + "epoch": 2.0220073781961583, + "ewc_loss": 0.04559657350182533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.279828731843736e-05, + "grad_norm": 29.70697593688965, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8581395745277405, + "num_tokens": 606379989.0, + "step": 15895 + }, + { + "epoch": 2.022134588474749, + "ewc_loss": 0.04563618451356888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2818092475063168e-05, + "grad_norm": 29.68930435180664, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8652383089065552, + "num_tokens": 606414396.0, + "step": 15896 + }, + { + "epoch": 2.0222617987533393, + "ewc_loss": 0.04562824219465256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2814121621195227e-05, + "grad_norm": 29.579078674316406, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8741254210472107, + 
"num_tokens": 606445949.0, + "step": 15897 + }, + { + "epoch": 2.02238900903193, + "ewc_loss": 0.045686956495046616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2843478291179053e-05, + "grad_norm": 29.783340454101562, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8738943338394165, + "num_tokens": 606484751.0, + "step": 15898 + }, + { + "epoch": 2.0225162193105204, + "ewc_loss": 0.0456903912127018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2845195417176e-05, + "grad_norm": 29.650768280029297, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8739809989929199, + "num_tokens": 606525390.0, + "step": 15899 + }, + { + "epoch": 2.022643429589111, + "ewc_loss": 0.04565804451704025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2829022782389075e-05, + "grad_norm": 29.587038040161133, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8671486377716064, + "num_tokens": 606563364.0, + "step": 15900 + }, + { + "epoch": 2.0227706398677014, + "ewc_loss": 0.04573160782456398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2865804567118175e-05, + "grad_norm": 29.632579803466797, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.873456597328186, + "num_tokens": 606598140.0, + "step": 15901 + }, + { + "epoch": 2.022897850146292, + "ewc_loss": 0.04573359340429306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2866795916343108e-05, + "grad_norm": 29.656871795654297, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8639698028564453, + "num_tokens": 606636480.0, + "step": 15902 + }, + { + "epoch": 2.0230250604248825, + "ewc_loss": 0.04582653194665909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2913265638635494e-05, + "grad_norm": 29.725618362426758, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8752608299255371, + "num_tokens": 606676044.0, + "step": 15903 + }, + { + "epoch": 2.023152270703473, + "ewc_loss": 0.045764144510030746, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2882071789354086e-05, + "grad_norm": 29.694454193115234, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8632347583770752, + "num_tokens": 606713262.0, + "step": 15904 + }, + { + "epoch": 2.0232794809820636, + "ewc_loss": 0.045747291296720505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287364623043686e-05, + "grad_norm": 29.664405822753906, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8666810989379883, + "num_tokens": 606754844.0, + "step": 15905 + }, + { + "epoch": 2.0234066912606536, + "ewc_loss": 0.04580943286418915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2904716388438828e-05, + "grad_norm": 29.71200942993164, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8714199662208557, + "num_tokens": 606800910.0, + "step": 15906 + }, + { + "epoch": 2.023533901539244, + "ewc_loss": 0.04579778388142586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2898891984368674e-05, + "grad_norm": 29.655122756958008, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8663361668586731, + "num_tokens": 606841855.0, + "step": 15907 + }, + { + "epoch": 2.0236611118178347, + "ewc_loss": 0.04576042294502258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.288021096319426e-05, + "grad_norm": 29.651262283325195, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8602620959281921, + "num_tokens": 606888261.0, + "step": 15908 + }, + { + "epoch": 2.0237883220964252, + "ewc_loss": 0.04579080268740654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.289540134370327e-05, + "grad_norm": 29.62078285217285, + "learning_rate": 1e-06, + "loss": 0.4499, + "mean_token_accuracy": 0.862275242805481, + "num_tokens": 606926235.0, + "step": 15909 + }, + { + "epoch": 2.0239155323750158, + "ewc_loss": 0.04574352875351906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287176357640419e-05, + "grad_norm": 29.598966598510742, + 
"learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8662606477737427, + "num_tokens": 606962527.0, + "step": 15910 + }, + { + "epoch": 2.0240427426536063, + "ewc_loss": 0.04584883153438568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292441604367923e-05, + "grad_norm": 29.70527458190918, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8752620220184326, + "num_tokens": 607008924.0, + "step": 15911 + }, + { + "epoch": 2.024169952932197, + "ewc_loss": 0.04578324779868126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2891623302712105e-05, + "grad_norm": 29.73338508605957, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8672826290130615, + "num_tokens": 607049788.0, + "step": 15912 + }, + { + "epoch": 2.0242971632107873, + "ewc_loss": 0.04579659178853035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.289829535584431e-05, + "grad_norm": 29.719335556030273, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8650847673416138, + "num_tokens": 607087592.0, + "step": 15913 + }, + { + "epoch": 2.024424373489378, + "ewc_loss": 0.045779600739479065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2889800675329752e-05, + "grad_norm": 29.69353485107422, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8860251903533936, + "num_tokens": 607119954.0, + "step": 15914 + }, + { + "epoch": 2.0245515837679684, + "ewc_loss": 0.04577622190117836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2888110834173858e-05, + "grad_norm": 29.69382095336914, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8603910207748413, + "num_tokens": 607156511.0, + "step": 15915 + }, + { + "epoch": 2.024678794046559, + "ewc_loss": 0.045730724930763245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2865362552693114e-05, + "grad_norm": 29.619558334350586, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8797347545623779, + "num_tokens": 607198292.0, 
+ "step": 15916 + }, + { + "epoch": 2.0248060043251495, + "ewc_loss": 0.045751649886369705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287582537974231e-05, + "grad_norm": 29.582029342651367, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8684737682342529, + "num_tokens": 607237572.0, + "step": 15917 + }, + { + "epoch": 2.02493321460374, + "ewc_loss": 0.045717306435108185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2858652300783433e-05, + "grad_norm": 29.683246612548828, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8608794212341309, + "num_tokens": 607282321.0, + "step": 15918 + }, + { + "epoch": 2.0250604248823305, + "ewc_loss": 0.04580694064497948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.29034703806974e-05, + "grad_norm": 29.640167236328125, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8683810234069824, + "num_tokens": 607313355.0, + "step": 15919 + }, + { + "epoch": 2.025187635160921, + "ewc_loss": 0.04574555158615112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2872774934512563e-05, + "grad_norm": 29.756345748901367, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8674309849739075, + "num_tokens": 607351961.0, + "step": 15920 + }, + { + "epoch": 2.0253148454395116, + "ewc_loss": 0.04577121138572693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2885606085765176e-05, + "grad_norm": 29.656360626220703, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.872434675693512, + "num_tokens": 607391368.0, + "step": 15921 + }, + { + "epoch": 2.025442055718102, + "ewc_loss": 0.045688435435295105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2844216800876893e-05, + "grad_norm": 29.57749366760254, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8897527456283569, + "num_tokens": 607427328.0, + "step": 15922 + }, + { + "epoch": 2.0255692659966926, + "ewc_loss": 0.04577285796403885, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 2.2886428268975578e-05, + "grad_norm": 29.61044692993164, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8664873242378235, + "num_tokens": 607468243.0, + "step": 15923 + }, + { + "epoch": 2.025696476275283, + "ewc_loss": 0.04589793458580971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.294896694365889e-05, + "grad_norm": 29.761459350585938, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8547555208206177, + "num_tokens": 607502466.0, + "step": 15924 + }, + { + "epoch": 2.0258236865538737, + "ewc_loss": 0.04582234472036362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2911171981832013e-05, + "grad_norm": 29.595829010009766, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8617369532585144, + "num_tokens": 607544093.0, + "step": 15925 + }, + { + "epoch": 2.025950896832464, + "ewc_loss": 0.04580756276845932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2903781427885406e-05, + "grad_norm": 29.68546485900879, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8771237134933472, + "num_tokens": 607576054.0, + "step": 15926 + }, + { + "epoch": 2.0260781071110547, + "ewc_loss": 0.04587149620056152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.293574834766332e-05, + "grad_norm": 29.7816162109375, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8834388256072998, + "num_tokens": 607608462.0, + "step": 15927 + }, + { + "epoch": 2.0262053173896453, + "ewc_loss": 0.04586062207818031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2930311388336122e-05, + "grad_norm": 29.681507110595703, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8644278049468994, + "num_tokens": 607645272.0, + "step": 15928 + }, + { + "epoch": 2.026332527668236, + "ewc_loss": 0.04577040672302246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2885204089106992e-05, + "grad_norm": 29.732202529907227, + "learning_rate": 1e-06, + "loss": 
0.4164, + "mean_token_accuracy": 0.8723762035369873, + "num_tokens": 607682585.0, + "step": 15929 + }, + { + "epoch": 2.0264597379468263, + "ewc_loss": 0.04587876796722412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.293938450748101e-05, + "grad_norm": 29.6361083984375, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8880083560943604, + "num_tokens": 607721302.0, + "step": 15930 + }, + { + "epoch": 2.0265869482254164, + "ewc_loss": 0.045746125280857086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2873062334838323e-05, + "grad_norm": 29.669937133789062, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.866067111492157, + "num_tokens": 607758357.0, + "step": 15931 + }, + { + "epoch": 2.026714158504007, + "ewc_loss": 0.0457531176507473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287655843247194e-05, + "grad_norm": 29.405502319335938, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8703518509864807, + "num_tokens": 607795467.0, + "step": 15932 + }, + { + "epoch": 2.0268413687825975, + "ewc_loss": 0.04583188518881798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2915943191037513e-05, + "grad_norm": 29.676786422729492, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8650604486465454, + "num_tokens": 607833192.0, + "step": 15933 + }, + { + "epoch": 2.026968579061188, + "ewc_loss": 0.046069491654634476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30347450269619e-05, + "grad_norm": 29.712202072143555, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8790193796157837, + "num_tokens": 607866672.0, + "step": 15934 + }, + { + "epoch": 2.0270957893397785, + "ewc_loss": 0.0458967387676239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2948370315134525e-05, + "grad_norm": 29.59855079650879, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8726131916046143, + "num_tokens": 607907976.0, + "step": 15935 + }, + { + "epoch": 
2.027222999618369, + "ewc_loss": 0.045898500829935074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.294925070600584e-05, + "grad_norm": 29.525211334228516, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8745335340499878, + "num_tokens": 607946009.0, + "step": 15936 + }, + { + "epoch": 2.0273502098969596, + "ewc_loss": 0.0459006205201149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.295031117682811e-05, + "grad_norm": 29.69833755493164, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8756587505340576, + "num_tokens": 607986800.0, + "step": 15937 + }, + { + "epoch": 2.02747742017555, + "ewc_loss": 0.046011168509721756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3005584807833657e-05, + "grad_norm": 29.603734970092773, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8645734190940857, + "num_tokens": 608021807.0, + "step": 15938 + }, + { + "epoch": 2.0276046304541406, + "ewc_loss": 0.04597450792789459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2987254851614125e-05, + "grad_norm": 29.7867488861084, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8695790767669678, + "num_tokens": 608060294.0, + "step": 15939 + }, + { + "epoch": 2.027731840732731, + "ewc_loss": 0.04594315215945244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.297157516295556e-05, + "grad_norm": 29.542287826538086, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8702318668365479, + "num_tokens": 608100790.0, + "step": 15940 + }, + { + "epoch": 2.0278590510113217, + "ewc_loss": 0.04585656896233559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2928285034140572e-05, + "grad_norm": 29.708040237426758, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.86616051197052, + "num_tokens": 608142164.0, + "step": 15941 + }, + { + "epoch": 2.027986261289912, + "ewc_loss": 0.04596000909805298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.298000435985159e-05, + "grad_norm": 29.602108001708984, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8715049028396606, + "num_tokens": 608184175.0, + "step": 15942 + }, + { + "epoch": 2.0281134715685027, + "ewc_loss": 0.045869503170251846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2934751541470177e-05, + "grad_norm": 29.721328735351562, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8645513653755188, + "num_tokens": 608226801.0, + "step": 15943 + }, + { + "epoch": 2.0282406818470933, + "ewc_loss": 0.04599675536155701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2998377971816808e-05, + "grad_norm": 29.631479263305664, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8791787028312683, + "num_tokens": 608265155.0, + "step": 15944 + }, + { + "epoch": 2.028367892125684, + "ewc_loss": 0.045882388949394226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2941194401937537e-05, + "grad_norm": 29.74033546447754, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8809974789619446, + "num_tokens": 608303817.0, + "step": 15945 + }, + { + "epoch": 2.0284951024042743, + "ewc_loss": 0.04598281532526016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.299140760442242e-05, + "grad_norm": 29.678686141967773, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8778904676437378, + "num_tokens": 608338845.0, + "step": 15946 + }, + { + "epoch": 2.028622312682865, + "ewc_loss": 0.045836981385946274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.291849159519188e-05, + "grad_norm": 29.742849349975586, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8706724643707275, + "num_tokens": 608377661.0, + "step": 15947 + }, + { + "epoch": 2.0287495229614554, + "ewc_loss": 0.045854728668928146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2927364625502378e-05, + "grad_norm": 29.68632698059082, + "learning_rate": 1e-06, + "loss": 0.4231, + 
"mean_token_accuracy": 0.8697768449783325, + "num_tokens": 608412897.0, + "step": 15948 + }, + { + "epoch": 2.028876733240046, + "ewc_loss": 0.045851897448301315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292594945174642e-05, + "grad_norm": 29.801347732543945, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8625622987747192, + "num_tokens": 608453584.0, + "step": 15949 + }, + { + "epoch": 2.0290039435186364, + "ewc_loss": 0.04585414007306099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2927069949219003e-05, + "grad_norm": 29.80849266052246, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8662971258163452, + "num_tokens": 608492211.0, + "step": 15950 + }, + { + "epoch": 2.029131153797227, + "ewc_loss": 0.04578755423426628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2893776986165904e-05, + "grad_norm": 29.77774429321289, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8665670156478882, + "num_tokens": 608534317.0, + "step": 15951 + }, + { + "epoch": 2.0292583640758175, + "ewc_loss": 0.04582076147198677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.291038072144147e-05, + "grad_norm": 29.634014129638672, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8804607391357422, + "num_tokens": 608569763.0, + "step": 15952 + }, + { + "epoch": 2.029385574354408, + "ewc_loss": 0.04574315994977951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2871579858474433e-05, + "grad_norm": 29.698076248168945, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8731987476348877, + "num_tokens": 608612145.0, + "step": 15953 + }, + { + "epoch": 2.0295127846329986, + "ewc_loss": 0.04583869129419327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2919346520211548e-05, + "grad_norm": 29.679594039916992, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8692160844802856, + "num_tokens": 608649528.0, + "step": 15954 + }, + { + "epoch": 
2.0296399949115886, + "ewc_loss": 0.045868758112192154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.293437864864245e-05, + "grad_norm": 29.761138916015625, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8647431135177612, + "num_tokens": 608686973.0, + "step": 15955 + }, + { + "epoch": 2.029767205190179, + "ewc_loss": 0.04577551782131195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2887759769218974e-05, + "grad_norm": 29.71478843688965, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8727335929870605, + "num_tokens": 608729169.0, + "step": 15956 + }, + { + "epoch": 2.0298944154687697, + "ewc_loss": 0.04582387953996658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.291193959536031e-05, + "grad_norm": 29.650949478149414, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.869581937789917, + "num_tokens": 608764502.0, + "step": 15957 + }, + { + "epoch": 2.0300216257473602, + "ewc_loss": 0.045761965215206146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2880982214701362e-05, + "grad_norm": 29.7562198638916, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8705388307571411, + "num_tokens": 608805190.0, + "step": 15958 + }, + { + "epoch": 2.0301488360259508, + "ewc_loss": 0.04588078707456589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2940394046599977e-05, + "grad_norm": 29.688180923461914, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8776295185089111, + "num_tokens": 608840006.0, + "step": 15959 + }, + { + "epoch": 2.0302760463045413, + "ewc_loss": 0.045786309987306595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.289315489178989e-05, + "grad_norm": 29.742055892944336, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8775250911712646, + "num_tokens": 608871551.0, + "step": 15960 + }, + { + "epoch": 2.030403256583132, + "ewc_loss": 0.045863520354032516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2931759303901345e-05, + "grad_norm": 29.74136734008789, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8496763110160828, + "num_tokens": 608909610.0, + "step": 15961 + }, + { + "epoch": 2.0305304668617223, + "ewc_loss": 0.04581892490386963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.290946213179268e-05, + "grad_norm": 29.718345642089844, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8675996661186218, + "num_tokens": 608950758.0, + "step": 15962 + }, + { + "epoch": 2.030657677140313, + "ewc_loss": 0.04585942253470421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2929711121832952e-05, + "grad_norm": 29.65694808959961, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8577780723571777, + "num_tokens": 608990695.0, + "step": 15963 + }, + { + "epoch": 2.0307848874189034, + "ewc_loss": 0.04587749391794205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2938746042200364e-05, + "grad_norm": 29.655029296875, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8843655586242676, + "num_tokens": 609030978.0, + "step": 15964 + }, + { + "epoch": 2.030912097697494, + "ewc_loss": 0.0458730086684227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2936505047255196e-05, + "grad_norm": 29.73479461669922, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8812940120697021, + "num_tokens": 609071974.0, + "step": 15965 + }, + { + "epoch": 2.0310393079760845, + "ewc_loss": 0.04587649926543236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2938249458093196e-05, + "grad_norm": 29.62006378173828, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.873274028301239, + "num_tokens": 609111268.0, + "step": 15966 + }, + { + "epoch": 2.031166518254675, + "ewc_loss": 0.0459001362323761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2950067432248034e-05, + "grad_norm": 29.766237258911133, + "learning_rate": 1e-06, + "loss": 0.4293, + 
"mean_token_accuracy": 0.8667765259742737, + "num_tokens": 609149681.0, + "step": 15967 + }, + { + "epoch": 2.0312937285332655, + "ewc_loss": 0.04595606401562691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2978032575338148e-05, + "grad_norm": 29.617311477661133, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8756022453308105, + "num_tokens": 609189121.0, + "step": 15968 + }, + { + "epoch": 2.031420938811856, + "ewc_loss": 0.045922115445137024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2961057766224258e-05, + "grad_norm": 29.731842041015625, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.880382239818573, + "num_tokens": 609228070.0, + "step": 15969 + }, + { + "epoch": 2.0315481490904466, + "ewc_loss": 0.04594049230217934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2970245481701568e-05, + "grad_norm": 29.600706100463867, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8726072907447815, + "num_tokens": 609269718.0, + "step": 15970 + }, + { + "epoch": 2.031675359369037, + "ewc_loss": 0.0458063967525959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2903197532286867e-05, + "grad_norm": 29.510345458984375, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8737106919288635, + "num_tokens": 609307236.0, + "step": 15971 + }, + { + "epoch": 2.0318025696476276, + "ewc_loss": 0.04602085426449776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3010426957625896e-05, + "grad_norm": 29.76746368408203, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8535705804824829, + "num_tokens": 609342993.0, + "step": 15972 + }, + { + "epoch": 2.031929779926218, + "ewc_loss": 0.04593981057405472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2969905330683105e-05, + "grad_norm": 29.657726287841797, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8771699666976929, + "num_tokens": 609376817.0, + "step": 15973 + }, + { + "epoch": 
2.0320569902048087, + "ewc_loss": 0.04588117450475693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2940586859476753e-05, + "grad_norm": 29.67461585998535, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8690166473388672, + "num_tokens": 609411209.0, + "step": 15974 + }, + { + "epoch": 2.032184200483399, + "ewc_loss": 0.04592664912343025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2963324227021076e-05, + "grad_norm": 29.589048385620117, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8880544304847717, + "num_tokens": 609449055.0, + "step": 15975 + }, + { + "epoch": 2.0323114107619897, + "ewc_loss": 0.045843061059713364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2921531126485206e-05, + "grad_norm": 29.673425674438477, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8716317415237427, + "num_tokens": 609486332.0, + "step": 15976 + }, + { + "epoch": 2.0324386210405803, + "ewc_loss": 0.045919373631477356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2959686248213984e-05, + "grad_norm": 29.69748306274414, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8575305938720703, + "num_tokens": 609526924.0, + "step": 15977 + }, + { + "epoch": 2.032565831319171, + "ewc_loss": 0.04589265212416649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.294632577104494e-05, + "grad_norm": 29.704662322998047, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8845550417900085, + "num_tokens": 609559542.0, + "step": 15978 + }, + { + "epoch": 2.032693041597761, + "ewc_loss": 0.04585415869951248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292707904416602e-05, + "grad_norm": 29.697114944458008, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8634835481643677, + "num_tokens": 609598989.0, + "step": 15979 + }, + { + "epoch": 2.0328202518763514, + "ewc_loss": 0.04589584097266197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.294792102475185e-05, + "grad_norm": 29.781564712524414, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8583910465240479, + "num_tokens": 609635790.0, + "step": 15980 + }, + { + "epoch": 2.032947462154942, + "ewc_loss": 0.045884232968091965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2942116629565135e-05, + "grad_norm": 29.607311248779297, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8631013631820679, + "num_tokens": 609670120.0, + "step": 15981 + }, + { + "epoch": 2.0330746724335325, + "ewc_loss": 0.04585329443216324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292664794367738e-05, + "grad_norm": 29.718843460083008, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8706406950950623, + "num_tokens": 609713260.0, + "step": 15982 + }, + { + "epoch": 2.033201882712123, + "ewc_loss": 0.045914728194475174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2957363398745656e-05, + "grad_norm": 29.78887367248535, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8773336410522461, + "num_tokens": 609753236.0, + "step": 15983 + }, + { + "epoch": 2.0333290929907135, + "ewc_loss": 0.04584579914808273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2922899006516673e-05, + "grad_norm": 29.622522354125977, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.873980700969696, + "num_tokens": 609791629.0, + "step": 15984 + }, + { + "epoch": 2.033456303269304, + "ewc_loss": 0.045846257358789444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.292312819918152e-05, + "grad_norm": 29.562063217163086, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8552113771438599, + "num_tokens": 609833457.0, + "step": 15985 + }, + { + "epoch": 2.0335835135478946, + "ewc_loss": 0.045835915952920914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2917958631296642e-05, + "grad_norm": 29.640092849731445, + "learning_rate": 1e-06, + "loss": 0.4386, + 
"mean_token_accuracy": 0.8631551265716553, + "num_tokens": 609866007.0, + "step": 15986 + }, + { + "epoch": 2.033710723826485, + "ewc_loss": 0.04602494835853577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3012473320704885e-05, + "grad_norm": 29.651805877685547, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8753055334091187, + "num_tokens": 609903346.0, + "step": 15987 + }, + { + "epoch": 2.0338379341050756, + "ewc_loss": 0.045963212847709656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2981606889516115e-05, + "grad_norm": 29.6671142578125, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8754161596298218, + "num_tokens": 609935885.0, + "step": 15988 + }, + { + "epoch": 2.033965144383666, + "ewc_loss": 0.04600943252444267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3004717149888165e-05, + "grad_norm": 29.648439407348633, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8643370270729065, + "num_tokens": 609971127.0, + "step": 15989 + }, + { + "epoch": 2.0340923546622567, + "ewc_loss": 0.04602079465985298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.301039785379544e-05, + "grad_norm": 29.697341918945312, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8619238138198853, + "num_tokens": 610016719.0, + "step": 15990 + }, + { + "epoch": 2.034219564940847, + "ewc_loss": 0.046029046177864075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.301452332176268e-05, + "grad_norm": 29.77052116394043, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.860560417175293, + "num_tokens": 610057002.0, + "step": 15991 + }, + { + "epoch": 2.0343467752194377, + "ewc_loss": 0.045936841517686844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.296842103532981e-05, + "grad_norm": 29.770145416259766, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8759323358535767, + "num_tokens": 610092779.0, + "step": 15992 + }, + { + "epoch": 
2.0344739854980283, + "ewc_loss": 0.04593345895409584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2966729375184514e-05, + "grad_norm": 29.74840545654297, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8617266416549683, + "num_tokens": 610137092.0, + "step": 15993 + }, + { + "epoch": 2.034601195776619, + "ewc_loss": 0.04594002664089203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2970012651057914e-05, + "grad_norm": 29.74022102355957, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8801523447036743, + "num_tokens": 610178632.0, + "step": 15994 + }, + { + "epoch": 2.0347284060552093, + "ewc_loss": 0.04589902609586716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2949512640479952e-05, + "grad_norm": 29.66166114807129, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8695580959320068, + "num_tokens": 610219789.0, + "step": 15995 + }, + { + "epoch": 2.0348556163338, + "ewc_loss": 0.04593914374709129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2969572455622256e-05, + "grad_norm": 29.728010177612305, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8562330007553101, + "num_tokens": 610259104.0, + "step": 15996 + }, + { + "epoch": 2.0349828266123904, + "ewc_loss": 0.04596946761012077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.298473373230081e-05, + "grad_norm": 29.771150588989258, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8650121688842773, + "num_tokens": 610296317.0, + "step": 15997 + }, + { + "epoch": 2.035110036890981, + "ewc_loss": 0.04597622901201248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2988115233602002e-05, + "grad_norm": 29.728666305541992, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8685191869735718, + "num_tokens": 610339300.0, + "step": 15998 + }, + { + "epoch": 2.0352372471695714, + "ewc_loss": 0.0459313727915287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2965687094256282e-05, + "grad_norm": 29.813074111938477, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8660950064659119, + "num_tokens": 610373917.0, + "step": 15999 + }, + { + "epoch": 2.035364457448162, + "ewc_loss": 0.045944828540086746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2972413717070594e-05, + "grad_norm": 29.836252212524414, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8695276975631714, + "num_tokens": 610419571.0, + "step": 16000 + }, + { + "epoch": 2.0354916677267525, + "ewc_loss": 0.04582200571894646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2911002815817483e-05, + "grad_norm": 29.622610092163086, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8626813888549805, + "num_tokens": 610458073.0, + "step": 16001 + }, + { + "epoch": 2.035618878005343, + "ewc_loss": 0.04582388326525688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2911941414349712e-05, + "grad_norm": 29.786300659179688, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8592717051506042, + "num_tokens": 610497223.0, + "step": 16002 + }, + { + "epoch": 2.0357460882839336, + "ewc_loss": 0.04585966095328331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2929831175133586e-05, + "grad_norm": 29.716794967651367, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8503888845443726, + "num_tokens": 610532563.0, + "step": 16003 + }, + { + "epoch": 2.0358732985625236, + "ewc_loss": 0.045838240534067154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2919120965525508e-05, + "grad_norm": 29.676250457763672, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.881748378276825, + "num_tokens": 610566644.0, + "step": 16004 + }, + { + "epoch": 2.036000508841114, + "ewc_loss": 0.04590921103954315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.295460581080988e-05, + "grad_norm": 29.747182846069336, + "learning_rate": 1e-06, + "loss": 0.4561, + 
"mean_token_accuracy": 0.8580716252326965, + "num_tokens": 610604755.0, + "step": 16005 + }, + { + "epoch": 2.0361277191197047, + "ewc_loss": 0.04588581994175911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.294290970894508e-05, + "grad_norm": 29.614397048950195, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8748703002929688, + "num_tokens": 610645905.0, + "step": 16006 + }, + { + "epoch": 2.036254929398295, + "ewc_loss": 0.04581354185938835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.290677002747543e-05, + "grad_norm": 29.601877212524414, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8871360421180725, + "num_tokens": 610686800.0, + "step": 16007 + }, + { + "epoch": 2.0363821396768857, + "ewc_loss": 0.04591701552271843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2958507543080486e-05, + "grad_norm": 29.80023193359375, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8704503774642944, + "num_tokens": 610728291.0, + "step": 16008 + }, + { + "epoch": 2.0365093499554763, + "ewc_loss": 0.04583406075835228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2917030946700834e-05, + "grad_norm": 29.619380950927734, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8686882257461548, + "num_tokens": 610759818.0, + "step": 16009 + }, + { + "epoch": 2.036636560234067, + "ewc_loss": 0.04593369737267494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2966849428485148e-05, + "grad_norm": 29.69032096862793, + "learning_rate": 1e-06, + "loss": 0.465, + "mean_token_accuracy": 0.8561029434204102, + "num_tokens": 610800125.0, + "step": 16010 + }, + { + "epoch": 2.0367637705126573, + "ewc_loss": 0.045936327427625656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.296816455782391e-05, + "grad_norm": 29.734737396240234, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8618602156639099, + "num_tokens": 610839261.0, + "step": 16011 + }, + { + "epoch": 
2.036890980791248, + "ewc_loss": 0.045873939990997314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2936970708542503e-05, + "grad_norm": 29.73183250427246, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8745987415313721, + "num_tokens": 610879092.0, + "step": 16012 + }, + { + "epoch": 2.0370181910698384, + "ewc_loss": 0.045866500586271286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2933250875212252e-05, + "grad_norm": 29.695514678955078, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8696645498275757, + "num_tokens": 610914494.0, + "step": 16013 + }, + { + "epoch": 2.037145401348429, + "ewc_loss": 0.04595879465341568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2979396817390807e-05, + "grad_norm": 29.79337501525879, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8810107111930847, + "num_tokens": 610950784.0, + "step": 16014 + }, + { + "epoch": 2.0372726116270194, + "ewc_loss": 0.045846473425626755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2923237338545732e-05, + "grad_norm": 29.650941848754883, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8591582775115967, + "num_tokens": 610989254.0, + "step": 16015 + }, + { + "epoch": 2.03739982190561, + "ewc_loss": 0.045851659029722214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2925829398445785e-05, + "grad_norm": 29.71377944946289, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8629984855651855, + "num_tokens": 611022668.0, + "step": 16016 + }, + { + "epoch": 2.0375270321842005, + "ewc_loss": 0.04591164365410805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2955822714720853e-05, + "grad_norm": 29.60839080810547, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8695443868637085, + "num_tokens": 611062110.0, + "step": 16017 + }, + { + "epoch": 2.037654242462791, + "ewc_loss": 0.04593192785978317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.296596358064562e-05, + "grad_norm": 29.768278121948242, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8749491572380066, + "num_tokens": 611098663.0, + "step": 16018 + }, + { + "epoch": 2.0377814527413816, + "ewc_loss": 0.045961081981658936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2980540961725637e-05, + "grad_norm": 29.516420364379883, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8667759895324707, + "num_tokens": 611134058.0, + "step": 16019 + }, + { + "epoch": 2.037908663019972, + "ewc_loss": 0.04594714194536209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.297357059433125e-05, + "grad_norm": 29.823318481445312, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.87061607837677, + "num_tokens": 611174748.0, + "step": 16020 + }, + { + "epoch": 2.0380358732985626, + "ewc_loss": 0.046038296073675156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3019147192826495e-05, + "grad_norm": 29.61678123474121, + "learning_rate": 1e-06, + "loss": 0.3723, + "mean_token_accuracy": 0.8858006000518799, + "num_tokens": 611216792.0, + "step": 16021 + }, + { + "epoch": 2.038163083577153, + "ewc_loss": 0.04585236310958862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2926182282390073e-05, + "grad_norm": 29.742765426635742, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8794653415679932, + "num_tokens": 611253996.0, + "step": 16022 + }, + { + "epoch": 2.0382902938557437, + "ewc_loss": 0.04608652740716934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3043263354338706e-05, + "grad_norm": 29.740774154663086, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8808748722076416, + "num_tokens": 611295351.0, + "step": 16023 + }, + { + "epoch": 2.038417504134334, + "ewc_loss": 0.045900601893663406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2950300262891687e-05, + "grad_norm": 29.798290252685547, + "learning_rate": 1e-06, + "loss": 0.4126, + 
"mean_token_accuracy": 0.8705204129219055, + "num_tokens": 611330513.0, + "step": 16024 + }, + { + "epoch": 2.0385447144129247, + "ewc_loss": 0.046008750796318054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30043751798803e-05, + "grad_norm": 29.81108283996582, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8782333731651306, + "num_tokens": 611369679.0, + "step": 16025 + }, + { + "epoch": 2.0386719246915153, + "ewc_loss": 0.045930348336696625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2965174139244482e-05, + "grad_norm": 29.768789291381836, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8783140778541565, + "num_tokens": 611410398.0, + "step": 16026 + }, + { + "epoch": 2.038799134970106, + "ewc_loss": 0.04589906334877014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2949530830373988e-05, + "grad_norm": 29.778762817382812, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8638051748275757, + "num_tokens": 611447151.0, + "step": 16027 + }, + { + "epoch": 2.0389263452486963, + "ewc_loss": 0.045853227376937866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2926613382878713e-05, + "grad_norm": 29.75699806213379, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8619223833084106, + "num_tokens": 611486923.0, + "step": 16028 + }, + { + "epoch": 2.0390535555272864, + "ewc_loss": 0.04592825099825859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.296412640134804e-05, + "grad_norm": 29.92007827758789, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8761857748031616, + "num_tokens": 611519502.0, + "step": 16029 + }, + { + "epoch": 2.039180765805877, + "ewc_loss": 0.04590856656432152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2954283849685453e-05, + "grad_norm": 29.704029083251953, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8567541837692261, + "num_tokens": 611557647.0, + "step": 16030 + }, + { + "epoch": 
2.0393079760844675, + "ewc_loss": 0.04575261473655701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.287630741193425e-05, + "grad_norm": 29.68618392944336, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8695040345191956, + "num_tokens": 611597405.0, + "step": 16031 + }, + { + "epoch": 2.039435186363058, + "ewc_loss": 0.04588871821761131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2944359443499707e-05, + "grad_norm": 29.821563720703125, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8644537329673767, + "num_tokens": 611635996.0, + "step": 16032 + }, + { + "epoch": 2.0395623966416485, + "ewc_loss": 0.04581758379936218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2908792743692175e-05, + "grad_norm": 29.776994705200195, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8788511157035828, + "num_tokens": 611677900.0, + "step": 16033 + }, + { + "epoch": 2.039689606920239, + "ewc_loss": 0.0458860881626606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2943044314160943e-05, + "grad_norm": 29.70336151123047, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8719815015792847, + "num_tokens": 611719184.0, + "step": 16034 + }, + { + "epoch": 2.0398168171988296, + "ewc_loss": 0.04583317041397095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2916585294296965e-05, + "grad_norm": 29.685199737548828, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8681565523147583, + "num_tokens": 611761270.0, + "step": 16035 + }, + { + "epoch": 2.03994402747742, + "ewc_loss": 0.045812707394361496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.290635347890202e-05, + "grad_norm": 29.648834228515625, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8684611320495605, + "num_tokens": 611794939.0, + "step": 16036 + }, + { + "epoch": 2.0400712377560106, + "ewc_loss": 0.04590858891606331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.2954294763621874e-05, + "grad_norm": 29.766735076904297, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8815571069717407, + "num_tokens": 611834093.0, + "step": 16037 + }, + { + "epoch": 2.040198448034601, + "ewc_loss": 0.04589240998029709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2946205717744306e-05, + "grad_norm": 29.552001953125, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8654226660728455, + "num_tokens": 611871327.0, + "step": 16038 + }, + { + "epoch": 2.0403256583131917, + "ewc_loss": 0.0459558442234993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.297792161698453e-05, + "grad_norm": 29.747879028320312, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8631067276000977, + "num_tokens": 611907837.0, + "step": 16039 + }, + { + "epoch": 2.040452868591782, + "ewc_loss": 0.045976314693689346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2988157070358284e-05, + "grad_norm": 29.624826431274414, + "learning_rate": 1e-06, + "loss": 0.4795, + "mean_token_accuracy": 0.8518822193145752, + "num_tokens": 611950964.0, + "step": 16040 + }, + { + "epoch": 2.0405800788703727, + "ewc_loss": 0.04589944705367088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2949723643250763e-05, + "grad_norm": 29.699037551879883, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8656008243560791, + "num_tokens": 611985867.0, + "step": 16041 + }, + { + "epoch": 2.0407072891489633, + "ewc_loss": 0.046010084450244904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30050427489914e-05, + "grad_norm": 29.725751876831055, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8739771842956543, + "num_tokens": 612017819.0, + "step": 16042 + }, + { + "epoch": 2.040834499427554, + "ewc_loss": 0.045997072011232376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2998536223894916e-05, + "grad_norm": 29.749719619750977, + "learning_rate": 1e-06, + "loss": 0.4582, + 
"mean_token_accuracy": 0.8552011251449585, + "num_tokens": 612060669.0, + "step": 16043 + }, + { + "epoch": 2.0409617097061443, + "ewc_loss": 0.045975565910339355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2987782358541153e-05, + "grad_norm": 29.622055053710938, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8568465709686279, + "num_tokens": 612099801.0, + "step": 16044 + }, + { + "epoch": 2.041088919984735, + "ewc_loss": 0.045960068702697754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2980033463682048e-05, + "grad_norm": 29.695247650146484, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8754020929336548, + "num_tokens": 612139454.0, + "step": 16045 + }, + { + "epoch": 2.0412161302633254, + "ewc_loss": 0.0460495762526989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.302478787896689e-05, + "grad_norm": 29.658048629760742, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.851168692111969, + "num_tokens": 612181695.0, + "step": 16046 + }, + { + "epoch": 2.041343340541916, + "ewc_loss": 0.0461144857108593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3057242287904955e-05, + "grad_norm": 29.77597999572754, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8847699165344238, + "num_tokens": 612223200.0, + "step": 16047 + }, + { + "epoch": 2.0414705508205064, + "ewc_loss": 0.046061549335718155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.303077417309396e-05, + "grad_norm": 29.67976951599121, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8712410926818848, + "num_tokens": 612265946.0, + "step": 16048 + }, + { + "epoch": 2.041597761099097, + "ewc_loss": 0.04601948708295822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3009743017610162e-05, + "grad_norm": 29.628040313720703, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.886460542678833, + "num_tokens": 612301246.0, + "step": 16049 + }, + { + "epoch": 
2.0417249713776875, + "ewc_loss": 0.04600689932703972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3003449314273894e-05, + "grad_norm": 29.69595718383789, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8549818396568298, + "num_tokens": 612337789.0, + "step": 16050 + }, + { + "epoch": 2.041852181656278, + "ewc_loss": 0.046062715351581573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3031358068692498e-05, + "grad_norm": 29.710981369018555, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8827649354934692, + "num_tokens": 612372391.0, + "step": 16051 + }, + { + "epoch": 2.0419793919348685, + "ewc_loss": 0.04597243666648865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2986218027654104e-05, + "grad_norm": 29.625009536743164, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8738144636154175, + "num_tokens": 612408806.0, + "step": 16052 + }, + { + "epoch": 2.0421066022134586, + "ewc_loss": 0.04600081592798233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3000407963991165e-05, + "grad_norm": 29.652563095092773, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8802335262298584, + "num_tokens": 612446349.0, + "step": 16053 + }, + { + "epoch": 2.042233812492049, + "ewc_loss": 0.04608861729502678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3044309273245744e-05, + "grad_norm": 29.733856201171875, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.882820725440979, + "num_tokens": 612489899.0, + "step": 16054 + }, + { + "epoch": 2.0423610227706397, + "ewc_loss": 0.046070586889982224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3035292542772368e-05, + "grad_norm": 29.630691528320312, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8659380674362183, + "num_tokens": 612532314.0, + "step": 16055 + }, + { + "epoch": 2.04248823304923, + "ewc_loss": 0.046003907918930054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.300195410498418e-05, + "grad_norm": 29.756418228149414, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8768563866615295, + "num_tokens": 612568033.0, + "step": 16056 + }, + { + "epoch": 2.0426154433278207, + "ewc_loss": 0.04606211185455322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.303105611645151e-05, + "grad_norm": 29.668636322021484, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8785406351089478, + "num_tokens": 612605160.0, + "step": 16057 + }, + { + "epoch": 2.0427426536064113, + "ewc_loss": 0.046061091125011444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3030544980429113e-05, + "grad_norm": 29.828210830688477, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8584486246109009, + "num_tokens": 612647851.0, + "step": 16058 + }, + { + "epoch": 2.042869863885002, + "ewc_loss": 0.04604893550276756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3024467736831866e-05, + "grad_norm": 29.764925003051758, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8707210421562195, + "num_tokens": 612684770.0, + "step": 16059 + }, + { + "epoch": 2.0429970741635923, + "ewc_loss": 0.04597659409046173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2988297132542357e-05, + "grad_norm": 29.774700164794922, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8596867918968201, + "num_tokens": 612721689.0, + "step": 16060 + }, + { + "epoch": 2.043124284442183, + "ewc_loss": 0.045968279242515564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.298413892276585e-05, + "grad_norm": 29.782299041748047, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8699712753295898, + "num_tokens": 612757256.0, + "step": 16061 + }, + { + "epoch": 2.0432514947207734, + "ewc_loss": 0.04600951075553894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.300475534866564e-05, + "grad_norm": 29.83633041381836, + "learning_rate": 1e-06, + "loss": 0.4056, + 
"mean_token_accuracy": 0.876950740814209, + "num_tokens": 612797806.0, + "step": 16062 + }, + { + "epoch": 2.043378704999364, + "ewc_loss": 0.045951224863529205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2975611500442028e-05, + "grad_norm": 29.692039489746094, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.881901741027832, + "num_tokens": 612833507.0, + "step": 16063 + }, + { + "epoch": 2.0435059152779544, + "ewc_loss": 0.04587765038013458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.293882607773412e-05, + "grad_norm": 29.66071891784668, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8762138485908508, + "num_tokens": 612872920.0, + "step": 16064 + }, + { + "epoch": 2.043633125556545, + "ewc_loss": 0.04593430459499359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2967151380726136e-05, + "grad_norm": 29.677465438842773, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8763265013694763, + "num_tokens": 612909534.0, + "step": 16065 + }, + { + "epoch": 2.0437603358351355, + "ewc_loss": 0.04597970098257065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2989850549492985e-05, + "grad_norm": 29.70081329345703, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8666331171989441, + "num_tokens": 612950531.0, + "step": 16066 + }, + { + "epoch": 2.043887546113726, + "ewc_loss": 0.045983344316482544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2991671357885934e-05, + "grad_norm": 29.859867095947266, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8731175065040588, + "num_tokens": 612984638.0, + "step": 16067 + }, + { + "epoch": 2.0440147563923166, + "ewc_loss": 0.046008240431547165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3004120521363802e-05, + "grad_norm": 29.685848236083984, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8825273513793945, + "num_tokens": 613020632.0, + "step": 16068 + }, + { + "epoch": 
2.044141966670907, + "ewc_loss": 0.045932017266750336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.296600905538071e-05, + "grad_norm": 29.79236602783203, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8750631809234619, + "num_tokens": 613057381.0, + "step": 16069 + }, + { + "epoch": 2.0442691769494976, + "ewc_loss": 0.045990604907274246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2995302060735412e-05, + "grad_norm": 29.75666046142578, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8693311810493469, + "num_tokens": 613094287.0, + "step": 16070 + }, + { + "epoch": 2.044396387228088, + "ewc_loss": 0.04595775157213211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2978876586421393e-05, + "grad_norm": 29.710020065307617, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8776626586914062, + "num_tokens": 613126315.0, + "step": 16071 + }, + { + "epoch": 2.0445235975066787, + "ewc_loss": 0.046020325273275375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.301016320416238e-05, + "grad_norm": 29.588472366333008, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8667515516281128, + "num_tokens": 613160970.0, + "step": 16072 + }, + { + "epoch": 2.044650807785269, + "ewc_loss": 0.04601456597447395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.300728374393657e-05, + "grad_norm": 29.800325393676758, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.872911274433136, + "num_tokens": 613204685.0, + "step": 16073 + }, + { + "epoch": 2.0447780180638597, + "ewc_loss": 0.046139661222696304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3069829694577493e-05, + "grad_norm": 29.76001739501953, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8798122406005859, + "num_tokens": 613242394.0, + "step": 16074 + }, + { + "epoch": 2.0449052283424503, + "ewc_loss": 0.04590584710240364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.29529232456116e-05, + "grad_norm": 29.535778045654297, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8799163103103638, + "num_tokens": 613283517.0, + "step": 16075 + }, + { + "epoch": 2.045032438621041, + "ewc_loss": 0.04614857956767082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3074289856594987e-05, + "grad_norm": 29.849607467651367, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8763325810432434, + "num_tokens": 613317237.0, + "step": 16076 + }, + { + "epoch": 2.045159648899631, + "ewc_loss": 0.046021685004234314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3010841687209904e-05, + "grad_norm": 29.696979522705078, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8728445768356323, + "num_tokens": 613355741.0, + "step": 16077 + }, + { + "epoch": 2.0452868591782214, + "ewc_loss": 0.04594099521636963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.297049832122866e-05, + "grad_norm": 29.77259635925293, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8606844544410706, + "num_tokens": 613392131.0, + "step": 16078 + }, + { + "epoch": 2.045414069456812, + "ewc_loss": 0.04599900543689728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.29995021072682e-05, + "grad_norm": 29.657045364379883, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8621542453765869, + "num_tokens": 613433687.0, + "step": 16079 + }, + { + "epoch": 2.0455412797354025, + "ewc_loss": 0.04600311815738678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.300155938428361e-05, + "grad_norm": 29.700658798217773, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8748273849487305, + "num_tokens": 613472882.0, + "step": 16080 + }, + { + "epoch": 2.045668490013993, + "ewc_loss": 0.04601585492491722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3007927666185424e-05, + "grad_norm": 29.683088302612305, + "learning_rate": 1e-06, + "loss": 0.4152, + 
"mean_token_accuracy": 0.8757681250572205, + "num_tokens": 613515869.0, + "step": 16081 + }, + { + "epoch": 2.0457957002925835, + "ewc_loss": 0.04603317007422447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30165842367569e-05, + "grad_norm": 29.818614959716797, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8719640374183655, + "num_tokens": 613551835.0, + "step": 16082 + }, + { + "epoch": 2.045922910571174, + "ewc_loss": 0.04609048366546631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3045242414809763e-05, + "grad_norm": 29.69931411743164, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8687729835510254, + "num_tokens": 613592277.0, + "step": 16083 + }, + { + "epoch": 2.0460501208497646, + "ewc_loss": 0.045944035053253174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.297201717738062e-05, + "grad_norm": 29.72183609008789, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8715322017669678, + "num_tokens": 613625594.0, + "step": 16084 + }, + { + "epoch": 2.046177331128355, + "ewc_loss": 0.046125393360853195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.306269743712619e-05, + "grad_norm": 29.75853157043457, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8732199668884277, + "num_tokens": 613659924.0, + "step": 16085 + }, + { + "epoch": 2.0463045414069456, + "ewc_loss": 0.046037521213293076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.301875974808354e-05, + "grad_norm": 29.795120239257812, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8535090088844299, + "num_tokens": 613697067.0, + "step": 16086 + }, + { + "epoch": 2.046431751685536, + "ewc_loss": 0.046059440821409225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3029720978229307e-05, + "grad_norm": 29.874269485473633, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8784627318382263, + "num_tokens": 613738211.0, + "step": 16087 + }, + { + "epoch": 
2.0465589619641267, + "ewc_loss": 0.04597805440425873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.298902654729318e-05, + "grad_norm": 29.657075881958008, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8763264417648315, + "num_tokens": 613782590.0, + "step": 16088 + }, + { + "epoch": 2.046686172242717, + "ewc_loss": 0.04593834653496742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.296917409694288e-05, + "grad_norm": 29.717363357543945, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8780654668807983, + "num_tokens": 613825770.0, + "step": 16089 + }, + { + "epoch": 2.0468133825213077, + "ewc_loss": 0.0460851825773716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3042592147248797e-05, + "grad_norm": 29.84530258178711, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8761419057846069, + "num_tokens": 613864025.0, + "step": 16090 + }, + { + "epoch": 2.0469405927998983, + "ewc_loss": 0.04594012722373009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2970063582761213e-05, + "grad_norm": 29.57765769958496, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8736646175384521, + "num_tokens": 613907181.0, + "step": 16091 + }, + { + "epoch": 2.047067803078489, + "ewc_loss": 0.04598652198910713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2993261154624633e-05, + "grad_norm": 29.73532485961914, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8617216944694519, + "num_tokens": 613951258.0, + "step": 16092 + }, + { + "epoch": 2.0471950133570793, + "ewc_loss": 0.04602402076125145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.301200947840698e-05, + "grad_norm": 29.782554626464844, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8695739507675171, + "num_tokens": 613991221.0, + "step": 16093 + }, + { + "epoch": 2.04732222363567, + "ewc_loss": 0.04601145535707474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3005728507996537e-05, + "grad_norm": 29.75644302368164, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8817992210388184, + "num_tokens": 614026862.0, + "step": 16094 + }, + { + "epoch": 2.0474494339142604, + "ewc_loss": 0.04597998782992363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2989994249655865e-05, + "grad_norm": 29.84004783630371, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8610560894012451, + "num_tokens": 614067560.0, + "step": 16095 + }, + { + "epoch": 2.047576644192851, + "ewc_loss": 0.04600724205374718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3003620299277827e-05, + "grad_norm": 29.645524978637695, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8753237724304199, + "num_tokens": 614106974.0, + "step": 16096 + }, + { + "epoch": 2.0477038544714414, + "ewc_loss": 0.04591497778892517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.29574889090145e-05, + "grad_norm": 29.731401443481445, + "learning_rate": 1e-06, + "loss": 0.3333, + "mean_token_accuracy": 0.8976905345916748, + "num_tokens": 614145641.0, + "step": 16097 + }, + { + "epoch": 2.047831064750032, + "ewc_loss": 0.04602205008268356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3011025405139662e-05, + "grad_norm": 29.744413375854492, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8744741082191467, + "num_tokens": 614181476.0, + "step": 16098 + }, + { + "epoch": 2.0479582750286225, + "ewc_loss": 0.045974425971508026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2987213014857844e-05, + "grad_norm": 29.87626838684082, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8665348291397095, + "num_tokens": 614219212.0, + "step": 16099 + }, + { + "epoch": 2.048085485307213, + "ewc_loss": 0.04601765796542168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3008829884929582e-05, + "grad_norm": 29.788959503173828, + "learning_rate": 1e-06, + "loss": 0.5084, + 
"mean_token_accuracy": 0.842710554599762, + "num_tokens": 614264017.0, + "step": 16100 + }, + { + "epoch": 2.0482126955858035, + "ewc_loss": 0.0458955354988575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.294776822964195e-05, + "grad_norm": 29.707149505615234, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8566594123840332, + "num_tokens": 614299031.0, + "step": 16101 + }, + { + "epoch": 2.0483399058643936, + "ewc_loss": 0.04602024331688881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30101213674061e-05, + "grad_norm": 29.87748908996582, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8771779537200928, + "num_tokens": 614334387.0, + "step": 16102 + }, + { + "epoch": 2.048467116142984, + "ewc_loss": 0.0459805466234684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2990272555034608e-05, + "grad_norm": 29.630983352661133, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8727014064788818, + "num_tokens": 614372438.0, + "step": 16103 + }, + { + "epoch": 2.0485943264215747, + "ewc_loss": 0.04588025063276291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2940124836168252e-05, + "grad_norm": 29.665796279907227, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8671960234642029, + "num_tokens": 614409976.0, + "step": 16104 + }, + { + "epoch": 2.048721536700165, + "ewc_loss": 0.046023204922676086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3011602024780586e-05, + "grad_norm": 29.658977508544922, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8780858516693115, + "num_tokens": 614448740.0, + "step": 16105 + }, + { + "epoch": 2.0488487469787557, + "ewc_loss": 0.04598946124315262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.29947308980627e-05, + "grad_norm": 29.6629695892334, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8753501176834106, + "num_tokens": 614484644.0, + "step": 16106 + }, + { + "epoch": 
2.0489759572573463, + "ewc_loss": 0.04611099138855934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305549605807755e-05, + "grad_norm": 29.73053550720215, + "learning_rate": 1e-06, + "loss": 0.4785, + "mean_token_accuracy": 0.8519892692565918, + "num_tokens": 614521210.0, + "step": 16107 + }, + { + "epoch": 2.049103167535937, + "ewc_loss": 0.046008460223674774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3004229660728015e-05, + "grad_norm": 29.697376251220703, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8645874857902527, + "num_tokens": 614561783.0, + "step": 16108 + }, + { + "epoch": 2.0492303778145273, + "ewc_loss": 0.04607940465211868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3039701773086563e-05, + "grad_norm": 29.74199676513672, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8716355562210083, + "num_tokens": 614598643.0, + "step": 16109 + }, + { + "epoch": 2.049357588093118, + "ewc_loss": 0.04606831446290016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3034157493384555e-05, + "grad_norm": 29.939117431640625, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8706717491149902, + "num_tokens": 614633520.0, + "step": 16110 + }, + { + "epoch": 2.0494847983717084, + "ewc_loss": 0.04597967490553856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.298983781656716e-05, + "grad_norm": 29.654447555541992, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8582724332809448, + "num_tokens": 614673266.0, + "step": 16111 + }, + { + "epoch": 2.049612008650299, + "ewc_loss": 0.045882243663072586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2941121642361395e-05, + "grad_norm": 29.833965301513672, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8622665405273438, + "num_tokens": 614707201.0, + "step": 16112 + }, + { + "epoch": 2.0497392189288894, + "ewc_loss": 0.04614884778857231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.307442446181085e-05, + "grad_norm": 29.809066772460938, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8753475546836853, + "num_tokens": 614746542.0, + "step": 16113 + }, + { + "epoch": 2.04986642920748, + "ewc_loss": 0.04598909616470337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2994548999122344e-05, + "grad_norm": 29.76539421081543, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.860476553440094, + "num_tokens": 614781804.0, + "step": 16114 + }, + { + "epoch": 2.0499936394860705, + "ewc_loss": 0.045991700142621994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.299584957654588e-05, + "grad_norm": 29.750246047973633, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8711894750595093, + "num_tokens": 614817281.0, + "step": 16115 + }, + { + "epoch": 2.050120849764661, + "ewc_loss": 0.046030089259147644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30150453717215e-05, + "grad_norm": 29.6468448638916, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8621043562889099, + "num_tokens": 614853666.0, + "step": 16116 + }, + { + "epoch": 2.0502480600432516, + "ewc_loss": 0.046098727732896805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30493642447982e-05, + "grad_norm": 29.801776885986328, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8724232912063599, + "num_tokens": 614891221.0, + "step": 16117 + }, + { + "epoch": 2.050375270321842, + "ewc_loss": 0.04610711708664894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3053558834362775e-05, + "grad_norm": 29.757041931152344, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8773574233055115, + "num_tokens": 614925956.0, + "step": 16118 + }, + { + "epoch": 2.0505024806004326, + "ewc_loss": 0.04598743095993996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.299371590197552e-05, + "grad_norm": 29.579498291015625, + "learning_rate": 1e-06, + "loss": 0.3945, + 
"mean_token_accuracy": 0.8759691715240479, + "num_tokens": 614959934.0, + "step": 16119 + }, + { + "epoch": 2.050629690879023, + "ewc_loss": 0.046259116381406784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3129558030632325e-05, + "grad_norm": 29.82522964477539, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8672814965248108, + "num_tokens": 615000579.0, + "step": 16120 + }, + { + "epoch": 2.0507569011576137, + "ewc_loss": 0.04615955427289009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3079777747625485e-05, + "grad_norm": 29.627546310424805, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8657832741737366, + "num_tokens": 615037371.0, + "step": 16121 + }, + { + "epoch": 2.050884111436204, + "ewc_loss": 0.046139638870954514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.306981878064107e-05, + "grad_norm": 29.816617965698242, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8620238304138184, + "num_tokens": 615075948.0, + "step": 16122 + }, + { + "epoch": 2.0510113217147947, + "ewc_loss": 0.04620858654379845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3104294086806476e-05, + "grad_norm": 29.808332443237305, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8842788934707642, + "num_tokens": 615114619.0, + "step": 16123 + }, + { + "epoch": 2.0511385319933853, + "ewc_loss": 0.0461287684738636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3064383640303276e-05, + "grad_norm": 29.74248695373535, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8712291717529297, + "num_tokens": 615149260.0, + "step": 16124 + }, + { + "epoch": 2.051265742271976, + "ewc_loss": 0.04616125300526619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308062721567694e-05, + "grad_norm": 29.853042602539062, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.875762939453125, + "num_tokens": 615183284.0, + "step": 16125 + }, + { + "epoch": 
2.0513929525505663, + "ewc_loss": 0.04612315446138382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3061576939653605e-05, + "grad_norm": 29.76690673828125, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8756375312805176, + "num_tokens": 615220015.0, + "step": 16126 + }, + { + "epoch": 2.0515201628291564, + "ewc_loss": 0.04609879478812218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3049396986607462e-05, + "grad_norm": 29.760555267333984, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8656681776046753, + "num_tokens": 615255614.0, + "step": 16127 + }, + { + "epoch": 2.051647373107747, + "ewc_loss": 0.04607906937599182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3039534426061437e-05, + "grad_norm": 29.76120948791504, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.871900200843811, + "num_tokens": 615290232.0, + "step": 16128 + }, + { + "epoch": 2.0517745833863374, + "ewc_loss": 0.04616430029273033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308214970980771e-05, + "grad_norm": 29.815462112426758, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.875721275806427, + "num_tokens": 615328204.0, + "step": 16129 + }, + { + "epoch": 2.051901793664928, + "ewc_loss": 0.04604920744895935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3024604161037132e-05, + "grad_norm": 29.679773330688477, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8627063035964966, + "num_tokens": 615362846.0, + "step": 16130 + }, + { + "epoch": 2.0520290039435185, + "ewc_loss": 0.04612277075648308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3061385945766233e-05, + "grad_norm": 29.910072326660156, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8697911500930786, + "num_tokens": 615405404.0, + "step": 16131 + }, + { + "epoch": 2.052156214222109, + "ewc_loss": 0.04612329229712486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.306164606125094e-05, + "grad_norm": 29.845726013183594, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8599204421043396, + "num_tokens": 615439835.0, + "step": 16132 + }, + { + "epoch": 2.0522834245006996, + "ewc_loss": 0.046009186655282974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3004593458608724e-05, + "grad_norm": 29.67670249938965, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8813700079917908, + "num_tokens": 615477832.0, + "step": 16133 + }, + { + "epoch": 2.05241063477929, + "ewc_loss": 0.04605214297771454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3026072085485794e-05, + "grad_norm": 29.802343368530273, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8628172278404236, + "num_tokens": 615518089.0, + "step": 16134 + }, + { + "epoch": 2.0525378450578806, + "ewc_loss": 0.04611433669924736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305716770933941e-05, + "grad_norm": 29.704191207885742, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.875052809715271, + "num_tokens": 615557286.0, + "step": 16135 + }, + { + "epoch": 2.052665055336471, + "ewc_loss": 0.04615030810236931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307515387656167e-05, + "grad_norm": 29.955717086791992, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8745895624160767, + "num_tokens": 615593284.0, + "step": 16136 + }, + { + "epoch": 2.0527922656150617, + "ewc_loss": 0.046176981180906296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308849070686847e-05, + "grad_norm": 29.599504470825195, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8669720888137817, + "num_tokens": 615633579.0, + "step": 16137 + }, + { + "epoch": 2.052919475893652, + "ewc_loss": 0.04607906565070152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3039532607072033e-05, + "grad_norm": 29.81317901611328, + "learning_rate": 1e-06, + "loss": 0.3998, + 
"mean_token_accuracy": 0.8760932683944702, + "num_tokens": 615680135.0, + "step": 16138 + }, + { + "epoch": 2.0530466861722427, + "ewc_loss": 0.04615726321935654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307863178430125e-05, + "grad_norm": 29.763025283813477, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8592162728309631, + "num_tokens": 615714801.0, + "step": 16139 + }, + { + "epoch": 2.0531738964508333, + "ewc_loss": 0.04600485414266586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30024270422291e-05, + "grad_norm": 29.59646224975586, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8632453680038452, + "num_tokens": 615750538.0, + "step": 16140 + }, + { + "epoch": 2.053301106729424, + "ewc_loss": 0.04618502035737038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3092510673450306e-05, + "grad_norm": 29.882766723632812, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8762335777282715, + "num_tokens": 615789465.0, + "step": 16141 + }, + { + "epoch": 2.0534283170080143, + "ewc_loss": 0.04618682712316513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3093412892194465e-05, + "grad_norm": 29.712047576904297, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8561697006225586, + "num_tokens": 615832553.0, + "step": 16142 + }, + { + "epoch": 2.053555527286605, + "ewc_loss": 0.04597558453679085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.298779145348817e-05, + "grad_norm": 29.801591873168945, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.866378664970398, + "num_tokens": 615868341.0, + "step": 16143 + }, + { + "epoch": 2.0536827375651954, + "ewc_loss": 0.04607008025050163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3035039703245275e-05, + "grad_norm": 29.725814819335938, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8801194429397583, + "num_tokens": 615904324.0, + "step": 16144 + }, + { + "epoch": 
2.053809947843786, + "ewc_loss": 0.046007972210645676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.300398591614794e-05, + "grad_norm": 29.73893928527832, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8780027031898499, + "num_tokens": 615947656.0, + "step": 16145 + }, + { + "epoch": 2.0539371581223764, + "ewc_loss": 0.04609736055135727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3048680304782465e-05, + "grad_norm": 29.641782760620117, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8650791049003601, + "num_tokens": 615989050.0, + "step": 16146 + }, + { + "epoch": 2.054064368400967, + "ewc_loss": 0.04602843150496483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3014215912553482e-05, + "grad_norm": 29.840986251831055, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8755582571029663, + "num_tokens": 616027638.0, + "step": 16147 + }, + { + "epoch": 2.0541915786795575, + "ewc_loss": 0.046094052493572235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3047026843414642e-05, + "grad_norm": 29.653839111328125, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8782345056533813, + "num_tokens": 616066392.0, + "step": 16148 + }, + { + "epoch": 2.054318788958148, + "ewc_loss": 0.04603162407875061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.301581116626039e-05, + "grad_norm": 29.78131103515625, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8797656893730164, + "num_tokens": 616108303.0, + "step": 16149 + }, + { + "epoch": 2.0544459992367385, + "ewc_loss": 0.04615237936377525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307618888153229e-05, + "grad_norm": 29.68864631652832, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8632978200912476, + "num_tokens": 616154174.0, + "step": 16150 + }, + { + "epoch": 2.0545732095153286, + "ewc_loss": 0.04602968320250511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.30148416449083e-05, + "grad_norm": 29.8044376373291, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8686315417289734, + "num_tokens": 616193954.0, + "step": 16151 + }, + { + "epoch": 2.054700419793919, + "ewc_loss": 0.04611280560493469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305640373378992e-05, + "grad_norm": 29.72667694091797, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8758803606033325, + "num_tokens": 616232771.0, + "step": 16152 + }, + { + "epoch": 2.0548276300725097, + "ewc_loss": 0.04607398435473442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.303699147887528e-05, + "grad_norm": 29.855098724365234, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8809641003608704, + "num_tokens": 616265151.0, + "step": 16153 + }, + { + "epoch": 2.0549548403511, + "ewc_loss": 0.046121787279844284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3060892999637872e-05, + "grad_norm": 29.80136489868164, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8653846979141235, + "num_tokens": 616303886.0, + "step": 16154 + }, + { + "epoch": 2.0550820506296907, + "ewc_loss": 0.045892566442489624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2946283934288658e-05, + "grad_norm": 29.830116271972656, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8803588151931763, + "num_tokens": 616340040.0, + "step": 16155 + }, + { + "epoch": 2.0552092609082813, + "ewc_loss": 0.04614785313606262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3073926058714278e-05, + "grad_norm": 29.867938995361328, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8757551908493042, + "num_tokens": 616371329.0, + "step": 16156 + }, + { + "epoch": 2.055336471186872, + "ewc_loss": 0.04594996199011803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2974980311118998e-05, + "grad_norm": 29.710397720336914, + "learning_rate": 1e-06, + "loss": 0.3767, + 
"mean_token_accuracy": 0.8849588632583618, + "num_tokens": 616406383.0, + "step": 16157 + }, + { + "epoch": 2.0554636814654623, + "ewc_loss": 0.04600236937403679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.300118467246648e-05, + "grad_norm": 29.803848266601562, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8575700521469116, + "num_tokens": 616445600.0, + "step": 16158 + }, + { + "epoch": 2.055590891744053, + "ewc_loss": 0.04611758887767792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305879388586618e-05, + "grad_norm": 29.794652938842773, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8640934824943542, + "num_tokens": 616485376.0, + "step": 16159 + }, + { + "epoch": 2.0557181020226434, + "ewc_loss": 0.04603908956050873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.301954555150587e-05, + "grad_norm": 29.666994094848633, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8620143532752991, + "num_tokens": 616527104.0, + "step": 16160 + }, + { + "epoch": 2.055845312301234, + "ewc_loss": 0.04599175974726677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.299588049936574e-05, + "grad_norm": 29.786907196044922, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8735882043838501, + "num_tokens": 616567557.0, + "step": 16161 + }, + { + "epoch": 2.0559725225798244, + "ewc_loss": 0.046083636581897736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.304181907675229e-05, + "grad_norm": 29.832502365112305, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8764258623123169, + "num_tokens": 616601612.0, + "step": 16162 + }, + { + "epoch": 2.056099732858415, + "ewc_loss": 0.04597727954387665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2988639102550223e-05, + "grad_norm": 29.734560012817383, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8695792555809021, + "num_tokens": 616638019.0, + "step": 16163 + }, + { + "epoch": 
2.0562269431370055, + "ewc_loss": 0.04608980193734169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3044900444801897e-05, + "grad_norm": 29.992258071899414, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8689407110214233, + "num_tokens": 616675224.0, + "step": 16164 + }, + { + "epoch": 2.056354153415596, + "ewc_loss": 0.046001799404621124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3000899091130123e-05, + "grad_norm": 29.674142837524414, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8689267635345459, + "num_tokens": 616713628.0, + "step": 16165 + }, + { + "epoch": 2.0564813636941865, + "ewc_loss": 0.04596720635890961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2983602320891805e-05, + "grad_norm": 29.87324333190918, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8684374094009399, + "num_tokens": 616755985.0, + "step": 16166 + }, + { + "epoch": 2.056608573972777, + "ewc_loss": 0.04604974389076233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3024871552479453e-05, + "grad_norm": 29.816917419433594, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8712928891181946, + "num_tokens": 616786564.0, + "step": 16167 + }, + { + "epoch": 2.0567357842513676, + "ewc_loss": 0.04593437537550926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2967187760514207e-05, + "grad_norm": 29.799243927001953, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.841618537902832, + "num_tokens": 616823285.0, + "step": 16168 + }, + { + "epoch": 2.056862994529958, + "ewc_loss": 0.04604126885533333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3020635126158595e-05, + "grad_norm": 29.760990142822266, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8793861269950867, + "num_tokens": 616864465.0, + "step": 16169 + }, + { + "epoch": 2.0569902048085487, + "ewc_loss": 0.04607829079031944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3039145162329078e-05, + "grad_norm": 29.8188533782959, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8610205054283142, + "num_tokens": 616906670.0, + "step": 16170 + }, + { + "epoch": 2.057117415087139, + "ewc_loss": 0.04603434354066849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3017171770334244e-05, + "grad_norm": 29.8177433013916, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8689311742782593, + "num_tokens": 616947656.0, + "step": 16171 + }, + { + "epoch": 2.0572446253657297, + "ewc_loss": 0.04602944478392601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3014721591607668e-05, + "grad_norm": 29.7338924407959, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8633272647857666, + "num_tokens": 616980161.0, + "step": 16172 + }, + { + "epoch": 2.0573718356443202, + "ewc_loss": 0.0460546612739563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3027330826153047e-05, + "grad_norm": 29.8944034576416, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.870031476020813, + "num_tokens": 617015281.0, + "step": 16173 + }, + { + "epoch": 2.0574990459229108, + "ewc_loss": 0.04608551785349846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3042759494273923e-05, + "grad_norm": 29.686765670776367, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8718662261962891, + "num_tokens": 617054174.0, + "step": 16174 + }, + { + "epoch": 2.057626256201501, + "ewc_loss": 0.04596167430281639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2980837456998415e-05, + "grad_norm": 29.781827926635742, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8698963522911072, + "num_tokens": 617095160.0, + "step": 16175 + }, + { + "epoch": 2.0577534664800914, + "ewc_loss": 0.046226032078266144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.311301614099648e-05, + "grad_norm": 29.863828659057617, + "learning_rate": 1e-06, + "loss": 0.404, + 
"mean_token_accuracy": 0.8744556903839111, + "num_tokens": 617130781.0, + "step": 16176 + }, + { + "epoch": 2.057880676758682, + "ewc_loss": 0.04603308066725731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3016540581011213e-05, + "grad_norm": 29.84080696105957, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8683319687843323, + "num_tokens": 617169473.0, + "step": 16177 + }, + { + "epoch": 2.0580078870372724, + "ewc_loss": 0.04615247622132301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3076237994246185e-05, + "grad_norm": 29.817611694335938, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8642938137054443, + "num_tokens": 617212208.0, + "step": 16178 + }, + { + "epoch": 2.058135097315863, + "ewc_loss": 0.04605107381939888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.302553730260115e-05, + "grad_norm": 29.745155334472656, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8731249570846558, + "num_tokens": 617246055.0, + "step": 16179 + }, + { + "epoch": 2.0582623075944535, + "ewc_loss": 0.04606197401881218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3030986994854175e-05, + "grad_norm": 29.78257179260254, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8663124442100525, + "num_tokens": 617285614.0, + "step": 16180 + }, + { + "epoch": 2.058389517873044, + "ewc_loss": 0.04608730971813202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3043654437060468e-05, + "grad_norm": 29.726226806640625, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8792597651481628, + "num_tokens": 617323401.0, + "step": 16181 + }, + { + "epoch": 2.0585167281516346, + "ewc_loss": 0.04609724506735802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.304862209712155e-05, + "grad_norm": 29.737253189086914, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8566020131111145, + "num_tokens": 617372550.0, + "step": 16182 + }, + { + "epoch": 
2.058643938430225, + "ewc_loss": 0.04608568921685219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.304284498677589e-05, + "grad_norm": 29.812307357788086, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8762013912200928, + "num_tokens": 617407894.0, + "step": 16183 + }, + { + "epoch": 2.0587711487088156, + "ewc_loss": 0.04608330875635147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3041653548716567e-05, + "grad_norm": 29.6981143951416, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8623995780944824, + "num_tokens": 617450117.0, + "step": 16184 + }, + { + "epoch": 2.058898358987406, + "ewc_loss": 0.04611833766102791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305916859768331e-05, + "grad_norm": 29.870241165161133, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8756526708602905, + "num_tokens": 617485885.0, + "step": 16185 + }, + { + "epoch": 2.0590255692659967, + "ewc_loss": 0.04615110903978348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307555405423045e-05, + "grad_norm": 29.751131057739258, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8544533848762512, + "num_tokens": 617520511.0, + "step": 16186 + }, + { + "epoch": 2.059152779544587, + "ewc_loss": 0.0459611676633358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2980584617471322e-05, + "grad_norm": 29.782121658325195, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.863762617111206, + "num_tokens": 617558288.0, + "step": 16187 + }, + { + "epoch": 2.0592799898231777, + "ewc_loss": 0.04612206295132637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.306103124283254e-05, + "grad_norm": 29.683269500732422, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.869443416595459, + "num_tokens": 617602991.0, + "step": 16188 + }, + { + "epoch": 2.0594072001017683, + "ewc_loss": 0.04599897563457489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.299948755535297e-05, + "grad_norm": 29.70993423461914, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8681008815765381, + "num_tokens": 617639629.0, + "step": 16189 + }, + { + "epoch": 2.059534410380359, + "ewc_loss": 0.04607708007097244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3038539438857697e-05, + "grad_norm": 29.671981811523438, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8636448383331299, + "num_tokens": 617676218.0, + "step": 16190 + }, + { + "epoch": 2.0596616206589493, + "ewc_loss": 0.04607846215367317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3039230654831044e-05, + "grad_norm": 29.63678550720215, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8638532161712646, + "num_tokens": 617716270.0, + "step": 16191 + }, + { + "epoch": 2.05978883093754, + "ewc_loss": 0.04621683433651924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3108417735784315e-05, + "grad_norm": 29.741825103759766, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8797062039375305, + "num_tokens": 617755559.0, + "step": 16192 + }, + { + "epoch": 2.0599160412161304, + "ewc_loss": 0.046163409948349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3081705876393244e-05, + "grad_norm": 29.854249954223633, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8733406066894531, + "num_tokens": 617793522.0, + "step": 16193 + }, + { + "epoch": 2.060043251494721, + "ewc_loss": 0.046110715717077255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3055357814882882e-05, + "grad_norm": 29.639177322387695, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8749228119850159, + "num_tokens": 617828005.0, + "step": 16194 + }, + { + "epoch": 2.0601704617733114, + "ewc_loss": 0.04610376060009003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3051879907143302e-05, + "grad_norm": 29.779760360717773, + "learning_rate": 1e-06, + "loss": 0.4467, + 
"mean_token_accuracy": 0.8663338422775269, + "num_tokens": 617863824.0, + "step": 16195 + }, + { + "epoch": 2.060297672051902, + "ewc_loss": 0.04618106037378311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3090529793989845e-05, + "grad_norm": 29.750673294067383, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8667013645172119, + "num_tokens": 617903648.0, + "step": 16196 + }, + { + "epoch": 2.0604248823304925, + "ewc_loss": 0.04606311768293381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3031558157526888e-05, + "grad_norm": 29.65993309020996, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8673986196517944, + "num_tokens": 617946229.0, + "step": 16197 + }, + { + "epoch": 2.060552092609083, + "ewc_loss": 0.046128880232572556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3064440028974786e-05, + "grad_norm": 29.84752655029297, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8704206943511963, + "num_tokens": 617990486.0, + "step": 16198 + }, + { + "epoch": 2.0606793028876735, + "ewc_loss": 0.04606734588742256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3033673642203212e-05, + "grad_norm": 29.73678970336914, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.881973147392273, + "num_tokens": 618031743.0, + "step": 16199 + }, + { + "epoch": 2.0608065131662636, + "ewc_loss": 0.04598810151219368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.2994050596025772e-05, + "grad_norm": 29.697696685791016, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8675678968429565, + "num_tokens": 618070181.0, + "step": 16200 + }, + { + "epoch": 2.060933723444854, + "ewc_loss": 0.04604341834783554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3021708329906687e-05, + "grad_norm": 29.710121154785156, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8769555687904358, + "num_tokens": 618106396.0, + "step": 16201 + }, + { + "epoch": 
2.0610609337234447, + "ewc_loss": 0.046108316630125046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3054159100865945e-05, + "grad_norm": 29.827341079711914, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8758091926574707, + "num_tokens": 618148281.0, + "step": 16202 + }, + { + "epoch": 2.061188144002035, + "ewc_loss": 0.04610646516084671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305323323525954e-05, + "grad_norm": 29.762243270874023, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8637882471084595, + "num_tokens": 618182721.0, + "step": 16203 + }, + { + "epoch": 2.0613153542806257, + "ewc_loss": 0.04607957974076271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3039789084577933e-05, + "grad_norm": 29.66547203063965, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8856600522994995, + "num_tokens": 618219497.0, + "step": 16204 + }, + { + "epoch": 2.0614425645592163, + "ewc_loss": 0.046080950647592545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.304047484358307e-05, + "grad_norm": 29.78743553161621, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8868150115013123, + "num_tokens": 618258643.0, + "step": 16205 + }, + { + "epoch": 2.061569774837807, + "ewc_loss": 0.04604749754071236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3023749236017466e-05, + "grad_norm": 29.52140235900879, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8861819505691528, + "num_tokens": 618292682.0, + "step": 16206 + }, + { + "epoch": 2.0616969851163973, + "ewc_loss": 0.04602322354912758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3011611119727604e-05, + "grad_norm": 29.792776107788086, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8565007448196411, + "num_tokens": 618331375.0, + "step": 16207 + }, + { + "epoch": 2.061824195394988, + "ewc_loss": 0.04623415693640709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3117077944334596e-05, + "grad_norm": 29.775033950805664, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8757244348526001, + "num_tokens": 618367452.0, + "step": 16208 + }, + { + "epoch": 2.0619514056735784, + "ewc_loss": 0.046044155955314636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3022077584755607e-05, + "grad_norm": 29.721668243408203, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8684953451156616, + "num_tokens": 618410470.0, + "step": 16209 + }, + { + "epoch": 2.062078615952169, + "ewc_loss": 0.04608596861362457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3042985048959963e-05, + "grad_norm": 29.74195098876953, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8634388446807861, + "num_tokens": 618447524.0, + "step": 16210 + }, + { + "epoch": 2.0622058262307594, + "ewc_loss": 0.04612843692302704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3064218112267554e-05, + "grad_norm": 29.64052963256836, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8517363667488098, + "num_tokens": 618481793.0, + "step": 16211 + }, + { + "epoch": 2.06233303650935, + "ewc_loss": 0.04619305580854416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3096527002053335e-05, + "grad_norm": 29.752084732055664, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8768466711044312, + "num_tokens": 618517097.0, + "step": 16212 + }, + { + "epoch": 2.0624602467879405, + "ewc_loss": 0.04625964164733887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3129819965106435e-05, + "grad_norm": 29.848615646362305, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8681655526161194, + "num_tokens": 618554221.0, + "step": 16213 + }, + { + "epoch": 2.062587457066531, + "ewc_loss": 0.04622069001197815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3110345864552073e-05, + "grad_norm": 29.811899185180664, + "learning_rate": 1e-06, + "loss": 0.3914, + 
"mean_token_accuracy": 0.8803805112838745, + "num_tokens": 618591279.0, + "step": 16214 + }, + { + "epoch": 2.0627146673451215, + "ewc_loss": 0.046180590987205505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3090295144356787e-05, + "grad_norm": 29.753034591674805, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8573551177978516, + "num_tokens": 618628168.0, + "step": 16215 + }, + { + "epoch": 2.062841877623712, + "ewc_loss": 0.046145908534526825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3072954718372785e-05, + "grad_norm": 29.71481704711914, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8758463263511658, + "num_tokens": 618664580.0, + "step": 16216 + }, + { + "epoch": 2.0629690879023026, + "ewc_loss": 0.04622222110629082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3111109840101562e-05, + "grad_norm": 29.777097702026367, + "learning_rate": 1e-06, + "loss": 0.4539, + "mean_token_accuracy": 0.8596420288085938, + "num_tokens": 618699007.0, + "step": 16217 + }, + { + "epoch": 2.063096298180893, + "ewc_loss": 0.04619409516453743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309704723302275e-05, + "grad_norm": 29.645097732543945, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8716686367988586, + "num_tokens": 618736630.0, + "step": 16218 + }, + { + "epoch": 2.0632235084594837, + "ewc_loss": 0.046235375106334686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3117687305784784e-05, + "grad_norm": 29.858787536621094, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8780595064163208, + "num_tokens": 618777868.0, + "step": 16219 + }, + { + "epoch": 2.063350718738074, + "ewc_loss": 0.046267617493867874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313380900886841e-05, + "grad_norm": 29.702415466308594, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8744188547134399, + "num_tokens": 618819134.0, + "step": 16220 + }, + { + "epoch": 
2.0634779290166647, + "ewc_loss": 0.046218108385801315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3109054382075556e-05, + "grad_norm": 29.902969360351562, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8711199760437012, + "num_tokens": 618862231.0, + "step": 16221 + }, + { + "epoch": 2.0636051392952552, + "ewc_loss": 0.04628067463636398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3140337361837737e-05, + "grad_norm": 29.80768585205078, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8641204833984375, + "num_tokens": 618906816.0, + "step": 16222 + }, + { + "epoch": 2.0637323495738458, + "ewc_loss": 0.04608890414237976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.304445115441922e-05, + "grad_norm": 29.849193572998047, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8714382648468018, + "num_tokens": 618943239.0, + "step": 16223 + }, + { + "epoch": 2.0638595598524363, + "ewc_loss": 0.04613553732633591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3067768779583275e-05, + "grad_norm": 29.736953735351562, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8840697407722473, + "num_tokens": 618978641.0, + "step": 16224 + }, + { + "epoch": 2.0639867701310264, + "ewc_loss": 0.04618259146809578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3091295588528737e-05, + "grad_norm": 29.824729919433594, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8711663484573364, + "num_tokens": 619020522.0, + "step": 16225 + }, + { + "epoch": 2.064113980409617, + "ewc_loss": 0.04616960883140564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3084803615347482e-05, + "grad_norm": 29.845380783081055, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8831693530082703, + "num_tokens": 619060769.0, + "step": 16226 + }, + { + "epoch": 2.0642411906882074, + "ewc_loss": 0.04606989398598671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3034946934785694e-05, + "grad_norm": 29.661273956298828, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.88270103931427, + "num_tokens": 619098711.0, + "step": 16227 + }, + { + "epoch": 2.064368400966798, + "ewc_loss": 0.04619583114981651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309791489096824e-05, + "grad_norm": 29.91184425354004, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8671512007713318, + "num_tokens": 619133673.0, + "step": 16228 + }, + { + "epoch": 2.0644956112453885, + "ewc_loss": 0.0461660735309124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308303737663664e-05, + "grad_norm": 29.65937042236328, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8641694188117981, + "num_tokens": 619174554.0, + "step": 16229 + }, + { + "epoch": 2.064622821523979, + "ewc_loss": 0.0460580550134182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3029027943266556e-05, + "grad_norm": 29.745290756225586, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8598898649215698, + "num_tokens": 619211095.0, + "step": 16230 + }, + { + "epoch": 2.0647500318025696, + "ewc_loss": 0.046161677688360214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3080838218447752e-05, + "grad_norm": 29.744434356689453, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8807315230369568, + "num_tokens": 619246346.0, + "step": 16231 + }, + { + "epoch": 2.06487724208116, + "ewc_loss": 0.0461602658033371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3080132450559177e-05, + "grad_norm": 29.789234161376953, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8688963055610657, + "num_tokens": 619280629.0, + "step": 16232 + }, + { + "epoch": 2.0650044523597506, + "ewc_loss": 0.046116311103105545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3058155420585535e-05, + "grad_norm": 29.832983016967773, + "learning_rate": 1e-06, + "loss": 0.4847, + 
"mean_token_accuracy": 0.8508864641189575, + "num_tokens": 619320928.0, + "step": 16233 + }, + { + "epoch": 2.065131662638341, + "ewc_loss": 0.04615119472146034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3075597709976137e-05, + "grad_norm": 29.70587158203125, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8723940849304199, + "num_tokens": 619369747.0, + "step": 16234 + }, + { + "epoch": 2.0652588729169317, + "ewc_loss": 0.04611112177371979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305556154169608e-05, + "grad_norm": 29.833498001098633, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8725579977035522, + "num_tokens": 619408694.0, + "step": 16235 + }, + { + "epoch": 2.065386083195522, + "ewc_loss": 0.046159446239471436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307972317794338e-05, + "grad_norm": 29.786380767822266, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8782950043678284, + "num_tokens": 619446858.0, + "step": 16236 + }, + { + "epoch": 2.0655132934741127, + "ewc_loss": 0.0461508110165596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307540489709936e-05, + "grad_norm": 29.860998153686523, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8707924485206604, + "num_tokens": 619482281.0, + "step": 16237 + }, + { + "epoch": 2.0656405037527032, + "ewc_loss": 0.04612266272306442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3061331376084127e-05, + "grad_norm": 29.766748428344727, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8798100352287292, + "num_tokens": 619520309.0, + "step": 16238 + }, + { + "epoch": 2.065767714031294, + "ewc_loss": 0.04613063856959343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30653186008567e-05, + "grad_norm": 29.905515670776367, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8751295208930969, + "num_tokens": 619554662.0, + "step": 16239 + }, + { + "epoch": 
2.0658949243098843, + "ewc_loss": 0.046131689101457596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3065844288794324e-05, + "grad_norm": 29.819013595581055, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8675743341445923, + "num_tokens": 619599632.0, + "step": 16240 + }, + { + "epoch": 2.066022134588475, + "ewc_loss": 0.04603872820734978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3019363652565517e-05, + "grad_norm": 29.882591247558594, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8843692541122437, + "num_tokens": 619639414.0, + "step": 16241 + }, + { + "epoch": 2.0661493448670654, + "ewc_loss": 0.04611000791192055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.305500311194919e-05, + "grad_norm": 29.821718215942383, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8722113370895386, + "num_tokens": 619675878.0, + "step": 16242 + }, + { + "epoch": 2.066276555145656, + "ewc_loss": 0.04605615884065628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3028080249787308e-05, + "grad_norm": 29.92580795288086, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8745125532150269, + "num_tokens": 619709486.0, + "step": 16243 + }, + { + "epoch": 2.0664037654242464, + "ewc_loss": 0.046100106090307236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3050053641782142e-05, + "grad_norm": 29.62217903137207, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.871092677116394, + "num_tokens": 619753576.0, + "step": 16244 + }, + { + "epoch": 2.066530975702837, + "ewc_loss": 0.046042028814554214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3021015294943936e-05, + "grad_norm": 29.814746856689453, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8713119626045227, + "num_tokens": 619792520.0, + "step": 16245 + }, + { + "epoch": 2.0666581859814275, + "ewc_loss": 0.0460808202624321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3040409359964542e-05, + "grad_norm": 29.682910919189453, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8496389389038086, + "num_tokens": 619827871.0, + "step": 16246 + }, + { + "epoch": 2.066785396260018, + "ewc_loss": 0.04608200117945671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3041000531520694e-05, + "grad_norm": 29.834625244140625, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8660610914230347, + "num_tokens": 619864527.0, + "step": 16247 + }, + { + "epoch": 2.0669126065386085, + "ewc_loss": 0.04615240544080734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3076203433447517e-05, + "grad_norm": 29.75448989868164, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8726780414581299, + "num_tokens": 619904934.0, + "step": 16248 + }, + { + "epoch": 2.0670398168171986, + "ewc_loss": 0.046064816415309906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3032407625578344e-05, + "grad_norm": 29.711620330810547, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8791505098342896, + "num_tokens": 619945472.0, + "step": 16249 + }, + { + "epoch": 2.067167027095789, + "ewc_loss": 0.04613671079277992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3068354494171217e-05, + "grad_norm": 29.810253143310547, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8825787305831909, + "num_tokens": 619981738.0, + "step": 16250 + }, + { + "epoch": 2.0672942373743797, + "ewc_loss": 0.04616239294409752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308119655936025e-05, + "grad_norm": 29.63352394104004, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8734548091888428, + "num_tokens": 620024593.0, + "step": 16251 + }, + { + "epoch": 2.06742144765297, + "ewc_loss": 0.04612931236624718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.306465648871381e-05, + "grad_norm": 29.81527328491211, + "learning_rate": 1e-06, + "loss": 0.3978, + 
"mean_token_accuracy": 0.8771624565124512, + "num_tokens": 620064059.0, + "step": 16252 + }, + { + "epoch": 2.0675486579315607, + "ewc_loss": 0.0462203286588192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.311016396561172e-05, + "grad_norm": 29.69801139831543, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8726074695587158, + "num_tokens": 620100028.0, + "step": 16253 + }, + { + "epoch": 2.0676758682101513, + "ewc_loss": 0.04614594951272011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3072974727256224e-05, + "grad_norm": 29.81428337097168, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8842312097549438, + "num_tokens": 620135181.0, + "step": 16254 + }, + { + "epoch": 2.067803078488742, + "ewc_loss": 0.04625966399908066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312983269803226e-05, + "grad_norm": 29.752511978149414, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8731212615966797, + "num_tokens": 620174887.0, + "step": 16255 + }, + { + "epoch": 2.0679302887673323, + "ewc_loss": 0.046252571046352386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3126285668695346e-05, + "grad_norm": 29.777240753173828, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.87083899974823, + "num_tokens": 620212924.0, + "step": 16256 + }, + { + "epoch": 2.068057499045923, + "ewc_loss": 0.046245552599430084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3122776838135906e-05, + "grad_norm": 29.77036476135254, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8616573810577393, + "num_tokens": 620251088.0, + "step": 16257 + }, + { + "epoch": 2.0681847093245134, + "ewc_loss": 0.046115707606077194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3057853468344547e-05, + "grad_norm": 29.74498176574707, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8701003193855286, + "num_tokens": 620287888.0, + "step": 16258 + }, + { + "epoch": 
2.068311919603104, + "ewc_loss": 0.04616378992795944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3081895051291212e-05, + "grad_norm": 29.795866012573242, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8702322244644165, + "num_tokens": 620324937.0, + "step": 16259 + }, + { + "epoch": 2.0684391298816944, + "ewc_loss": 0.046148255467414856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307412796653807e-05, + "grad_norm": 29.70185661315918, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8717126250267029, + "num_tokens": 620363390.0, + "step": 16260 + }, + { + "epoch": 2.068566340160285, + "ewc_loss": 0.046186961233615875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3093480194802396e-05, + "grad_norm": 29.73912811279297, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8778296709060669, + "num_tokens": 620403707.0, + "step": 16261 + }, + { + "epoch": 2.0686935504388755, + "ewc_loss": 0.04621997848153114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3109989342628978e-05, + "grad_norm": 29.76198387145996, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8742104768753052, + "num_tokens": 620445994.0, + "step": 16262 + }, + { + "epoch": 2.068820760717466, + "ewc_loss": 0.0461977981030941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309889896423556e-05, + "grad_norm": 29.794504165649414, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8785446882247925, + "num_tokens": 620480883.0, + "step": 16263 + }, + { + "epoch": 2.0689479709960565, + "ewc_loss": 0.04614724963903427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307362410647329e-05, + "grad_norm": 29.766368865966797, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.884181022644043, + "num_tokens": 620518935.0, + "step": 16264 + }, + { + "epoch": 2.069075181274647, + "ewc_loss": 0.04617132246494293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3085660359356552e-05, + "grad_norm": 29.76739501953125, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.864406406879425, + "num_tokens": 620551985.0, + "step": 16265 + }, + { + "epoch": 2.0692023915532376, + "ewc_loss": 0.04627387970685959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3136939489631914e-05, + "grad_norm": 29.811521530151367, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8674048185348511, + "num_tokens": 620592809.0, + "step": 16266 + }, + { + "epoch": 2.069329601831828, + "ewc_loss": 0.046173855662345886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3086928194970824e-05, + "grad_norm": 29.701576232910156, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8575809001922607, + "num_tokens": 620627951.0, + "step": 16267 + }, + { + "epoch": 2.0694568121104187, + "ewc_loss": 0.04617997631430626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3089987735147588e-05, + "grad_norm": 29.6981258392334, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8758989572525024, + "num_tokens": 620660892.0, + "step": 16268 + }, + { + "epoch": 2.069584022389009, + "ewc_loss": 0.046234194189310074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3117097953218035e-05, + "grad_norm": 29.86589813232422, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8477216362953186, + "num_tokens": 620698165.0, + "step": 16269 + }, + { + "epoch": 2.0697112326675997, + "ewc_loss": 0.0461922362446785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3096117729437537e-05, + "grad_norm": 29.6772403717041, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.861961841583252, + "num_tokens": 620734758.0, + "step": 16270 + }, + { + "epoch": 2.0698384429461902, + "ewc_loss": 0.046218641102313995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3109319954528473e-05, + "grad_norm": 29.77814483642578, + "learning_rate": 1e-06, + "loss": 0.4719, + 
"mean_token_accuracy": 0.8519009947776794, + "num_tokens": 620770224.0, + "step": 16271 + }, + { + "epoch": 2.0699656532247808, + "ewc_loss": 0.04628276824951172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3141383280744776e-05, + "grad_norm": 29.649824142456055, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8787846565246582, + "num_tokens": 620805724.0, + "step": 16272 + }, + { + "epoch": 2.070092863503371, + "ewc_loss": 0.0462254099547863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3112705093808472e-05, + "grad_norm": 29.73378562927246, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8594362735748291, + "num_tokens": 620847622.0, + "step": 16273 + }, + { + "epoch": 2.0702200737819614, + "ewc_loss": 0.046283964067697525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3141981728258543e-05, + "grad_norm": 29.731876373291016, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8678487539291382, + "num_tokens": 620879042.0, + "step": 16274 + }, + { + "epoch": 2.070347284060552, + "ewc_loss": 0.04627428948879242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3137145035434514e-05, + "grad_norm": 29.743831634521484, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8676071166992188, + "num_tokens": 620922685.0, + "step": 16275 + }, + { + "epoch": 2.0704744943391424, + "ewc_loss": 0.0463150329887867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3157515897764824e-05, + "grad_norm": 29.887893676757812, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8666679263114929, + "num_tokens": 620954854.0, + "step": 16276 + }, + { + "epoch": 2.070601704617733, + "ewc_loss": 0.04635756090283394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3178779883892275e-05, + "grad_norm": 29.790122985839844, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8678990602493286, + "num_tokens": 620994581.0, + "step": 16277 + }, + { + "epoch": 
2.0707289148963235, + "ewc_loss": 0.04620084539055824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.310042327735573e-05, + "grad_norm": 29.839258193969727, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8718420267105103, + "num_tokens": 621030052.0, + "step": 16278 + }, + { + "epoch": 2.070856125174914, + "ewc_loss": 0.04633527249097824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3167636754806153e-05, + "grad_norm": 29.695907592773438, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8801490664482117, + "num_tokens": 621072474.0, + "step": 16279 + }, + { + "epoch": 2.0709833354535045, + "ewc_loss": 0.04624611884355545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3123058781493455e-05, + "grad_norm": 29.868328094482422, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8735944032669067, + "num_tokens": 621104365.0, + "step": 16280 + }, + { + "epoch": 2.071110545732095, + "ewc_loss": 0.0463828407227993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3191420041257516e-05, + "grad_norm": 29.78049659729004, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8766700625419617, + "num_tokens": 621140497.0, + "step": 16281 + }, + { + "epoch": 2.0712377560106856, + "ewc_loss": 0.04626650735735893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313325421710033e-05, + "grad_norm": 29.88190269470215, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8741234540939331, + "num_tokens": 621177431.0, + "step": 16282 + }, + { + "epoch": 2.071364966289276, + "ewc_loss": 0.04634670540690422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3173352019512095e-05, + "grad_norm": 29.891233444213867, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8752195835113525, + "num_tokens": 621212874.0, + "step": 16283 + }, + { + "epoch": 2.0714921765678667, + "ewc_loss": 0.04620425030589104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.310212585143745e-05, + "grad_norm": 29.727886199951172, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8759424686431885, + "num_tokens": 621249927.0, + "step": 16284 + }, + { + "epoch": 2.071619386846457, + "ewc_loss": 0.04629109799861908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3145548766478896e-05, + "grad_norm": 29.896663665771484, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8462938070297241, + "num_tokens": 621280794.0, + "step": 16285 + }, + { + "epoch": 2.0717465971250477, + "ewc_loss": 0.046336349099874496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3168175175669603e-05, + "grad_norm": 29.869155883789062, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8549248576164246, + "num_tokens": 621321167.0, + "step": 16286 + }, + { + "epoch": 2.0718738074036382, + "ewc_loss": 0.04621075093746185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3105374566512182e-05, + "grad_norm": 29.766395568847656, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8829512596130371, + "num_tokens": 621357962.0, + "step": 16287 + }, + { + "epoch": 2.0720010176822288, + "ewc_loss": 0.04624296724796295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3121483536669984e-05, + "grad_norm": 29.85546875, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8896690607070923, + "num_tokens": 621398933.0, + "step": 16288 + }, + { + "epoch": 2.0721282279608193, + "ewc_loss": 0.04622167721390724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3110838810680434e-05, + "grad_norm": 29.867692947387695, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8903399705886841, + "num_tokens": 621442334.0, + "step": 16289 + }, + { + "epoch": 2.07225543823941, + "ewc_loss": 0.04621286690235138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3106433218345046e-05, + "grad_norm": 29.86212730407715, + "learning_rate": 1e-06, + "loss": 0.4059, + 
"mean_token_accuracy": 0.8734987378120422, + "num_tokens": 621478706.0, + "step": 16290 + }, + { + "epoch": 2.0723826485180004, + "ewc_loss": 0.046146586537361145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3073293050401844e-05, + "grad_norm": 29.848188400268555, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.876496434211731, + "num_tokens": 621513911.0, + "step": 16291 + }, + { + "epoch": 2.072509858796591, + "ewc_loss": 0.04622073844075203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.311036951141432e-05, + "grad_norm": 29.800189971923828, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.864886999130249, + "num_tokens": 621554080.0, + "step": 16292 + }, + { + "epoch": 2.0726370690751814, + "ewc_loss": 0.04619782418012619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3098911697161384e-05, + "grad_norm": 29.9653263092041, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8733913898468018, + "num_tokens": 621592059.0, + "step": 16293 + }, + { + "epoch": 2.072764279353772, + "ewc_loss": 0.04620768129825592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3103841158444993e-05, + "grad_norm": 29.788232803344727, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8831815719604492, + "num_tokens": 621629626.0, + "step": 16294 + }, + { + "epoch": 2.0728914896323625, + "ewc_loss": 0.046171292662620544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3085645807441324e-05, + "grad_norm": 29.9313907623291, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8757971525192261, + "num_tokens": 621671341.0, + "step": 16295 + }, + { + "epoch": 2.073018699910953, + "ewc_loss": 0.046257030218839645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3128515749704093e-05, + "grad_norm": 29.92530059814453, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8853517770767212, + "num_tokens": 621708756.0, + "step": 16296 + }, + { + "epoch": 
2.0731459101895435, + "ewc_loss": 0.04610498994588852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.30524947255617e-05, + "grad_norm": 29.73900604248047, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8763326406478882, + "num_tokens": 621748009.0, + "step": 16297 + }, + { + "epoch": 2.0732731204681336, + "ewc_loss": 0.046180739998817444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3090369722922333e-05, + "grad_norm": 29.86311912536621, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8802752494812012, + "num_tokens": 621788695.0, + "step": 16298 + }, + { + "epoch": 2.073400330746724, + "ewc_loss": 0.04613584280014038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3067921574693173e-05, + "grad_norm": 29.7135066986084, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8712751269340515, + "num_tokens": 621828099.0, + "step": 16299 + }, + { + "epoch": 2.0735275410253147, + "ewc_loss": 0.04617450758814812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308725379407406e-05, + "grad_norm": 29.887367248535156, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8548624515533447, + "num_tokens": 621867033.0, + "step": 16300 + }, + { + "epoch": 2.073654751303905, + "ewc_loss": 0.046226922422647476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3113461793400347e-05, + "grad_norm": 29.751672744750977, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8618464469909668, + "num_tokens": 621907435.0, + "step": 16301 + }, + { + "epoch": 2.0737819615824957, + "ewc_loss": 0.04614963382482529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3074817363522016e-05, + "grad_norm": 29.74413299560547, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8618564605712891, + "num_tokens": 621948995.0, + "step": 16302 + }, + { + "epoch": 2.0739091718610863, + "ewc_loss": 0.04632784426212311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3163922378444113e-05, + "grad_norm": 29.81369972229004, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8833990097045898, + "num_tokens": 621991109.0, + "step": 16303 + }, + { + "epoch": 2.074036382139677, + "ewc_loss": 0.046167902648448944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308395050931722e-05, + "grad_norm": 29.81200408935547, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8778209686279297, + "num_tokens": 622026585.0, + "step": 16304 + }, + { + "epoch": 2.0741635924182673, + "ewc_loss": 0.04620413854718208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.310206946276594e-05, + "grad_norm": 29.670490264892578, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8810735940933228, + "num_tokens": 622065208.0, + "step": 16305 + }, + { + "epoch": 2.074290802696858, + "ewc_loss": 0.04618854448199272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309427145519294e-05, + "grad_norm": 29.822154998779297, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8628324866294861, + "num_tokens": 622104838.0, + "step": 16306 + }, + { + "epoch": 2.0744180129754484, + "ewc_loss": 0.04631424695253372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.315712299605366e-05, + "grad_norm": 29.94968032836914, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8623588681221008, + "num_tokens": 622136879.0, + "step": 16307 + }, + { + "epoch": 2.074545223254039, + "ewc_loss": 0.04618454724550247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3092274204827845e-05, + "grad_norm": 29.74900245666504, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8832824230194092, + "num_tokens": 622175938.0, + "step": 16308 + }, + { + "epoch": 2.0746724335326294, + "ewc_loss": 0.04618053138256073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309026604052633e-05, + "grad_norm": 29.816707611083984, + "learning_rate": 1e-06, + "loss": 0.3959, + 
"mean_token_accuracy": 0.8783843517303467, + "num_tokens": 622216663.0, + "step": 16309 + }, + { + "epoch": 2.07479964381122, + "ewc_loss": 0.04620058462023735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3100292310118675e-05, + "grad_norm": 29.718355178833008, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8799045085906982, + "num_tokens": 622250450.0, + "step": 16310 + }, + { + "epoch": 2.0749268540898105, + "ewc_loss": 0.04621059074997902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3105294530978426e-05, + "grad_norm": 29.821300506591797, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8638241291046143, + "num_tokens": 622291919.0, + "step": 16311 + }, + { + "epoch": 2.075054064368401, + "ewc_loss": 0.04628375545144081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.314187804586254e-05, + "grad_norm": 29.85431671142578, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8712862730026245, + "num_tokens": 622325892.0, + "step": 16312 + }, + { + "epoch": 2.0751812746469915, + "ewc_loss": 0.0462774783372879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313873847015202e-05, + "grad_norm": 29.8300724029541, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8722819089889526, + "num_tokens": 622356966.0, + "step": 16313 + }, + { + "epoch": 2.075308484925582, + "ewc_loss": 0.04629150405526161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3145752493292093e-05, + "grad_norm": 29.897050857543945, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8750324249267578, + "num_tokens": 622389931.0, + "step": 16314 + }, + { + "epoch": 2.0754356952041726, + "ewc_loss": 0.046186476945877075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3093238269211724e-05, + "grad_norm": 29.736385345458984, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8708643317222595, + "num_tokens": 622427817.0, + "step": 16315 + }, + { + "epoch": 
2.075562905482763, + "ewc_loss": 0.046172212809324265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308610601176042e-05, + "grad_norm": 29.85643768310547, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8624991178512573, + "num_tokens": 622470353.0, + "step": 16316 + }, + { + "epoch": 2.0756901157613536, + "ewc_loss": 0.04630410298705101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3152051653596573e-05, + "grad_norm": 29.832355499267578, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8880069851875305, + "num_tokens": 622507577.0, + "step": 16317 + }, + { + "epoch": 2.075817326039944, + "ewc_loss": 0.0462188795208931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3109440007829107e-05, + "grad_norm": 29.72770118713379, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8673194646835327, + "num_tokens": 622544746.0, + "step": 16318 + }, + { + "epoch": 2.0759445363185347, + "ewc_loss": 0.046218231320381165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3109116227715276e-05, + "grad_norm": 29.707420349121094, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.86454176902771, + "num_tokens": 622583688.0, + "step": 16319 + }, + { + "epoch": 2.0760717465971252, + "ewc_loss": 0.0462755523622036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3137776224757545e-05, + "grad_norm": 29.80809211730957, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8716499209403992, + "num_tokens": 622619911.0, + "step": 16320 + }, + { + "epoch": 2.0761989568757158, + "ewc_loss": 0.04625510796904564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3127553504309617e-05, + "grad_norm": 29.76572608947754, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.877354085445404, + "num_tokens": 622665732.0, + "step": 16321 + }, + { + "epoch": 2.0763261671543063, + "ewc_loss": 0.04626282677054405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.313141339982394e-05, + "grad_norm": 29.843507766723633, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8790003061294556, + "num_tokens": 622705746.0, + "step": 16322 + }, + { + "epoch": 2.0764533774328964, + "ewc_loss": 0.04632170870900154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.316085374332033e-05, + "grad_norm": 29.820276260375977, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8729944825172424, + "num_tokens": 622745481.0, + "step": 16323 + }, + { + "epoch": 2.076580587711487, + "ewc_loss": 0.04628419876098633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3142099962569773e-05, + "grad_norm": 29.818727493286133, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8734647035598755, + "num_tokens": 622786910.0, + "step": 16324 + }, + { + "epoch": 2.0767077979900774, + "ewc_loss": 0.04626822471618652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3134112780098803e-05, + "grad_norm": 29.631460189819336, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.867688775062561, + "num_tokens": 622826976.0, + "step": 16325 + }, + { + "epoch": 2.076835008268668, + "ewc_loss": 0.046287164092063904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3143582438933663e-05, + "grad_norm": 29.921056747436523, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8722150325775146, + "num_tokens": 622858945.0, + "step": 16326 + }, + { + "epoch": 2.0769622185472585, + "ewc_loss": 0.046376634389162064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3188316845335066e-05, + "grad_norm": 29.750146865844727, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8836644291877747, + "num_tokens": 622893229.0, + "step": 16327 + }, + { + "epoch": 2.077089428825849, + "ewc_loss": 0.046152953058481216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.307647628185805e-05, + "grad_norm": 29.816621780395508, + "learning_rate": 1e-06, + "loss": 0.3861, + 
"mean_token_accuracy": 0.8813414573669434, + "num_tokens": 622933701.0, + "step": 16328 + }, + { + "epoch": 2.0772166391044395, + "ewc_loss": 0.04630308225750923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3151540517574176e-05, + "grad_norm": 29.826034545898438, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8567003011703491, + "num_tokens": 622971226.0, + "step": 16329 + }, + { + "epoch": 2.07734384938303, + "ewc_loss": 0.04626874998211861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3134374714572914e-05, + "grad_norm": 29.8397274017334, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8685414791107178, + "num_tokens": 623009232.0, + "step": 16330 + }, + { + "epoch": 2.0774710596616206, + "ewc_loss": 0.04626040905714035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3130203771870583e-05, + "grad_norm": 29.749954223632812, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8652715086936951, + "num_tokens": 623048005.0, + "step": 16331 + }, + { + "epoch": 2.077598269940211, + "ewc_loss": 0.04626220837235451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313110417162534e-05, + "grad_norm": 29.855669021606445, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8717608451843262, + "num_tokens": 623088474.0, + "step": 16332 + }, + { + "epoch": 2.0777254802188017, + "ewc_loss": 0.04625605791807175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3128028260543942e-05, + "grad_norm": 29.6202449798584, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8725996613502502, + "num_tokens": 623123700.0, + "step": 16333 + }, + { + "epoch": 2.077852690497392, + "ewc_loss": 0.04629248008131981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3146239982452244e-05, + "grad_norm": 29.907657623291016, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.873327910900116, + "num_tokens": 623161406.0, + "step": 16334 + }, + { + "epoch": 
2.0779799007759827, + "ewc_loss": 0.04627807065844536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.31390349654248e-05, + "grad_norm": 29.640527725219727, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8623956441879272, + "num_tokens": 623199770.0, + "step": 16335 + }, + { + "epoch": 2.0781071110545732, + "ewc_loss": 0.046256884932518005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312844299012795e-05, + "grad_norm": 29.808143615722656, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8736791014671326, + "num_tokens": 623234635.0, + "step": 16336 + }, + { + "epoch": 2.0782343213331638, + "ewc_loss": 0.04648539796471596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3242699171532877e-05, + "grad_norm": 29.82825469970703, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8736680746078491, + "num_tokens": 623273695.0, + "step": 16337 + }, + { + "epoch": 2.0783615316117543, + "ewc_loss": 0.04629451036453247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3147254978539422e-05, + "grad_norm": 29.714921951293945, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8560196161270142, + "num_tokens": 623315278.0, + "step": 16338 + }, + { + "epoch": 2.078488741890345, + "ewc_loss": 0.046321213245391846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.316060636076145e-05, + "grad_norm": 29.78152847290039, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8802427053451538, + "num_tokens": 623351753.0, + "step": 16339 + }, + { + "epoch": 2.0786159521689354, + "ewc_loss": 0.0463830940425396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3191547370515764e-05, + "grad_norm": 29.879959106445312, + "learning_rate": 1e-06, + "loss": 0.4899, + "mean_token_accuracy": 0.8538463711738586, + "num_tokens": 623387812.0, + "step": 16340 + }, + { + "epoch": 2.078743162447526, + "ewc_loss": 0.04636915773153305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.318457882211078e-05, + "grad_norm": 29.784164428710938, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8751528263092041, + "num_tokens": 623426784.0, + "step": 16341 + }, + { + "epoch": 2.0788703727261164, + "ewc_loss": 0.04622014984488487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3110074835130945e-05, + "grad_norm": 29.802669525146484, + "learning_rate": 1e-06, + "loss": 0.5028, + "mean_token_accuracy": 0.8488671779632568, + "num_tokens": 623463362.0, + "step": 16342 + }, + { + "epoch": 2.078997583004707, + "ewc_loss": 0.046348508447408676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3174254238256253e-05, + "grad_norm": 29.83511734008789, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8651392459869385, + "num_tokens": 623500108.0, + "step": 16343 + }, + { + "epoch": 2.0791247932832975, + "ewc_loss": 0.04629335552453995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3146678358898498e-05, + "grad_norm": 29.753646850585938, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.868387758731842, + "num_tokens": 623537159.0, + "step": 16344 + }, + { + "epoch": 2.079252003561888, + "ewc_loss": 0.04620550945401192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3102755221771076e-05, + "grad_norm": 29.83586883544922, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8897863030433655, + "num_tokens": 623576256.0, + "step": 16345 + }, + { + "epoch": 2.0793792138404785, + "ewc_loss": 0.0464213602244854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.321067950106226e-05, + "grad_norm": 29.878192901611328, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8679447770118713, + "num_tokens": 623612454.0, + "step": 16346 + }, + { + "epoch": 2.0795064241190686, + "ewc_loss": 0.0463118702173233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3155935195973143e-05, + "grad_norm": 29.870119094848633, + "learning_rate": 1e-06, + "loss": 0.3688, + 
"mean_token_accuracy": 0.8852360844612122, + "num_tokens": 623654392.0, + "step": 16347 + }, + { + "epoch": 2.079633634397659, + "ewc_loss": 0.04632359743118286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.316179779882077e-05, + "grad_norm": 29.867753982543945, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8528276681900024, + "num_tokens": 623697350.0, + "step": 16348 + }, + { + "epoch": 2.0797608446762497, + "ewc_loss": 0.04630805924534798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3154028895078227e-05, + "grad_norm": 29.91330337524414, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8749555349349976, + "num_tokens": 623736272.0, + "step": 16349 + }, + { + "epoch": 2.07988805495484, + "ewc_loss": 0.046217575669288635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3108786990633234e-05, + "grad_norm": 29.853363037109375, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8621428608894348, + "num_tokens": 623781319.0, + "step": 16350 + }, + { + "epoch": 2.0800152652334307, + "ewc_loss": 0.04629021883010864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3145108571043238e-05, + "grad_norm": 29.911314010620117, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.872128963470459, + "num_tokens": 623821944.0, + "step": 16351 + }, + { + "epoch": 2.0801424755120212, + "ewc_loss": 0.04627874866127968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313937511644326e-05, + "grad_norm": 29.84029197692871, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8663256168365479, + "num_tokens": 623858399.0, + "step": 16352 + }, + { + "epoch": 2.0802696857906118, + "ewc_loss": 0.046207964420318604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.310398303961847e-05, + "grad_norm": 29.87253761291504, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8731484413146973, + "num_tokens": 623897839.0, + "step": 16353 + }, + { + "epoch": 
2.0803968960692023, + "ewc_loss": 0.04620954394340515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.310477248101961e-05, + "grad_norm": 29.83030891418457, + "learning_rate": 1e-06, + "loss": 0.477, + "mean_token_accuracy": 0.8553347587585449, + "num_tokens": 623933127.0, + "step": 16354 + }, + { + "epoch": 2.080524106347793, + "ewc_loss": 0.04619785025715828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309892443008721e-05, + "grad_norm": 29.870864868164062, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8712027072906494, + "num_tokens": 623972140.0, + "step": 16355 + }, + { + "epoch": 2.0806513166263834, + "ewc_loss": 0.046213582158088684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3106791559257545e-05, + "grad_norm": 29.75910186767578, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8597511053085327, + "num_tokens": 624008875.0, + "step": 16356 + }, + { + "epoch": 2.080778526904974, + "ewc_loss": 0.04619278386235237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3096392396837473e-05, + "grad_norm": 29.73626708984375, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8685312867164612, + "num_tokens": 624045162.0, + "step": 16357 + }, + { + "epoch": 2.0809057371835644, + "ewc_loss": 0.04625287279486656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312643664481584e-05, + "grad_norm": 29.85421371459961, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8809582591056824, + "num_tokens": 624085275.0, + "step": 16358 + }, + { + "epoch": 2.081032947462155, + "ewc_loss": 0.04628325253725052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.314162702532485e-05, + "grad_norm": 29.666505813598633, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8685274124145508, + "num_tokens": 624127479.0, + "step": 16359 + }, + { + "epoch": 2.0811601577407455, + "ewc_loss": 0.046195004135370255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3097501980373636e-05, + "grad_norm": 29.783159255981445, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8743352293968201, + "num_tokens": 624165231.0, + "step": 16360 + }, + { + "epoch": 2.081287368019336, + "ewc_loss": 0.046284712851047516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3142356440075673e-05, + "grad_norm": 29.7537784576416, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8597183227539062, + "num_tokens": 624205109.0, + "step": 16361 + }, + { + "epoch": 2.0814145782979265, + "ewc_loss": 0.046286266297101974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3143133148550987e-05, + "grad_norm": 29.933992385864258, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8667344450950623, + "num_tokens": 624241849.0, + "step": 16362 + }, + { + "epoch": 2.081541788576517, + "ewc_loss": 0.046291932463645935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3145965315052308e-05, + "grad_norm": 29.763410568237305, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8723662495613098, + "num_tokens": 624274174.0, + "step": 16363 + }, + { + "epoch": 2.0816689988551076, + "ewc_loss": 0.04620284214615822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3101421902538277e-05, + "grad_norm": 29.801551818847656, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8690446615219116, + "num_tokens": 624311526.0, + "step": 16364 + }, + { + "epoch": 2.081796209133698, + "ewc_loss": 0.046326301991939545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.316315112693701e-05, + "grad_norm": 29.90821647644043, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8566210269927979, + "num_tokens": 624346657.0, + "step": 16365 + }, + { + "epoch": 2.0819234194122886, + "ewc_loss": 0.04623208940029144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3116044758353382e-05, + "grad_norm": 29.751197814941406, + "learning_rate": 1e-06, + "loss": 0.4633, + 
"mean_token_accuracy": 0.8561574220657349, + "num_tokens": 624386563.0, + "step": 16366 + }, + { + "epoch": 2.082050629690879, + "ewc_loss": 0.04625415429472923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3127076929085888e-05, + "grad_norm": 29.80804443359375, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.880811870098114, + "num_tokens": 624427299.0, + "step": 16367 + }, + { + "epoch": 2.0821778399694697, + "ewc_loss": 0.046241044998168945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3120523110264912e-05, + "grad_norm": 29.858062744140625, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.853611409664154, + "num_tokens": 624461253.0, + "step": 16368 + }, + { + "epoch": 2.0823050502480602, + "ewc_loss": 0.04625270888209343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312635479029268e-05, + "grad_norm": 29.862154006958008, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8796243667602539, + "num_tokens": 624498699.0, + "step": 16369 + }, + { + "epoch": 2.0824322605266508, + "ewc_loss": 0.046191323548555374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3095661163097247e-05, + "grad_norm": 29.738155364990234, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8685543537139893, + "num_tokens": 624534655.0, + "step": 16370 + }, + { + "epoch": 2.082559470805241, + "ewc_loss": 0.046275943517684937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3137972675613128e-05, + "grad_norm": 29.897327423095703, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8742701411247253, + "num_tokens": 624575342.0, + "step": 16371 + }, + { + "epoch": 2.0826866810838314, + "ewc_loss": 0.04625363275408745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312681681360118e-05, + "grad_norm": 29.805788040161133, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8752047419548035, + "num_tokens": 624608493.0, + "step": 16372 + }, + { + "epoch": 
2.082813891362422, + "ewc_loss": 0.04618873447179794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3094367861631326e-05, + "grad_norm": 29.72045135498047, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8529209494590759, + "num_tokens": 624655777.0, + "step": 16373 + }, + { + "epoch": 2.0829411016410124, + "ewc_loss": 0.046280935406684875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3140468329074793e-05, + "grad_norm": 29.705514907836914, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8811758756637573, + "num_tokens": 624699349.0, + "step": 16374 + }, + { + "epoch": 2.083068311919603, + "ewc_loss": 0.04637226089835167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3186130420072004e-05, + "grad_norm": 29.9020938873291, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8782968521118164, + "num_tokens": 624730146.0, + "step": 16375 + }, + { + "epoch": 2.0831955221981935, + "ewc_loss": 0.046305522322654724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3152761059463955e-05, + "grad_norm": 29.765233993530273, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.863095760345459, + "num_tokens": 624773186.0, + "step": 16376 + }, + { + "epoch": 2.083322732476784, + "ewc_loss": 0.04632901772856712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3164508093032055e-05, + "grad_norm": 29.97917938232422, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8689138889312744, + "num_tokens": 624810256.0, + "step": 16377 + }, + { + "epoch": 2.0834499427553745, + "ewc_loss": 0.04643242433667183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3216212866827846e-05, + "grad_norm": 29.89020347595215, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8616029024124146, + "num_tokens": 624848166.0, + "step": 16378 + }, + { + "epoch": 2.083577153033965, + "ewc_loss": 0.04624437913298607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.312218930455856e-05, + "grad_norm": 29.816638946533203, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8696751594543457, + "num_tokens": 624887601.0, + "step": 16379 + }, + { + "epoch": 2.0837043633125556, + "ewc_loss": 0.04636026918888092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3180135030997917e-05, + "grad_norm": 30.000585556030273, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8708769083023071, + "num_tokens": 624921160.0, + "step": 16380 + }, + { + "epoch": 2.083831573591146, + "ewc_loss": 0.04620460420846939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3102302293409593e-05, + "grad_norm": 29.68075942993164, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8631462454795837, + "num_tokens": 624959053.0, + "step": 16381 + }, + { + "epoch": 2.0839587838697367, + "ewc_loss": 0.04624343663454056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3121718186303042e-05, + "grad_norm": 29.976755142211914, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8837127089500427, + "num_tokens": 624995272.0, + "step": 16382 + }, + { + "epoch": 2.084085994148327, + "ewc_loss": 0.046307213604450226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3153606889536604e-05, + "grad_norm": 29.81722640991211, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.8869256377220154, + "num_tokens": 625033388.0, + "step": 16383 + }, + { + "epoch": 2.0842132044269177, + "ewc_loss": 0.046197857707738876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3098928068066016e-05, + "grad_norm": 29.967374801635742, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8609457015991211, + "num_tokens": 625071103.0, + "step": 16384 + }, + { + "epoch": 2.0843404147055082, + "ewc_loss": 0.04624584689736366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3122924176277593e-05, + "grad_norm": 29.729032516479492, + "learning_rate": 1e-06, + "loss": 0.3833, + 
"mean_token_accuracy": 0.8824829459190369, + "num_tokens": 625109562.0, + "step": 16385 + }, + { + "epoch": 2.0844676249840988, + "ewc_loss": 0.046153489500284195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3076745492289774e-05, + "grad_norm": 29.846282958984375, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.876341700553894, + "num_tokens": 625147438.0, + "step": 16386 + }, + { + "epoch": 2.0845948352626893, + "ewc_loss": 0.04633377492427826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3166887331171893e-05, + "grad_norm": 29.822816848754883, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8799346685409546, + "num_tokens": 625184293.0, + "step": 16387 + }, + { + "epoch": 2.08472204554128, + "ewc_loss": 0.04630785435438156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.315392703167163e-05, + "grad_norm": 29.855987548828125, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8721356987953186, + "num_tokens": 625218201.0, + "step": 16388 + }, + { + "epoch": 2.0848492558198704, + "ewc_loss": 0.04630628973245621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3153144866228104e-05, + "grad_norm": 29.864294052124023, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8691822290420532, + "num_tokens": 625252072.0, + "step": 16389 + }, + { + "epoch": 2.084976466098461, + "ewc_loss": 0.04636254906654358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3181273718364537e-05, + "grad_norm": 29.879472732543945, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8832044005393982, + "num_tokens": 625286990.0, + "step": 16390 + }, + { + "epoch": 2.0851036763770514, + "ewc_loss": 0.046320199966430664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3160100681707263e-05, + "grad_norm": 29.96802520751953, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8816283345222473, + "num_tokens": 625326579.0, + "step": 16391 + }, + { + "epoch": 
2.085230886655642, + "ewc_loss": 0.04626275226473808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313137702003587e-05, + "grad_norm": 29.8328914642334, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8761171102523804, + "num_tokens": 625367317.0, + "step": 16392 + }, + { + "epoch": 2.0853580969342325, + "ewc_loss": 0.046333733946084976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3166867322288454e-05, + "grad_norm": 29.946258544921875, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8642591834068298, + "num_tokens": 625401047.0, + "step": 16393 + }, + { + "epoch": 2.085485307212823, + "ewc_loss": 0.04624299332499504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312149626959581e-05, + "grad_norm": 29.860008239746094, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8565022945404053, + "num_tokens": 625441878.0, + "step": 16394 + }, + { + "epoch": 2.0856125174914135, + "ewc_loss": 0.04626951739192009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3134758521337062e-05, + "grad_norm": 29.85028839111328, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8692965507507324, + "num_tokens": 625477007.0, + "step": 16395 + }, + { + "epoch": 2.0857397277700036, + "ewc_loss": 0.0463232584297657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.316162863280624e-05, + "grad_norm": 29.905704498291016, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8833654522895813, + "num_tokens": 625519908.0, + "step": 16396 + }, + { + "epoch": 2.085866938048594, + "ewc_loss": 0.04624227061867714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3121136109693907e-05, + "grad_norm": 29.814037322998047, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8732957243919373, + "num_tokens": 625554946.0, + "step": 16397 + }, + { + "epoch": 2.0859941483271847, + "ewc_loss": 0.04618526250123978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.309263072675094e-05, + "grad_norm": 29.741235733032227, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8818652033805847, + "num_tokens": 625592121.0, + "step": 16398 + }, + { + "epoch": 2.086121358605775, + "ewc_loss": 0.04635082185268402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.317541111551691e-05, + "grad_norm": 29.8864803314209, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8692313432693481, + "num_tokens": 625630596.0, + "step": 16399 + }, + { + "epoch": 2.0862485688843657, + "ewc_loss": 0.04620979353785515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.310489617229905e-05, + "grad_norm": 29.680387496948242, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8531966209411621, + "num_tokens": 625669535.0, + "step": 16400 + }, + { + "epoch": 2.0863757791629562, + "ewc_loss": 0.046280600130558014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3140299163060263e-05, + "grad_norm": 29.817672729492188, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8685967922210693, + "num_tokens": 625710072.0, + "step": 16401 + }, + { + "epoch": 2.0865029894415468, + "ewc_loss": 0.04632767289876938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3163836885942146e-05, + "grad_norm": 29.737051010131836, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8783760070800781, + "num_tokens": 625746328.0, + "step": 16402 + }, + { + "epoch": 2.0866301997201373, + "ewc_loss": 0.04637147858738899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3185739337350242e-05, + "grad_norm": 29.902942657470703, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8716453909873962, + "num_tokens": 625787108.0, + "step": 16403 + }, + { + "epoch": 2.086757409998728, + "ewc_loss": 0.04634857922792435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.317428879905492e-05, + "grad_norm": 29.79228401184082, + "learning_rate": 1e-06, + "loss": 0.4105, + 
"mean_token_accuracy": 0.8741669058799744, + "num_tokens": 625827887.0, + "step": 16404 + }, + { + "epoch": 2.0868846202773184, + "ewc_loss": 0.046347711235284805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3173855879576877e-05, + "grad_norm": 29.84836196899414, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8708121180534363, + "num_tokens": 625863475.0, + "step": 16405 + }, + { + "epoch": 2.087011830555909, + "ewc_loss": 0.04634779319167137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3173895897343755e-05, + "grad_norm": 29.78845977783203, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8866782784461975, + "num_tokens": 625902369.0, + "step": 16406 + }, + { + "epoch": 2.0871390408344994, + "ewc_loss": 0.04626336321234703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3131680791266263e-05, + "grad_norm": 29.862060546875, + "learning_rate": 1e-06, + "loss": 0.489, + "mean_token_accuracy": 0.8463276624679565, + "num_tokens": 625941054.0, + "step": 16407 + }, + { + "epoch": 2.08726625111309, + "ewc_loss": 0.04632290452718735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3161452190834098e-05, + "grad_norm": 29.74498176574707, + "learning_rate": 1e-06, + "loss": 0.4916, + "mean_token_accuracy": 0.8533880114555359, + "num_tokens": 625981113.0, + "step": 16408 + }, + { + "epoch": 2.0873934613916805, + "ewc_loss": 0.04626128450036049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3130642148316838e-05, + "grad_norm": 29.92165756225586, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8678699731826782, + "num_tokens": 626014758.0, + "step": 16409 + }, + { + "epoch": 2.087520671670271, + "ewc_loss": 0.04633501172065735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3167505787569098e-05, + "grad_norm": 29.803970336914062, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8564287424087524, + "num_tokens": 626054442.0, + "step": 16410 + }, + { + "epoch": 
2.0876478819488615, + "ewc_loss": 0.046203795820474625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3101898477762006e-05, + "grad_norm": 29.866735458374023, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8526730537414551, + "num_tokens": 626093993.0, + "step": 16411 + }, + { + "epoch": 2.087775092227452, + "ewc_loss": 0.04630593582987785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.315296842425596e-05, + "grad_norm": 29.87021827697754, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8531718254089355, + "num_tokens": 626131242.0, + "step": 16412 + }, + { + "epoch": 2.0879023025060426, + "ewc_loss": 0.046255387365818024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312769356649369e-05, + "grad_norm": 29.85272979736328, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8549849987030029, + "num_tokens": 626162137.0, + "step": 16413 + }, + { + "epoch": 2.088029512784633, + "ewc_loss": 0.04634156450629234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3170781787484884e-05, + "grad_norm": 29.763988494873047, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8713057041168213, + "num_tokens": 626201031.0, + "step": 16414 + }, + { + "epoch": 2.0881567230632236, + "ewc_loss": 0.046273890882730484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3136944946600124e-05, + "grad_norm": 29.980051040649414, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8797654509544373, + "num_tokens": 626243647.0, + "step": 16415 + }, + { + "epoch": 2.088283933341814, + "ewc_loss": 0.046365223824977875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3182612494565547e-05, + "grad_norm": 29.835657119750977, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8753950595855713, + "num_tokens": 626281346.0, + "step": 16416 + }, + { + "epoch": 2.0884111436204047, + "ewc_loss": 0.04621801897883415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3109008907340467e-05, + "grad_norm": 30.021093368530273, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8803150653839111, + "num_tokens": 626315964.0, + "step": 16417 + }, + { + "epoch": 2.0885383538989952, + "ewc_loss": 0.046363234519958496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3181617507361807e-05, + "grad_norm": 29.881704330444336, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.863940954208374, + "num_tokens": 626367920.0, + "step": 16418 + }, + { + "epoch": 2.0886655641775858, + "ewc_loss": 0.046174611896276474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.308730654476676e-05, + "grad_norm": 29.852853775024414, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8664973378181458, + "num_tokens": 626412123.0, + "step": 16419 + }, + { + "epoch": 2.0887927744561763, + "ewc_loss": 0.04629381746053696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.314690937055275e-05, + "grad_norm": 29.9034423828125, + "learning_rate": 1e-06, + "loss": 0.5079, + "mean_token_accuracy": 0.8423947691917419, + "num_tokens": 626451560.0, + "step": 16420 + }, + { + "epoch": 2.0889199847347664, + "ewc_loss": 0.04621201008558273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3106005755835213e-05, + "grad_norm": 29.811052322387695, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.864482045173645, + "num_tokens": 626489361.0, + "step": 16421 + }, + { + "epoch": 2.089047195013357, + "ewc_loss": 0.046274565160274506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3137283278629184e-05, + "grad_norm": 29.863906860351562, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8852870464324951, + "num_tokens": 626527150.0, + "step": 16422 + }, + { + "epoch": 2.0891744052919474, + "ewc_loss": 0.04619884490966797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309942283318378e-05, + "grad_norm": 29.88273811340332, + "learning_rate": 1e-06, + "loss": 0.4389, + 
"mean_token_accuracy": 0.8627041578292847, + "num_tokens": 626568110.0, + "step": 16423 + }, + { + "epoch": 2.089301615570538, + "ewc_loss": 0.046222466975450516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3111233531381004e-05, + "grad_norm": 29.926321029663086, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8713361024856567, + "num_tokens": 626607215.0, + "step": 16424 + }, + { + "epoch": 2.0894288258491285, + "ewc_loss": 0.046167075634002686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3083537598722614e-05, + "grad_norm": 29.94200325012207, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8749637603759766, + "num_tokens": 626644424.0, + "step": 16425 + }, + { + "epoch": 2.089556036127719, + "ewc_loss": 0.04618356376886368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3091781258699484e-05, + "grad_norm": 29.88231658935547, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8489218950271606, + "num_tokens": 626689509.0, + "step": 16426 + }, + { + "epoch": 2.0896832464063095, + "ewc_loss": 0.04618178680539131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3090893591870554e-05, + "grad_norm": 29.819093704223633, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8585125803947449, + "num_tokens": 626733866.0, + "step": 16427 + }, + { + "epoch": 2.0898104566849, + "ewc_loss": 0.04617690667510033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3088452508090995e-05, + "grad_norm": 29.901395797729492, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8790789842605591, + "num_tokens": 626767465.0, + "step": 16428 + }, + { + "epoch": 2.0899376669634906, + "ewc_loss": 0.04620321840047836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3101609258446842e-05, + "grad_norm": 29.94999885559082, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8567166328430176, + "num_tokens": 626806049.0, + "step": 16429 + }, + { + "epoch": 
2.090064877242081, + "ewc_loss": 0.046168141067028046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3084070562617853e-05, + "grad_norm": 29.949050903320312, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8559772968292236, + "num_tokens": 626848917.0, + "step": 16430 + }, + { + "epoch": 2.0901920875206716, + "ewc_loss": 0.04617031663656235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3085158318281174e-05, + "grad_norm": 29.847333908081055, + "learning_rate": 1e-06, + "loss": 0.4796, + "mean_token_accuracy": 0.8569377064704895, + "num_tokens": 626884442.0, + "step": 16431 + }, + { + "epoch": 2.090319297799262, + "ewc_loss": 0.046159178018569946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3079588572727516e-05, + "grad_norm": 29.815284729003906, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8669456839561462, + "num_tokens": 626922517.0, + "step": 16432 + }, + { + "epoch": 2.0904465080778527, + "ewc_loss": 0.04623507335782051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.311753632966429e-05, + "grad_norm": 29.9200439453125, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8685441613197327, + "num_tokens": 626962880.0, + "step": 16433 + }, + { + "epoch": 2.0905737183564432, + "ewc_loss": 0.04627668112516403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3138340111472644e-05, + "grad_norm": 29.913114547729492, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8771032691001892, + "num_tokens": 627008115.0, + "step": 16434 + }, + { + "epoch": 2.0907009286350338, + "ewc_loss": 0.04618207365274429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3091037292033434e-05, + "grad_norm": 29.868486404418945, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8810847997665405, + "num_tokens": 627043935.0, + "step": 16435 + }, + { + "epoch": 2.0908281389136243, + "ewc_loss": 0.04620283842086792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3101420083548874e-05, + "grad_norm": 29.908611297607422, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.870928943157196, + "num_tokens": 627078495.0, + "step": 16436 + }, + { + "epoch": 2.090955349192215, + "ewc_loss": 0.046231240034103394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3115619114832953e-05, + "grad_norm": 29.8341064453125, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8740472793579102, + "num_tokens": 627116652.0, + "step": 16437 + }, + { + "epoch": 2.0910825594708053, + "ewc_loss": 0.04618285223841667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3091426555765793e-05, + "grad_norm": 29.908973693847656, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8775393962860107, + "num_tokens": 627152326.0, + "step": 16438 + }, + { + "epoch": 2.091209769749396, + "ewc_loss": 0.04621891677379608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3109458197723143e-05, + "grad_norm": 29.86765480041504, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8797802925109863, + "num_tokens": 627189502.0, + "step": 16439 + }, + { + "epoch": 2.0913369800279864, + "ewc_loss": 0.04622581973671913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3112910639611073e-05, + "grad_norm": 29.9859561920166, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8845021724700928, + "num_tokens": 627225438.0, + "step": 16440 + }, + { + "epoch": 2.091464190306577, + "ewc_loss": 0.04634274169802666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3171371140051633e-05, + "grad_norm": 29.919147491455078, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8663015365600586, + "num_tokens": 627258154.0, + "step": 16441 + }, + { + "epoch": 2.0915914005851675, + "ewc_loss": 0.0462578684091568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3128934117266908e-05, + "grad_norm": 29.829647064208984, + "learning_rate": 1e-06, + "loss": 0.4421, + 
"mean_token_accuracy": 0.8628469705581665, + "num_tokens": 627301804.0, + "step": 16442 + }, + { + "epoch": 2.091718610863758, + "ewc_loss": 0.0462457612156868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3122880520531908e-05, + "grad_norm": 29.935504913330078, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8738396763801575, + "num_tokens": 627344301.0, + "step": 16443 + }, + { + "epoch": 2.0918458211423485, + "ewc_loss": 0.046388816088438034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.319440864084754e-05, + "grad_norm": 29.90526580810547, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8778069019317627, + "num_tokens": 627381068.0, + "step": 16444 + }, + { + "epoch": 2.0919730314209386, + "ewc_loss": 0.04629172384738922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3145861632656306e-05, + "grad_norm": 29.891162872314453, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8634601831436157, + "num_tokens": 627423410.0, + "step": 16445 + }, + { + "epoch": 2.092100241699529, + "ewc_loss": 0.04633359983563423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3166800019680522e-05, + "grad_norm": 29.86284637451172, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8795679211616516, + "num_tokens": 627456593.0, + "step": 16446 + }, + { + "epoch": 2.0922274519781197, + "ewc_loss": 0.046262893825769424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3131446141633205e-05, + "grad_norm": 30.02831268310547, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8722548484802246, + "num_tokens": 627493742.0, + "step": 16447 + }, + { + "epoch": 2.09235466225671, + "ewc_loss": 0.046299874782562256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3149937987909652e-05, + "grad_norm": 29.880809783935547, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8763895034790039, + "num_tokens": 627529809.0, + "step": 16448 + }, + { + "epoch": 
2.0924818725353007, + "ewc_loss": 0.04618818685412407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.309409319423139e-05, + "grad_norm": 29.850788116455078, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8727728724479675, + "num_tokens": 627570693.0, + "step": 16449 + }, + { + "epoch": 2.0926090828138912, + "ewc_loss": 0.04632541537284851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3162707293522544e-05, + "grad_norm": 29.828617095947266, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8775060772895813, + "num_tokens": 627607608.0, + "step": 16450 + }, + { + "epoch": 2.0927362930924818, + "ewc_loss": 0.04630096256732941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3150481865741313e-05, + "grad_norm": 30.002111434936523, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8677161931991577, + "num_tokens": 627642102.0, + "step": 16451 + }, + { + "epoch": 2.0928635033710723, + "ewc_loss": 0.04634246602654457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3171232896856964e-05, + "grad_norm": 29.814191818237305, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8644055128097534, + "num_tokens": 627678838.0, + "step": 16452 + }, + { + "epoch": 2.092990713649663, + "ewc_loss": 0.046250082552433014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.312504147994332e-05, + "grad_norm": 29.890172958374023, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.873339831829071, + "num_tokens": 627714952.0, + "step": 16453 + }, + { + "epoch": 2.0931179239282534, + "ewc_loss": 0.046367790549993515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3183894882095046e-05, + "grad_norm": 29.98792266845703, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8603866100311279, + "num_tokens": 627754710.0, + "step": 16454 + }, + { + "epoch": 2.093245134206844, + "ewc_loss": 0.0462392196059227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3119609977584332e-05, + "grad_norm": 29.86098861694336, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.875071108341217, + "num_tokens": 627791504.0, + "step": 16455 + }, + { + "epoch": 2.0933723444854344, + "ewc_loss": 0.04624389857053757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3121949197957292e-05, + "grad_norm": 29.929044723510742, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.883985161781311, + "num_tokens": 627829683.0, + "step": 16456 + }, + { + "epoch": 2.093499554764025, + "ewc_loss": 0.04633910953998566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.316955396963749e-05, + "grad_norm": 29.803369522094727, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8706293106079102, + "num_tokens": 627869826.0, + "step": 16457 + }, + { + "epoch": 2.0936267650426155, + "ewc_loss": 0.04631408303976059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.31570411415305e-05, + "grad_norm": 29.840221405029297, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8668084144592285, + "num_tokens": 627904165.0, + "step": 16458 + }, + { + "epoch": 2.093753975321206, + "ewc_loss": 0.046349022537469864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3174510715762153e-05, + "grad_norm": 29.772493362426758, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8754900693893433, + "num_tokens": 627936559.0, + "step": 16459 + }, + { + "epoch": 2.0938811855997965, + "ewc_loss": 0.046406589448451996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3203294404083863e-05, + "grad_norm": 30.00904083251953, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8756865859031677, + "num_tokens": 627971742.0, + "step": 16460 + }, + { + "epoch": 2.094008395878387, + "ewc_loss": 0.046345800161361694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3172900910140015e-05, + "grad_norm": 29.872220993041992, + "learning_rate": 1e-06, + "loss": 0.3876, + 
"mean_token_accuracy": 0.8793565630912781, + "num_tokens": 628013104.0, + "step": 16461 + }, + { + "epoch": 2.0941356061569776, + "ewc_loss": 0.046276092529296875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313804543518927e-05, + "grad_norm": 29.93173599243164, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.867186427116394, + "num_tokens": 628049378.0, + "step": 16462 + }, + { + "epoch": 2.094262816435568, + "ewc_loss": 0.046310193836688995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.315509664185811e-05, + "grad_norm": 29.93364906311035, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8880805969238281, + "num_tokens": 628087213.0, + "step": 16463 + }, + { + "epoch": 2.0943900267141586, + "ewc_loss": 0.04631303250789642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3156515453592874e-05, + "grad_norm": 29.88496208190918, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8754205107688904, + "num_tokens": 628122060.0, + "step": 16464 + }, + { + "epoch": 2.094517236992749, + "ewc_loss": 0.046322647482156754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3161323042586446e-05, + "grad_norm": 29.928543090820312, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8738288879394531, + "num_tokens": 628166554.0, + "step": 16465 + }, + { + "epoch": 2.0946444472713397, + "ewc_loss": 0.046316344290971756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3158172552939504e-05, + "grad_norm": 29.7304744720459, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8688918352127075, + "num_tokens": 628199953.0, + "step": 16466 + }, + { + "epoch": 2.09477165754993, + "ewc_loss": 0.046250201761722565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3125101506593637e-05, + "grad_norm": 29.851099014282227, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8727520704269409, + "num_tokens": 628237044.0, + "step": 16467 + }, + { + "epoch": 
2.0948988678285207, + "ewc_loss": 0.04645227640867233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3226137273013592e-05, + "grad_norm": 29.868314743041992, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8731377124786377, + "num_tokens": 628275773.0, + "step": 16468 + }, + { + "epoch": 2.095026078107111, + "ewc_loss": 0.0463285967707634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3164298909250647e-05, + "grad_norm": 29.776111602783203, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.856560230255127, + "num_tokens": 628314549.0, + "step": 16469 + }, + { + "epoch": 2.0951532883857014, + "ewc_loss": 0.04639364033937454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3196820620796643e-05, + "grad_norm": 29.817655563354492, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8657304048538208, + "num_tokens": 628349417.0, + "step": 16470 + }, + { + "epoch": 2.095280498664292, + "ewc_loss": 0.04644770547747612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3223852622322738e-05, + "grad_norm": 29.861806869506836, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8747875690460205, + "num_tokens": 628384435.0, + "step": 16471 + }, + { + "epoch": 2.0954077089428824, + "ewc_loss": 0.046415816992521286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.320790918020066e-05, + "grad_norm": 29.81177520751953, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8582801818847656, + "num_tokens": 628424124.0, + "step": 16472 + }, + { + "epoch": 2.095534919221473, + "ewc_loss": 0.046467915177345276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3233957108459435e-05, + "grad_norm": 29.833633422851562, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8719481229782104, + "num_tokens": 628462412.0, + "step": 16473 + }, + { + "epoch": 2.0956621295000635, + "ewc_loss": 0.046413108706474304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3206554033095017e-05, + "grad_norm": 29.84379005432129, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8529119491577148, + "num_tokens": 628499066.0, + "step": 16474 + }, + { + "epoch": 2.095789339778654, + "ewc_loss": 0.04640717804431915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3203589080367237e-05, + "grad_norm": 29.912445068359375, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8842244148254395, + "num_tokens": 628532379.0, + "step": 16475 + }, + { + "epoch": 2.0959165500572445, + "ewc_loss": 0.04648814722895622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3244074327521957e-05, + "grad_norm": 29.900165557861328, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8797940015792847, + "num_tokens": 628570497.0, + "step": 16476 + }, + { + "epoch": 2.096043760335835, + "ewc_loss": 0.04637661948800087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3188309569377452e-05, + "grad_norm": 29.782907485961914, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8758662939071655, + "num_tokens": 628608298.0, + "step": 16477 + }, + { + "epoch": 2.0961709706144256, + "ewc_loss": 0.04641105607151985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3205528123071417e-05, + "grad_norm": 29.945837020874023, + "learning_rate": 1e-06, + "loss": 0.3586, + "mean_token_accuracy": 0.8894616961479187, + "num_tokens": 628641217.0, + "step": 16478 + }, + { + "epoch": 2.096298180893016, + "ewc_loss": 0.04639621824026108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3198108465294354e-05, + "grad_norm": 29.862886428833008, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.871571958065033, + "num_tokens": 628682480.0, + "step": 16479 + }, + { + "epoch": 2.0964253911716066, + "ewc_loss": 0.04650387167930603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3251935999724083e-05, + "grad_norm": 29.8487548828125, + "learning_rate": 1e-06, + "loss": 0.4221, + 
"mean_token_accuracy": 0.8687976598739624, + "num_tokens": 628718359.0, + "step": 16480 + }, + { + "epoch": 2.096552601450197, + "ewc_loss": 0.046460747718811035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.323037369933445e-05, + "grad_norm": 29.88618278503418, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8780761957168579, + "num_tokens": 628752214.0, + "step": 16481 + }, + { + "epoch": 2.0966798117287877, + "ewc_loss": 0.04644935950636864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.322468026250135e-05, + "grad_norm": 29.776451110839844, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8838691711425781, + "num_tokens": 628787882.0, + "step": 16482 + }, + { + "epoch": 2.0968070220073782, + "ewc_loss": 0.046491652727127075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3245826014317572e-05, + "grad_norm": 29.962915420532227, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.867999255657196, + "num_tokens": 628824910.0, + "step": 16483 + }, + { + "epoch": 2.0969342322859688, + "ewc_loss": 0.04653060808777809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.326530375285074e-05, + "grad_norm": 29.755111694335938, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8818539381027222, + "num_tokens": 628860657.0, + "step": 16484 + }, + { + "epoch": 2.0970614425645593, + "ewc_loss": 0.04645456746220589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3227283236337826e-05, + "grad_norm": 29.960514068603516, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8540232181549072, + "num_tokens": 628900619.0, + "step": 16485 + }, + { + "epoch": 2.09718865284315, + "ewc_loss": 0.046568673104047775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328433583898004e-05, + "grad_norm": 29.89966583251953, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8822444081306458, + "num_tokens": 628944304.0, + "step": 16486 + }, + { + "epoch": 
2.0973158631217403, + "ewc_loss": 0.046416275203228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3208138372865506e-05, + "grad_norm": 29.819927215576172, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.874259352684021, + "num_tokens": 628980490.0, + "step": 16487 + }, + { + "epoch": 2.097443073400331, + "ewc_loss": 0.0464538112282753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3226904886541888e-05, + "grad_norm": 29.880361557006836, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8750874996185303, + "num_tokens": 629011924.0, + "step": 16488 + }, + { + "epoch": 2.0975702836789214, + "ewc_loss": 0.04647413268685341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3237065761350095e-05, + "grad_norm": 29.842819213867188, + "learning_rate": 1e-06, + "loss": 0.5041, + "mean_token_accuracy": 0.8452420830726624, + "num_tokens": 629049514.0, + "step": 16489 + }, + { + "epoch": 2.097697493957512, + "ewc_loss": 0.04648028314113617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324014167243149e-05, + "grad_norm": 29.85191535949707, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8885148763656616, + "num_tokens": 629085374.0, + "step": 16490 + }, + { + "epoch": 2.0978247042361025, + "ewc_loss": 0.04647492617368698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3237462301040068e-05, + "grad_norm": 29.960723876953125, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8679782152175903, + "num_tokens": 629126307.0, + "step": 16491 + }, + { + "epoch": 2.097951914514693, + "ewc_loss": 0.046390995383262634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3195498215500265e-05, + "grad_norm": 29.7602481842041, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.870800256729126, + "num_tokens": 629159693.0, + "step": 16492 + }, + { + "epoch": 2.0980791247932835, + "ewc_loss": 0.04642326012253761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3211630832520314e-05, + "grad_norm": 30.014738082885742, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8767251968383789, + "num_tokens": 629198544.0, + "step": 16493 + }, + { + "epoch": 2.0982063350718736, + "ewc_loss": 0.04651179164648056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3255895939655602e-05, + "grad_norm": 29.934160232543945, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8635734915733337, + "num_tokens": 629240956.0, + "step": 16494 + }, + { + "epoch": 2.098333545350464, + "ewc_loss": 0.046432118862867355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3216060071717948e-05, + "grad_norm": 29.997724533081055, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8729492425918579, + "num_tokens": 629276196.0, + "step": 16495 + }, + { + "epoch": 2.0984607556290547, + "ewc_loss": 0.0464370958507061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3218548449222e-05, + "grad_norm": 29.862075805664062, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8577902317047119, + "num_tokens": 629311398.0, + "step": 16496 + }, + { + "epoch": 2.098587965907645, + "ewc_loss": 0.04638437554240227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3192187654785812e-05, + "grad_norm": 30.02525520324707, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8659852743148804, + "num_tokens": 629349488.0, + "step": 16497 + }, + { + "epoch": 2.0987151761862357, + "ewc_loss": 0.04644174873828888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3220874936669134e-05, + "grad_norm": 29.830984115600586, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8840459585189819, + "num_tokens": 629388023.0, + "step": 16498 + }, + { + "epoch": 2.0988423864648262, + "ewc_loss": 0.046311672776937485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.315583697054535e-05, + "grad_norm": 30.00995445251465, + "learning_rate": 1e-06, + "loss": 0.4265, + 
"mean_token_accuracy": 0.8710126280784607, + "num_tokens": 629432140.0, + "step": 16499 + }, + { + "epoch": 2.0989695967434168, + "ewc_loss": 0.04646674543619156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3233373212860897e-05, + "grad_norm": 29.90554428100586, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8566116690635681, + "num_tokens": 629480690.0, + "step": 16500 + }, + { + "epoch": 2.0990968070220073, + "ewc_loss": 0.04639154672622681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.31957728829002e-05, + "grad_norm": 30.076358795166016, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8879997134208679, + "num_tokens": 629522500.0, + "step": 16501 + }, + { + "epoch": 2.099224017300598, + "ewc_loss": 0.046316348016262054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3158174371928908e-05, + "grad_norm": 29.82593536376953, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8709800243377686, + "num_tokens": 629563450.0, + "step": 16502 + }, + { + "epoch": 2.0993512275791884, + "ewc_loss": 0.04634609818458557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3173048248281702e-05, + "grad_norm": 30.013851165771484, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8806173801422119, + "num_tokens": 629603753.0, + "step": 16503 + }, + { + "epoch": 2.099478437857779, + "ewc_loss": 0.04643172025680542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3215859982883558e-05, + "grad_norm": 29.9754638671875, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8735318183898926, + "num_tokens": 629637828.0, + "step": 16504 + }, + { + "epoch": 2.0996056481363694, + "ewc_loss": 0.04626269266009331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.313134609721601e-05, + "grad_norm": 29.817426681518555, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.879173994064331, + "num_tokens": 629674636.0, + "step": 16505 + }, + { + "epoch": 
2.09973285841496, + "ewc_loss": 0.04634388908743858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.317194412171375e-05, + "grad_norm": 29.918912887573242, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.862190842628479, + "num_tokens": 629717046.0, + "step": 16506 + }, + { + "epoch": 2.0998600686935505, + "ewc_loss": 0.046387188136577606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3193593733594753e-05, + "grad_norm": 29.94938087463379, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.881719708442688, + "num_tokens": 629750331.0, + "step": 16507 + }, + { + "epoch": 2.099987278972141, + "ewc_loss": 0.04628147557377815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3140737539506517e-05, + "grad_norm": 29.875301361083984, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8530286550521851, + "num_tokens": 629790095.0, + "step": 16508 + }, + { + "epoch": 2.1001144892507315, + "ewc_loss": 0.046279359608888626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3139678887673654e-05, + "grad_norm": 29.975502014160156, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8848949670791626, + "num_tokens": 629823035.0, + "step": 16509 + }, + { + "epoch": 2.100241699529322, + "ewc_loss": 0.0462963804602623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3148189939092845e-05, + "grad_norm": 29.772018432617188, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8664090633392334, + "num_tokens": 629857096.0, + "step": 16510 + }, + { + "epoch": 2.1003689098079126, + "ewc_loss": 0.04633371904492378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.316686004633084e-05, + "grad_norm": 29.94092559814453, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8699276447296143, + "num_tokens": 629893448.0, + "step": 16511 + }, + { + "epoch": 2.100496120086503, + "ewc_loss": 0.046458031982183456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3229016733239405e-05, + "grad_norm": 29.924001693725586, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8697658777236938, + "num_tokens": 629931301.0, + "step": 16512 + }, + { + "epoch": 2.1006233303650936, + "ewc_loss": 0.04631955176591873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3159775082604028e-05, + "grad_norm": 29.92398452758789, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8793470859527588, + "num_tokens": 629963481.0, + "step": 16513 + }, + { + "epoch": 2.100750540643684, + "ewc_loss": 0.04640279710292816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3201399017125368e-05, + "grad_norm": 29.856210708618164, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8662392497062683, + "num_tokens": 630002739.0, + "step": 16514 + }, + { + "epoch": 2.1008777509222747, + "ewc_loss": 0.046274200081825256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3137099560699426e-05, + "grad_norm": 29.87790870666504, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.863705575466156, + "num_tokens": 630041206.0, + "step": 16515 + }, + { + "epoch": 2.101004961200865, + "ewc_loss": 0.046455398201942444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3227699784911238e-05, + "grad_norm": 29.86859893798828, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.8570648431777954, + "num_tokens": 630083811.0, + "step": 16516 + }, + { + "epoch": 2.1011321714794557, + "ewc_loss": 0.04631715640425682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3158578187576495e-05, + "grad_norm": 29.91722869873047, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8692984580993652, + "num_tokens": 630119823.0, + "step": 16517 + }, + { + "epoch": 2.1012593817580463, + "ewc_loss": 0.04648645594716072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324322849744931e-05, + "grad_norm": 29.843496322631836, + "learning_rate": 1e-06, + "loss": 0.4254, + 
"mean_token_accuracy": 0.8714255690574646, + "num_tokens": 630163807.0, + "step": 16518 + }, + { + "epoch": 2.1013865920366364, + "ewc_loss": 0.04639405012130737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.319702434760984e-05, + "grad_norm": 29.85588836669922, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8662083148956299, + "num_tokens": 630203038.0, + "step": 16519 + }, + { + "epoch": 2.101513802315227, + "ewc_loss": 0.046436525881290436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3218262867885642e-05, + "grad_norm": 29.81584358215332, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8746515512466431, + "num_tokens": 630238505.0, + "step": 16520 + }, + { + "epoch": 2.1016410125938174, + "ewc_loss": 0.046459879726171494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3229938960867003e-05, + "grad_norm": 29.832271575927734, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8699499368667603, + "num_tokens": 630275279.0, + "step": 16521 + }, + { + "epoch": 2.101768222872408, + "ewc_loss": 0.04651103541254997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3255517589859664e-05, + "grad_norm": 29.893924713134766, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8684384822845459, + "num_tokens": 630317288.0, + "step": 16522 + }, + { + "epoch": 2.1018954331509985, + "ewc_loss": 0.046469349414110184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3234675609273836e-05, + "grad_norm": 29.824588775634766, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8721030950546265, + "num_tokens": 630352786.0, + "step": 16523 + }, + { + "epoch": 2.102022643429589, + "ewc_loss": 0.04644118994474411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3220594812300988e-05, + "grad_norm": 29.942346572875977, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8715745210647583, + "num_tokens": 630391138.0, + "step": 16524 + }, + { + "epoch": 
2.1021498537081795, + "ewc_loss": 0.04655817151069641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3279086235561408e-05, + "grad_norm": 29.8596134185791, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8584554195404053, + "num_tokens": 630426567.0, + "step": 16525 + }, + { + "epoch": 2.10227706398677, + "ewc_loss": 0.0464157871901989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3207892809296027e-05, + "grad_norm": 29.938369750976562, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8615096807479858, + "num_tokens": 630467308.0, + "step": 16526 + }, + { + "epoch": 2.1024042742653606, + "ewc_loss": 0.04657109081745148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.32855454669334e-05, + "grad_norm": 29.922826766967773, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8821223378181458, + "num_tokens": 630503333.0, + "step": 16527 + }, + { + "epoch": 2.102531484543951, + "ewc_loss": 0.046462561935186386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.323128137504682e-05, + "grad_norm": 29.96870994567871, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8657971620559692, + "num_tokens": 630536838.0, + "step": 16528 + }, + { + "epoch": 2.1026586948225416, + "ewc_loss": 0.04647481068968773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.323740591236856e-05, + "grad_norm": 29.835100173950195, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8838894963264465, + "num_tokens": 630570833.0, + "step": 16529 + }, + { + "epoch": 2.102785905101132, + "ewc_loss": 0.046465955674648285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3232978492160328e-05, + "grad_norm": 29.893796920776367, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8786386847496033, + "num_tokens": 630615043.0, + "step": 16530 + }, + { + "epoch": 2.1029131153797227, + "ewc_loss": 0.04648273438215256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.324136767128948e-05, + "grad_norm": 29.83039093017578, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8934615254402161, + "num_tokens": 630651368.0, + "step": 16531 + }, + { + "epoch": 2.1030403256583132, + "ewc_loss": 0.04647788405418396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.323894113942515e-05, + "grad_norm": 29.984498977661133, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8633789420127869, + "num_tokens": 630690341.0, + "step": 16532 + }, + { + "epoch": 2.1031675359369038, + "ewc_loss": 0.046501293778419495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.325064633623697e-05, + "grad_norm": 29.832460403442383, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8680987358093262, + "num_tokens": 630727168.0, + "step": 16533 + }, + { + "epoch": 2.1032947462154943, + "ewc_loss": 0.04636755213141441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3183776647783816e-05, + "grad_norm": 29.92823600769043, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8754339218139648, + "num_tokens": 630766737.0, + "step": 16534 + }, + { + "epoch": 2.103421956494085, + "ewc_loss": 0.04647195711731911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3235978005686775e-05, + "grad_norm": 29.722827911376953, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8689384460449219, + "num_tokens": 630803721.0, + "step": 16535 + }, + { + "epoch": 2.1035491667726753, + "ewc_loss": 0.04644380882382393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.322190448467154e-05, + "grad_norm": 30.002056121826172, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8766674995422363, + "num_tokens": 630843697.0, + "step": 16536 + }, + { + "epoch": 2.103676377051266, + "ewc_loss": 0.04648527130484581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3242635506903753e-05, + "grad_norm": 29.748912811279297, + "learning_rate": 1e-06, + "loss": 0.4029, + 
"mean_token_accuracy": 0.8724700808525085, + "num_tokens": 630874325.0, + "step": 16537 + }, + { + "epoch": 2.1038035873298564, + "ewc_loss": 0.046402521431446075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.32012607739307e-05, + "grad_norm": 29.902875900268555, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8665173649787903, + "num_tokens": 630911456.0, + "step": 16538 + }, + { + "epoch": 2.103930797608447, + "ewc_loss": 0.046549830585718155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3274915292859077e-05, + "grad_norm": 29.916170120239258, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8691202402114868, + "num_tokens": 630954418.0, + "step": 16539 + }, + { + "epoch": 2.1040580078870375, + "ewc_loss": 0.04642179608345032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3210897779790685e-05, + "grad_norm": 29.93777084350586, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.870351254940033, + "num_tokens": 630991826.0, + "step": 16540 + }, + { + "epoch": 2.104185218165628, + "ewc_loss": 0.046379245817661285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3189622879726812e-05, + "grad_norm": 29.91114044189453, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8713183403015137, + "num_tokens": 631025831.0, + "step": 16541 + }, + { + "epoch": 2.1043124284442185, + "ewc_loss": 0.04643433168530464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3217166017275304e-05, + "grad_norm": 29.820446014404297, + "learning_rate": 1e-06, + "loss": 0.4777, + "mean_token_accuracy": 0.856051504611969, + "num_tokens": 631062896.0, + "step": 16542 + }, + { + "epoch": 2.1044396387228086, + "ewc_loss": 0.04655613750219345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3278069420484826e-05, + "grad_norm": 29.978696823120117, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8775984048843384, + "num_tokens": 631099291.0, + "step": 16543 + }, + { + "epoch": 
2.104566849001399, + "ewc_loss": 0.04650526121258736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3252630853676237e-05, + "grad_norm": 29.881114959716797, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8629842400550842, + "num_tokens": 631141570.0, + "step": 16544 + }, + { + "epoch": 2.1046940592799896, + "ewc_loss": 0.04640193283557892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3200966097647324e-05, + "grad_norm": 29.885311126708984, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8633219003677368, + "num_tokens": 631178882.0, + "step": 16545 + }, + { + "epoch": 2.10482126955858, + "ewc_loss": 0.04645887389779091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3229436919791624e-05, + "grad_norm": 29.855905532836914, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8674219846725464, + "num_tokens": 631218059.0, + "step": 16546 + }, + { + "epoch": 2.1049484798371707, + "ewc_loss": 0.04649593308568001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3247966964845546e-05, + "grad_norm": 29.917280197143555, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8719345927238464, + "num_tokens": 631252827.0, + "step": 16547 + }, + { + "epoch": 2.1050756901157612, + "ewc_loss": 0.0464690625667572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3234531909110956e-05, + "grad_norm": 29.79068374633789, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8774030208587646, + "num_tokens": 631294370.0, + "step": 16548 + }, + { + "epoch": 2.1052029003943518, + "ewc_loss": 0.04650283604860306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3251417587744072e-05, + "grad_norm": 29.928916931152344, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8675054907798767, + "num_tokens": 631331091.0, + "step": 16549 + }, + { + "epoch": 2.1053301106729423, + "ewc_loss": 0.046471137553453445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3235568733070977e-05, + "grad_norm": 29.693103790283203, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8744653463363647, + "num_tokens": 631367285.0, + "step": 16550 + }, + { + "epoch": 2.105457320951533, + "ewc_loss": 0.046516481786966324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3258240616996773e-05, + "grad_norm": 29.966909408569336, + "learning_rate": 1e-06, + "loss": 0.4636, + "mean_token_accuracy": 0.85723876953125, + "num_tokens": 631402471.0, + "step": 16551 + }, + { + "epoch": 2.1055845312301233, + "ewc_loss": 0.04664832353591919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3324162611970678e-05, + "grad_norm": 29.71240234375, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8707903027534485, + "num_tokens": 631439706.0, + "step": 16552 + }, + { + "epoch": 2.105711741508714, + "ewc_loss": 0.04651166498661041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3255832275026478e-05, + "grad_norm": 30.050233840942383, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.875140368938446, + "num_tokens": 631471437.0, + "step": 16553 + }, + { + "epoch": 2.1058389517873044, + "ewc_loss": 0.04660578817129135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3302893168875016e-05, + "grad_norm": 29.755123138427734, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.867567777633667, + "num_tokens": 631515028.0, + "step": 16554 + }, + { + "epoch": 2.105966162065895, + "ewc_loss": 0.04640170559287071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3200853320304304e-05, + "grad_norm": 29.96025276184082, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8654977083206177, + "num_tokens": 631552946.0, + "step": 16555 + }, + { + "epoch": 2.1060933723444855, + "ewc_loss": 0.04657982289791107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328991104150191e-05, + "grad_norm": 29.781509399414062, + "learning_rate": 1e-06, + "loss": 0.4012, + 
"mean_token_accuracy": 0.8752086162567139, + "num_tokens": 631596045.0, + "step": 16556 + }, + { + "epoch": 2.106220582623076, + "ewc_loss": 0.046396370977163315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3198184862849303e-05, + "grad_norm": 29.930416107177734, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8786957859992981, + "num_tokens": 631630233.0, + "step": 16557 + }, + { + "epoch": 2.1063477929016665, + "ewc_loss": 0.04662289842963219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3311449695029296e-05, + "grad_norm": 29.86867332458496, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8771438002586365, + "num_tokens": 631666665.0, + "step": 16558 + }, + { + "epoch": 2.106475003180257, + "ewc_loss": 0.04648078605532646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324039269296918e-05, + "grad_norm": 29.938772201538086, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8674414753913879, + "num_tokens": 631709536.0, + "step": 16559 + }, + { + "epoch": 2.1066022134588476, + "ewc_loss": 0.04648656025528908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3243279429152608e-05, + "grad_norm": 29.796602249145508, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8853593468666077, + "num_tokens": 631744333.0, + "step": 16560 + }, + { + "epoch": 2.106729423737438, + "ewc_loss": 0.04636459797620773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3182299628388137e-05, + "grad_norm": 29.832502365112305, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8564023375511169, + "num_tokens": 631783522.0, + "step": 16561 + }, + { + "epoch": 2.1068566340160286, + "ewc_loss": 0.04657270014286041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3286349460249767e-05, + "grad_norm": 29.950246810913086, + "learning_rate": 1e-06, + "loss": 0.4679, + "mean_token_accuracy": 0.8542548418045044, + "num_tokens": 631822579.0, + "step": 16562 + }, + { + "epoch": 
2.106983844294619, + "ewc_loss": 0.046454254537820816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.322712680324912e-05, + "grad_norm": 29.82819366455078, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8775577545166016, + "num_tokens": 631862642.0, + "step": 16563 + }, + { + "epoch": 2.1071110545732097, + "ewc_loss": 0.046495988965034485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.32479942496866e-05, + "grad_norm": 29.896337509155273, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.881517767906189, + "num_tokens": 631903161.0, + "step": 16564 + }, + { + "epoch": 2.1072382648518, + "ewc_loss": 0.04643595591187477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3217977286549285e-05, + "grad_norm": 29.83355712890625, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8736782670021057, + "num_tokens": 631938149.0, + "step": 16565 + }, + { + "epoch": 2.1073654751303907, + "ewc_loss": 0.0465112142264843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3255606720340438e-05, + "grad_norm": 29.945524215698242, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8713348507881165, + "num_tokens": 631974717.0, + "step": 16566 + }, + { + "epoch": 2.107492685408981, + "ewc_loss": 0.04650121182203293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.325060631847009e-05, + "grad_norm": 29.832101821899414, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8744498491287231, + "num_tokens": 632011147.0, + "step": 16567 + }, + { + "epoch": 2.1076198956875714, + "ewc_loss": 0.046420853585004807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3210426661535166e-05, + "grad_norm": 29.90923500061035, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8772298097610474, + "num_tokens": 632051915.0, + "step": 16568 + }, + { + "epoch": 2.107747105966162, + "ewc_loss": 0.046499695628881454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3249847799888812e-05, + "grad_norm": 29.938446044921875, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8619077801704407, + "num_tokens": 632083047.0, + "step": 16569 + }, + { + "epoch": 2.1078743162447524, + "ewc_loss": 0.046472374349832535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3236187189468183e-05, + "grad_norm": 29.957904815673828, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8830792903900146, + "num_tokens": 632119250.0, + "step": 16570 + }, + { + "epoch": 2.108001526523343, + "ewc_loss": 0.046476781368255615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3238389985635877e-05, + "grad_norm": 29.822538375854492, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8771407604217529, + "num_tokens": 632152696.0, + "step": 16571 + }, + { + "epoch": 2.1081287368019335, + "ewc_loss": 0.046358320862054825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3179160052677616e-05, + "grad_norm": 29.819246292114258, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8911869525909424, + "num_tokens": 632188167.0, + "step": 16572 + }, + { + "epoch": 2.108255947080524, + "ewc_loss": 0.04653414338827133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3267071810550988e-05, + "grad_norm": 29.88290023803711, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.869015097618103, + "num_tokens": 632229567.0, + "step": 16573 + }, + { + "epoch": 2.1083831573591145, + "ewc_loss": 0.046455610543489456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3227805286296643e-05, + "grad_norm": 29.799863815307617, + "learning_rate": 1e-06, + "loss": 0.3831, + "mean_token_accuracy": 0.8806679844856262, + "num_tokens": 632266870.0, + "step": 16574 + }, + { + "epoch": 2.108510367637705, + "ewc_loss": 0.046584971249103546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3292484911507927e-05, + "grad_norm": 29.98993682861328, + "learning_rate": 1e-06, + "loss": 0.4277, + 
"mean_token_accuracy": 0.8715891242027283, + "num_tokens": 632303844.0, + "step": 16575 + }, + { + "epoch": 2.1086375779162956, + "ewc_loss": 0.04653703793883324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.326851972611621e-05, + "grad_norm": 29.8847713470459, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8734313249588013, + "num_tokens": 632339343.0, + "step": 16576 + }, + { + "epoch": 2.108764788194886, + "ewc_loss": 0.04656088352203369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328044138266705e-05, + "grad_norm": 29.947011947631836, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8761841058731079, + "num_tokens": 632385516.0, + "step": 16577 + }, + { + "epoch": 2.1088919984734766, + "ewc_loss": 0.04649614170193672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3248070647241548e-05, + "grad_norm": 29.797143936157227, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8752833604812622, + "num_tokens": 632420664.0, + "step": 16578 + }, + { + "epoch": 2.109019208752067, + "ewc_loss": 0.04646654799580574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3233273168443702e-05, + "grad_norm": 29.855867385864258, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.877871036529541, + "num_tokens": 632458082.0, + "step": 16579 + }, + { + "epoch": 2.1091464190306577, + "ewc_loss": 0.04660144820809364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330072493350599e-05, + "grad_norm": 29.877164840698242, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8588154911994934, + "num_tokens": 632495963.0, + "step": 16580 + }, + { + "epoch": 2.109273629309248, + "ewc_loss": 0.046394139528274536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.319706982234493e-05, + "grad_norm": 29.872724533081055, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8730784058570862, + "num_tokens": 632529968.0, + "step": 16581 + }, + { + "epoch": 
2.1094008395878387, + "ewc_loss": 0.04655842483043671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3279211745830253e-05, + "grad_norm": 29.944623947143555, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8800874948501587, + "num_tokens": 632567527.0, + "step": 16582 + }, + { + "epoch": 2.1095280498664293, + "ewc_loss": 0.046461086720228195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.323054286534898e-05, + "grad_norm": 29.790006637573242, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8702137470245361, + "num_tokens": 632606192.0, + "step": 16583 + }, + { + "epoch": 2.10965526014502, + "ewc_loss": 0.046443235129117966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.322161708434578e-05, + "grad_norm": 29.8856143951416, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8821547627449036, + "num_tokens": 632645940.0, + "step": 16584 + }, + { + "epoch": 2.1097824704236103, + "ewc_loss": 0.046541403979063034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3270702513400465e-05, + "grad_norm": 29.87857437133789, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8798671960830688, + "num_tokens": 632689385.0, + "step": 16585 + }, + { + "epoch": 2.109909680702201, + "ewc_loss": 0.04648854210972786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324427077837754e-05, + "grad_norm": 29.99863624572754, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8780984282493591, + "num_tokens": 632720178.0, + "step": 16586 + }, + { + "epoch": 2.1100368909807914, + "ewc_loss": 0.0465659461915493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3282973415916786e-05, + "grad_norm": 29.893774032592773, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.86296147108078, + "num_tokens": 632763249.0, + "step": 16587 + }, + { + "epoch": 2.110164101259382, + "ewc_loss": 0.04648595675826073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.324297747691162e-05, + "grad_norm": 29.99051856994629, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8529019951820374, + "num_tokens": 632803293.0, + "step": 16588 + }, + { + "epoch": 2.1102913115379724, + "ewc_loss": 0.04650784283876419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3253922336152755e-05, + "grad_norm": 29.864990234375, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8629360198974609, + "num_tokens": 632838731.0, + "step": 16589 + }, + { + "epoch": 2.110418521816563, + "ewc_loss": 0.046481531113386154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3240765585796908e-05, + "grad_norm": 30.08509063720703, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8618820905685425, + "num_tokens": 632875044.0, + "step": 16590 + }, + { + "epoch": 2.1105457320951535, + "ewc_loss": 0.046499673277139664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324983688595239e-05, + "grad_norm": 29.915817260742188, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8509063720703125, + "num_tokens": 632911330.0, + "step": 16591 + }, + { + "epoch": 2.1106729423737436, + "ewc_loss": 0.04644652456045151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3223261450766586e-05, + "grad_norm": 30.0583438873291, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8610901236534119, + "num_tokens": 632952122.0, + "step": 16592 + }, + { + "epoch": 2.110800152652334, + "ewc_loss": 0.04643256217241287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3216280169435777e-05, + "grad_norm": 29.89832305908203, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8532025218009949, + "num_tokens": 632985602.0, + "step": 16593 + }, + { + "epoch": 2.1109273629309246, + "ewc_loss": 0.04639109969139099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3195549147203565e-05, + "grad_norm": 29.983030319213867, + "learning_rate": 1e-06, + "loss": 0.4187, + 
"mean_token_accuracy": 0.8725895881652832, + "num_tokens": 633023792.0, + "step": 16594 + }, + { + "epoch": 2.111054573209515, + "ewc_loss": 0.046548545360565186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3274273189599626e-05, + "grad_norm": 29.94408416748047, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8726779222488403, + "num_tokens": 633065457.0, + "step": 16595 + }, + { + "epoch": 2.1111817834881057, + "ewc_loss": 0.046416573226451874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3208285711007193e-05, + "grad_norm": 29.97332191467285, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.88071608543396, + "num_tokens": 633101402.0, + "step": 16596 + }, + { + "epoch": 2.1113089937666962, + "ewc_loss": 0.046507395803928375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.325369860045612e-05, + "grad_norm": 29.94229507446289, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8632704019546509, + "num_tokens": 633147002.0, + "step": 16597 + }, + { + "epoch": 2.1114362040452868, + "ewc_loss": 0.04641162231564522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.320581188541837e-05, + "grad_norm": 30.022722244262695, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8709696531295776, + "num_tokens": 633184209.0, + "step": 16598 + }, + { + "epoch": 2.1115634143238773, + "ewc_loss": 0.04648294299840927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3241471353685483e-05, + "grad_norm": 29.876506805419922, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8766907453536987, + "num_tokens": 633224525.0, + "step": 16599 + }, + { + "epoch": 2.111690624602468, + "ewc_loss": 0.04640582203865051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3202910597319715e-05, + "grad_norm": 30.019962310791016, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8856481313705444, + "num_tokens": 633253171.0, + "step": 16600 + }, + { + "epoch": 
2.1118178348810583, + "ewc_loss": 0.0465000718832016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3250035155797377e-05, + "grad_norm": 29.833782196044922, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8679790496826172, + "num_tokens": 633294326.0, + "step": 16601 + }, + { + "epoch": 2.111945045159649, + "ewc_loss": 0.046428341418504715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3214170141727664e-05, + "grad_norm": 30.137126922607422, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8848445415496826, + "num_tokens": 633327236.0, + "step": 16602 + }, + { + "epoch": 2.1120722554382394, + "ewc_loss": 0.04650479927659035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3252399842021987e-05, + "grad_norm": 29.766538619995117, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8815934658050537, + "num_tokens": 633368593.0, + "step": 16603 + }, + { + "epoch": 2.11219946571683, + "ewc_loss": 0.046346552670001984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3173275621957146e-05, + "grad_norm": 30.033971786499023, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8497494459152222, + "num_tokens": 633402598.0, + "step": 16604 + }, + { + "epoch": 2.1123266759954205, + "ewc_loss": 0.046577125787734985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3288563170353882e-05, + "grad_norm": 29.79058265686035, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8671529293060303, + "num_tokens": 633437069.0, + "step": 16605 + }, + { + "epoch": 2.112453886274011, + "ewc_loss": 0.04640158638358116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3200793293653987e-05, + "grad_norm": 29.96595573425293, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8734003305435181, + "num_tokens": 633479752.0, + "step": 16606 + }, + { + "epoch": 2.1125810965526015, + "ewc_loss": 0.046588748693466187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.329437484149821e-05, + "grad_norm": 29.912321090698242, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8685418367385864, + "num_tokens": 633516949.0, + "step": 16607 + }, + { + "epoch": 2.112708306831192, + "ewc_loss": 0.046474993228912354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3237496861838736e-05, + "grad_norm": 30.11182403564453, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8755292296409607, + "num_tokens": 633547511.0, + "step": 16608 + }, + { + "epoch": 2.1128355171097826, + "ewc_loss": 0.046550050377845764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.327502443222329e-05, + "grad_norm": 29.921735763549805, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8729761838912964, + "num_tokens": 633583940.0, + "step": 16609 + }, + { + "epoch": 2.112962727388373, + "ewc_loss": 0.04650373011827469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3251865059137344e-05, + "grad_norm": 30.06195640563965, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8601394891738892, + "num_tokens": 633626245.0, + "step": 16610 + }, + { + "epoch": 2.1130899376669636, + "ewc_loss": 0.046604570001363754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3302285626414232e-05, + "grad_norm": 29.861236572265625, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8741827011108398, + "num_tokens": 633662889.0, + "step": 16611 + }, + { + "epoch": 2.113217147945554, + "ewc_loss": 0.04644114896655083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.322057480341755e-05, + "grad_norm": 29.9638614654541, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8670625686645508, + "num_tokens": 633701461.0, + "step": 16612 + }, + { + "epoch": 2.1133443582241447, + "ewc_loss": 0.04653102159500122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3265511117642745e-05, + "grad_norm": 29.746225357055664, + "learning_rate": 1e-06, + "loss": 0.4204, + 
"mean_token_accuracy": 0.8723747134208679, + "num_tokens": 633741302.0, + "step": 16613 + }, + { + "epoch": 2.113471568502735, + "ewc_loss": 0.046484850347042084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324242450413294e-05, + "grad_norm": 30.04864501953125, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8762506246566772, + "num_tokens": 633780337.0, + "step": 16614 + }, + { + "epoch": 2.1135987787813257, + "ewc_loss": 0.04667375236749649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333687552891206e-05, + "grad_norm": 29.915180206298828, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8776233196258545, + "num_tokens": 633814450.0, + "step": 16615 + }, + { + "epoch": 2.1137259890599163, + "ewc_loss": 0.04648385941982269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3241929739015177e-05, + "grad_norm": 29.954442977905273, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8637132048606873, + "num_tokens": 633856252.0, + "step": 16616 + }, + { + "epoch": 2.1138531993385064, + "ewc_loss": 0.04664931446313858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3324657377088442e-05, + "grad_norm": 29.95817756652832, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8596434593200684, + "num_tokens": 633895235.0, + "step": 16617 + }, + { + "epoch": 2.113980409617097, + "ewc_loss": 0.04654953256249428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3274766135727987e-05, + "grad_norm": 29.891576766967773, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8739044070243835, + "num_tokens": 633938948.0, + "step": 16618 + }, + { + "epoch": 2.1141076198956874, + "ewc_loss": 0.04654621332883835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3273107217391953e-05, + "grad_norm": 29.933231353759766, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8786349892616272, + "num_tokens": 633973393.0, + "step": 16619 + }, + { + "epoch": 
2.114234830174278, + "ewc_loss": 0.04654061794281006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.32703096116893e-05, + "grad_norm": 30.02060890197754, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8748840093612671, + "num_tokens": 634006714.0, + "step": 16620 + }, + { + "epoch": 2.1143620404528685, + "ewc_loss": 0.046567682176828384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3283841073862277e-05, + "grad_norm": 29.931922912597656, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8761160969734192, + "num_tokens": 634046104.0, + "step": 16621 + }, + { + "epoch": 2.114489250731459, + "ewc_loss": 0.04650940001010895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3254700863617472e-05, + "grad_norm": 29.947107315063477, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8627922534942627, + "num_tokens": 634086401.0, + "step": 16622 + }, + { + "epoch": 2.1146164610100495, + "ewc_loss": 0.04656318202614784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328159098397009e-05, + "grad_norm": 29.919212341308594, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8623596429824829, + "num_tokens": 634125479.0, + "step": 16623 + }, + { + "epoch": 2.11474367128864, + "ewc_loss": 0.04652591049671173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3262955437530763e-05, + "grad_norm": 30.00428009033203, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8529300689697266, + "num_tokens": 634163873.0, + "step": 16624 + }, + { + "epoch": 2.1148708815672306, + "ewc_loss": 0.04658230021595955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3291149773285724e-05, + "grad_norm": 29.847232818603516, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8687752485275269, + "num_tokens": 634200533.0, + "step": 16625 + }, + { + "epoch": 2.114998091845821, + "ewc_loss": 0.04644501581788063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3222508389153518e-05, + "grad_norm": 29.936973571777344, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8632954359054565, + "num_tokens": 634235891.0, + "step": 16626 + }, + { + "epoch": 2.1151253021244116, + "ewc_loss": 0.0465887077152729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329435301362537e-05, + "grad_norm": 29.87746810913086, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8696775436401367, + "num_tokens": 634266748.0, + "step": 16627 + }, + { + "epoch": 2.115252512403002, + "ewc_loss": 0.046514205634593964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3257101929630153e-05, + "grad_norm": 29.97475814819336, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8592725992202759, + "num_tokens": 634302603.0, + "step": 16628 + }, + { + "epoch": 2.1153797226815927, + "ewc_loss": 0.046608999371528625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3304499336518347e-05, + "grad_norm": 29.944931030273438, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8730513453483582, + "num_tokens": 634345471.0, + "step": 16629 + }, + { + "epoch": 2.115506932960183, + "ewc_loss": 0.04658318683505058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329159360670019e-05, + "grad_norm": 29.98954200744629, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.869703471660614, + "num_tokens": 634383902.0, + "step": 16630 + }, + { + "epoch": 2.1156341432387737, + "ewc_loss": 0.04656832292675972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.32841612159973e-05, + "grad_norm": 29.816654205322266, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.863899290561676, + "num_tokens": 634421082.0, + "step": 16631 + }, + { + "epoch": 2.1157613535173643, + "ewc_loss": 0.04649095982313156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.32454804063309e-05, + "grad_norm": 30.005321502685547, + "learning_rate": 1e-06, + "loss": 0.4297, + 
"mean_token_accuracy": 0.8665043711662292, + "num_tokens": 634456490.0, + "step": 16632 + }, + { + "epoch": 2.115888563795955, + "ewc_loss": 0.04654707759618759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3273538317880593e-05, + "grad_norm": 29.731538772583008, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8519620895385742, + "num_tokens": 634498252.0, + "step": 16633 + }, + { + "epoch": 2.1160157740745453, + "ewc_loss": 0.04650653153657913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3253265680978075e-05, + "grad_norm": 29.93965721130371, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8782572746276855, + "num_tokens": 634540769.0, + "step": 16634 + }, + { + "epoch": 2.116142984353136, + "ewc_loss": 0.046674393117427826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3337197490036488e-05, + "grad_norm": 29.75504493713379, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8772709369659424, + "num_tokens": 634573807.0, + "step": 16635 + }, + { + "epoch": 2.1162701946317264, + "ewc_loss": 0.04658559337258339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3292795958695933e-05, + "grad_norm": 29.95876121520996, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8659640550613403, + "num_tokens": 634609542.0, + "step": 16636 + }, + { + "epoch": 2.116397404910317, + "ewc_loss": 0.04671027883887291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3355140001513064e-05, + "grad_norm": 29.788450241088867, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8793612718582153, + "num_tokens": 634648641.0, + "step": 16637 + }, + { + "epoch": 2.1165246151889074, + "ewc_loss": 0.04644031822681427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3220158254844137e-05, + "grad_norm": 29.848831176757812, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8691647052764893, + "num_tokens": 634685531.0, + "step": 16638 + }, + { + "epoch": 
2.116651825467498, + "ewc_loss": 0.04670219123363495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335109638806898e-05, + "grad_norm": 29.93343162536621, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8724008798599243, + "num_tokens": 634717239.0, + "step": 16639 + }, + { + "epoch": 2.116779035746088, + "ewc_loss": 0.0465836338698864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3291817342396826e-05, + "grad_norm": 29.953657150268555, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8666068315505981, + "num_tokens": 634750058.0, + "step": 16640 + }, + { + "epoch": 2.1169062460246786, + "ewc_loss": 0.04653312265872955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.326656067452859e-05, + "grad_norm": 29.865198135375977, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8644626140594482, + "num_tokens": 634793480.0, + "step": 16641 + }, + { + "epoch": 2.117033456303269, + "ewc_loss": 0.04658128693699837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329064409423154e-05, + "grad_norm": 30.038909912109375, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8606006503105164, + "num_tokens": 634829923.0, + "step": 16642 + }, + { + "epoch": 2.1171606665818596, + "ewc_loss": 0.04663612321019173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3318061721511185e-05, + "grad_norm": 29.789539337158203, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.866559624671936, + "num_tokens": 634868499.0, + "step": 16643 + }, + { + "epoch": 2.11728787686045, + "ewc_loss": 0.04658927023410797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3294634956982918e-05, + "grad_norm": 29.89992904663086, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8788314461708069, + "num_tokens": 634905922.0, + "step": 16644 + }, + { + "epoch": 2.1174150871390407, + "ewc_loss": 0.046739187091588974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.336959369131364e-05, + "grad_norm": 29.979524612426758, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8723253011703491, + "num_tokens": 634946190.0, + "step": 16645 + }, + { + "epoch": 2.1175422974176312, + "ewc_loss": 0.04655960947275162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.327980473637581e-05, + "grad_norm": 29.758338928222656, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8834449052810669, + "num_tokens": 634981840.0, + "step": 16646 + }, + { + "epoch": 2.1176695076962218, + "ewc_loss": 0.046722739934921265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336137004022021e-05, + "grad_norm": 30.083402633666992, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8688967227935791, + "num_tokens": 635024755.0, + "step": 16647 + }, + { + "epoch": 2.1177967179748123, + "ewc_loss": 0.046745415776968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337270780117251e-05, + "grad_norm": 29.871349334716797, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8751760721206665, + "num_tokens": 635063076.0, + "step": 16648 + }, + { + "epoch": 2.117923928253403, + "ewc_loss": 0.04656286910176277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3281434550881386e-05, + "grad_norm": 30.069238662719727, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8823531866073608, + "num_tokens": 635103052.0, + "step": 16649 + }, + { + "epoch": 2.1180511385319933, + "ewc_loss": 0.04681934416294098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3409671484841965e-05, + "grad_norm": 29.93349838256836, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8798351883888245, + "num_tokens": 635140917.0, + "step": 16650 + }, + { + "epoch": 2.118178348810584, + "ewc_loss": 0.04648798331618309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3243992472998798e-05, + "grad_norm": 29.874784469604492, + "learning_rate": 1e-06, + "loss": 0.4061, + 
"mean_token_accuracy": 0.8756169080734253, + "num_tokens": 635182049.0, + "step": 16651 + }, + { + "epoch": 2.1183055590891744, + "ewc_loss": 0.046653904020786285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3326952941715717e-05, + "grad_norm": 29.996755599975586, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8655771613121033, + "num_tokens": 635217613.0, + "step": 16652 + }, + { + "epoch": 2.118432769367765, + "ewc_loss": 0.046676505357027054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3338252503890544e-05, + "grad_norm": 30.049692153930664, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8793981075286865, + "num_tokens": 635252716.0, + "step": 16653 + }, + { + "epoch": 2.1185599796463555, + "ewc_loss": 0.0465797558426857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3289878299692646e-05, + "grad_norm": 29.952844619750977, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8749790787696838, + "num_tokens": 635288919.0, + "step": 16654 + }, + { + "epoch": 2.118687189924946, + "ewc_loss": 0.046562500298023224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3281250832951628e-05, + "grad_norm": 30.00362205505371, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8659520745277405, + "num_tokens": 635329781.0, + "step": 16655 + }, + { + "epoch": 2.1188144002035365, + "ewc_loss": 0.04649018123745918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324509114259854e-05, + "grad_norm": 29.856462478637695, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8799608945846558, + "num_tokens": 635363844.0, + "step": 16656 + }, + { + "epoch": 2.118941610482127, + "ewc_loss": 0.04650256410241127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3251281163538806e-05, + "grad_norm": 29.99046516418457, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8678172826766968, + "num_tokens": 635397568.0, + "step": 16657 + }, + { + "epoch": 
2.1190688207607176, + "ewc_loss": 0.04667953774333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.33397695410531e-05, + "grad_norm": 30.092288970947266, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8564406037330627, + "num_tokens": 635432903.0, + "step": 16658 + }, + { + "epoch": 2.119196031039308, + "ewc_loss": 0.04654252156615257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3271260943147354e-05, + "grad_norm": 29.911182403564453, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8724662065505981, + "num_tokens": 635473388.0, + "step": 16659 + }, + { + "epoch": 2.1193232413178986, + "ewc_loss": 0.04647761955857277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3238810172188096e-05, + "grad_norm": 30.060869216918945, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8745989203453064, + "num_tokens": 635512966.0, + "step": 16660 + }, + { + "epoch": 2.119450451596489, + "ewc_loss": 0.046538688242435455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3269343728316016e-05, + "grad_norm": 29.994413375854492, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8569341897964478, + "num_tokens": 635547814.0, + "step": 16661 + }, + { + "epoch": 2.1195776618750797, + "ewc_loss": 0.04650412127375603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3252061509992927e-05, + "grad_norm": 29.95285415649414, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.861214280128479, + "num_tokens": 635583120.0, + "step": 16662 + }, + { + "epoch": 2.11970487215367, + "ewc_loss": 0.046502020210027695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3251010134117678e-05, + "grad_norm": 30.12914276123047, + "learning_rate": 1e-06, + "loss": 0.449, + "mean_token_accuracy": 0.8652853965759277, + "num_tokens": 635621531.0, + "step": 16663 + }, + { + "epoch": 2.1198320824322607, + "ewc_loss": 0.046556804329156876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3278402295545675e-05, + "grad_norm": 29.896039962768555, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8608643412590027, + "num_tokens": 635659641.0, + "step": 16664 + }, + { + "epoch": 2.119959292710851, + "ewc_loss": 0.04644419252872467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3222095478558913e-05, + "grad_norm": 30.026397705078125, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8661717176437378, + "num_tokens": 635697953.0, + "step": 16665 + }, + { + "epoch": 2.1200865029894413, + "ewc_loss": 0.04661385715007782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330692768737208e-05, + "grad_norm": 30.04670524597168, + "learning_rate": 1e-06, + "loss": 0.4957, + "mean_token_accuracy": 0.8448721170425415, + "num_tokens": 635742519.0, + "step": 16666 + }, + { + "epoch": 2.120213713268032, + "ewc_loss": 0.046507369726896286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.325368404854089e-05, + "grad_norm": 29.865903854370117, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8771672248840332, + "num_tokens": 635777786.0, + "step": 16667 + }, + { + "epoch": 2.1203409235466224, + "ewc_loss": 0.04654940590262413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3274702471098863e-05, + "grad_norm": 29.997079849243164, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8720928430557251, + "num_tokens": 635814490.0, + "step": 16668 + }, + { + "epoch": 2.120468133825213, + "ewc_loss": 0.046564362943172455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3282182155526243e-05, + "grad_norm": 29.86529541015625, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8580871820449829, + "num_tokens": 635858351.0, + "step": 16669 + }, + { + "epoch": 2.1205953441038035, + "ewc_loss": 0.04651525244116783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3257625798578374e-05, + "grad_norm": 29.889768600463867, + "learning_rate": 1e-06, + "loss": 0.4302, + 
"mean_token_accuracy": 0.8680518865585327, + "num_tokens": 635895585.0, + "step": 16670 + }, + { + "epoch": 2.120722554382394, + "ewc_loss": 0.0466219037771225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3310951291932724e-05, + "grad_norm": 29.956228256225586, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8780851364135742, + "num_tokens": 635933591.0, + "step": 16671 + }, + { + "epoch": 2.1208497646609845, + "ewc_loss": 0.04657343402504921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3286716896109283e-05, + "grad_norm": 29.8961238861084, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8670303821563721, + "num_tokens": 635972040.0, + "step": 16672 + }, + { + "epoch": 2.120976974939575, + "ewc_loss": 0.04655488207936287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3277441869140603e-05, + "grad_norm": 29.966358184814453, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8737014532089233, + "num_tokens": 636013287.0, + "step": 16673 + }, + { + "epoch": 2.1211041852181656, + "ewc_loss": 0.046662554144859314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3331276679527946e-05, + "grad_norm": 30.090290069580078, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8655409812927246, + "num_tokens": 636046137.0, + "step": 16674 + }, + { + "epoch": 2.121231395496756, + "ewc_loss": 0.04657454788684845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3287273506866768e-05, + "grad_norm": 30.05228614807129, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8592261075973511, + "num_tokens": 636088870.0, + "step": 16675 + }, + { + "epoch": 2.1213586057753466, + "ewc_loss": 0.046517569571733475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3258784494828433e-05, + "grad_norm": 29.950389862060547, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8803930282592773, + "num_tokens": 636130519.0, + "step": 16676 + }, + { + "epoch": 
2.121485816053937, + "ewc_loss": 0.046613190323114395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3306594812311232e-05, + "grad_norm": 30.11310386657715, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8737939596176147, + "num_tokens": 636165641.0, + "step": 16677 + }, + { + "epoch": 2.1216130263325277, + "ewc_loss": 0.04653170704841614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3265853087650612e-05, + "grad_norm": 29.960721969604492, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8560019731521606, + "num_tokens": 636203811.0, + "step": 16678 + }, + { + "epoch": 2.121740236611118, + "ewc_loss": 0.04652969911694527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3264849005499855e-05, + "grad_norm": 30.030609130859375, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8824859261512756, + "num_tokens": 636239503.0, + "step": 16679 + }, + { + "epoch": 2.1218674468897087, + "ewc_loss": 0.0465962216258049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3298111045733094e-05, + "grad_norm": 29.925743103027344, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8738270998001099, + "num_tokens": 636278994.0, + "step": 16680 + }, + { + "epoch": 2.1219946571682993, + "ewc_loss": 0.0465194508433342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3259724912350066e-05, + "grad_norm": 30.041778564453125, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8667819499969482, + "num_tokens": 636318025.0, + "step": 16681 + }, + { + "epoch": 2.12212186744689, + "ewc_loss": 0.04652392491698265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3261962269316427e-05, + "grad_norm": 29.983596801757812, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8723794221878052, + "num_tokens": 636357289.0, + "step": 16682 + }, + { + "epoch": 2.1222490777254803, + "ewc_loss": 0.0464860200881958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3243010218720883e-05, + "grad_norm": 30.078073501586914, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8550983667373657, + "num_tokens": 636399067.0, + "step": 16683 + }, + { + "epoch": 2.122376288004071, + "ewc_loss": 0.046483930200338364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3241964299813844e-05, + "grad_norm": 29.90986442565918, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8760521411895752, + "num_tokens": 636442418.0, + "step": 16684 + }, + { + "epoch": 2.1225034982826614, + "ewc_loss": 0.04656905308365822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3284526832867414e-05, + "grad_norm": 30.03041648864746, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8592697381973267, + "num_tokens": 636484273.0, + "step": 16685 + }, + { + "epoch": 2.122630708561252, + "ewc_loss": 0.04653654620051384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3268272343557328e-05, + "grad_norm": 30.00723648071289, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8793485164642334, + "num_tokens": 636525675.0, + "step": 16686 + }, + { + "epoch": 2.1227579188398424, + "ewc_loss": 0.04634194076061249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3170970962382853e-05, + "grad_norm": 29.88015365600586, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8691854476928711, + "num_tokens": 636559652.0, + "step": 16687 + }, + { + "epoch": 2.122885129118433, + "ewc_loss": 0.04645157605409622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.322578802704811e-05, + "grad_norm": 30.0109806060791, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8882362842559814, + "num_tokens": 636597900.0, + "step": 16688 + }, + { + "epoch": 2.1230123393970235, + "ewc_loss": 0.04654641076922417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3273205442819744e-05, + "grad_norm": 30.01348114013672, + "learning_rate": 1e-06, + "loss": 0.4146, + 
"mean_token_accuracy": 0.871872067451477, + "num_tokens": 636635822.0, + "step": 16689 + }, + { + "epoch": 2.1231395496756136, + "ewc_loss": 0.04647085443139076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.32354268518975e-05, + "grad_norm": 30.031925201416016, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8602224588394165, + "num_tokens": 636667275.0, + "step": 16690 + }, + { + "epoch": 2.123266759954204, + "ewc_loss": 0.046496275812387466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324813794984948e-05, + "grad_norm": 30.00229263305664, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8725705742835999, + "num_tokens": 636704723.0, + "step": 16691 + }, + { + "epoch": 2.1233939702327946, + "ewc_loss": 0.046455543488264084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.322777254448738e-05, + "grad_norm": 30.005624771118164, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8758425116539001, + "num_tokens": 636748209.0, + "step": 16692 + }, + { + "epoch": 2.123521180511385, + "ewc_loss": 0.04654820263385773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.327410038560629e-05, + "grad_norm": 29.98782730102539, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8767693042755127, + "num_tokens": 636785052.0, + "step": 16693 + }, + { + "epoch": 2.1236483907899757, + "ewc_loss": 0.04649633914232254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324816887266934e-05, + "grad_norm": 30.17217254638672, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8826836943626404, + "num_tokens": 636816073.0, + "step": 16694 + }, + { + "epoch": 2.123775601068566, + "ewc_loss": 0.046496566385030746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3248283469001763e-05, + "grad_norm": 29.884485244750977, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8770039081573486, + "num_tokens": 636848963.0, + "step": 16695 + }, + { + "epoch": 
2.1239028113471567, + "ewc_loss": 0.04637981951236725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3189910280052572e-05, + "grad_norm": 30.022836685180664, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8837272524833679, + "num_tokens": 636880632.0, + "step": 16696 + }, + { + "epoch": 2.1240300216257473, + "ewc_loss": 0.04661143198609352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330571624042932e-05, + "grad_norm": 29.98192024230957, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.858500599861145, + "num_tokens": 636923597.0, + "step": 16697 + }, + { + "epoch": 2.124157231904338, + "ewc_loss": 0.04638426750898361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3192133085103706e-05, + "grad_norm": 29.812009811401367, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.863869309425354, + "num_tokens": 636962250.0, + "step": 16698 + }, + { + "epoch": 2.1242844421829283, + "ewc_loss": 0.046595145016908646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3297572624869645e-05, + "grad_norm": 29.910747528076172, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8782006502151489, + "num_tokens": 636997973.0, + "step": 16699 + }, + { + "epoch": 2.124411652461519, + "ewc_loss": 0.046523526310920715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.326176399947144e-05, + "grad_norm": 29.896183013916016, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8585841059684753, + "num_tokens": 637037320.0, + "step": 16700 + }, + { + "epoch": 2.1245388627401094, + "ewc_loss": 0.04647097736597061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.323548869753722e-05, + "grad_norm": 29.96922492980957, + "learning_rate": 1e-06, + "loss": 0.4895, + "mean_token_accuracy": 0.8486083149909973, + "num_tokens": 637072267.0, + "step": 16701 + }, + { + "epoch": 2.1246660730187, + "ewc_loss": 0.0465828999876976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.329144990653731e-05, + "grad_norm": 29.94257164001465, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.885520339012146, + "num_tokens": 637108782.0, + "step": 16702 + }, + { + "epoch": 2.1247932832972904, + "ewc_loss": 0.04657210037112236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328604932699818e-05, + "grad_norm": 29.991905212402344, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8694905638694763, + "num_tokens": 637147368.0, + "step": 16703 + }, + { + "epoch": 2.124920493575881, + "ewc_loss": 0.04653849080204964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3269245502888225e-05, + "grad_norm": 29.82807731628418, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8802635669708252, + "num_tokens": 637181909.0, + "step": 16704 + }, + { + "epoch": 2.1250477038544715, + "ewc_loss": 0.046517323702573776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3258662622538395e-05, + "grad_norm": 29.934778213500977, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8706607818603516, + "num_tokens": 637218498.0, + "step": 16705 + }, + { + "epoch": 2.125174914133062, + "ewc_loss": 0.04669603705406189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3348018657998182e-05, + "grad_norm": 29.93136978149414, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8718061447143555, + "num_tokens": 637254825.0, + "step": 16706 + }, + { + "epoch": 2.1253021244116526, + "ewc_loss": 0.046574804931879044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328740265511442e-05, + "grad_norm": 29.960969924926758, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8616525530815125, + "num_tokens": 637301593.0, + "step": 16707 + }, + { + "epoch": 2.125429334690243, + "ewc_loss": 0.046582937240600586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3291468096431345e-05, + "grad_norm": 29.925989151000977, + "learning_rate": 1e-06, + "loss": 0.3928, + 
"mean_token_accuracy": 0.8828590512275696, + "num_tokens": 637341676.0, + "step": 16708 + }, + { + "epoch": 2.1255565449688336, + "ewc_loss": 0.04662935808300972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.331467840122059e-05, + "grad_norm": 30.029741287231445, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8753749132156372, + "num_tokens": 637378840.0, + "step": 16709 + }, + { + "epoch": 2.125683755247424, + "ewc_loss": 0.04662151262164116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3310756660066545e-05, + "grad_norm": 30.03429412841797, + "learning_rate": 1e-06, + "loss": 0.4759, + "mean_token_accuracy": 0.8532475829124451, + "num_tokens": 637412711.0, + "step": 16710 + }, + { + "epoch": 2.1258109655260147, + "ewc_loss": 0.046541325747966766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3270662495633587e-05, + "grad_norm": 29.865711212158203, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8694987297058105, + "num_tokens": 637448287.0, + "step": 16711 + }, + { + "epoch": 2.125938175804605, + "ewc_loss": 0.04660295695066452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3301477995119058e-05, + "grad_norm": 30.02163314819336, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.86967933177948, + "num_tokens": 637487899.0, + "step": 16712 + }, + { + "epoch": 2.1260653860831957, + "ewc_loss": 0.04660819470882416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3304097339860164e-05, + "grad_norm": 29.95741844177246, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8754581212997437, + "num_tokens": 637528161.0, + "step": 16713 + }, + { + "epoch": 2.1261925963617863, + "ewc_loss": 0.046637002378702164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3318501916946843e-05, + "grad_norm": 30.069889068603516, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8581535816192627, + "num_tokens": 637565692.0, + "step": 16714 + }, + { + "epoch": 
2.1263198066403763, + "ewc_loss": 0.04666014015674591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3330070689553395e-05, + "grad_norm": 30.035669326782227, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.857917308807373, + "num_tokens": 637600368.0, + "step": 16715 + }, + { + "epoch": 2.126447016918967, + "ewc_loss": 0.046588167548179626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3294083803193644e-05, + "grad_norm": 29.95258331298828, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8691962957382202, + "num_tokens": 637635732.0, + "step": 16716 + }, + { + "epoch": 2.1265742271975574, + "ewc_loss": 0.04663361236453056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.331680661882274e-05, + "grad_norm": 29.94588279724121, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8659918904304504, + "num_tokens": 637677127.0, + "step": 16717 + }, + { + "epoch": 2.126701437476148, + "ewc_loss": 0.04660734534263611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3303673515329137e-05, + "grad_norm": 29.99103546142578, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8711903095245361, + "num_tokens": 637715278.0, + "step": 16718 + }, + { + "epoch": 2.1268286477547385, + "ewc_loss": 0.04662603512406349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3313017663895153e-05, + "grad_norm": 29.907508850097656, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8735343813896179, + "num_tokens": 637742937.0, + "step": 16719 + }, + { + "epoch": 2.126955858033329, + "ewc_loss": 0.04656553640961647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3282767870114185e-05, + "grad_norm": 30.08260154724121, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8743555545806885, + "num_tokens": 637776761.0, + "step": 16720 + }, + { + "epoch": 2.1270830683119195, + "ewc_loss": 0.04666765779256821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.333382872166112e-05, + "grad_norm": 30.029335021972656, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8730643391609192, + "num_tokens": 637812197.0, + "step": 16721 + }, + { + "epoch": 2.12721027859051, + "ewc_loss": 0.04664488881826401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3322443666984327e-05, + "grad_norm": 30.189062118530273, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8750185966491699, + "num_tokens": 637850442.0, + "step": 16722 + }, + { + "epoch": 2.1273374888691006, + "ewc_loss": 0.04660256579518318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3301283363252878e-05, + "grad_norm": 30.07056999206543, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.871271550655365, + "num_tokens": 637889352.0, + "step": 16723 + }, + { + "epoch": 2.127464699147691, + "ewc_loss": 0.0465070866048336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3253543986356817e-05, + "grad_norm": 30.152124404907227, + "learning_rate": 1e-06, + "loss": 0.4674, + "mean_token_accuracy": 0.8525341749191284, + "num_tokens": 637927195.0, + "step": 16724 + }, + { + "epoch": 2.1275919094262816, + "ewc_loss": 0.04661625623703003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330812822037842e-05, + "grad_norm": 30.098134994506836, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8593112826347351, + "num_tokens": 637969431.0, + "step": 16725 + }, + { + "epoch": 2.127719119704872, + "ewc_loss": 0.04662315919995308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3311578843276948e-05, + "grad_norm": 30.131345748901367, + "learning_rate": 1e-06, + "loss": 0.368, + "mean_token_accuracy": 0.8876850008964539, + "num_tokens": 638013910.0, + "step": 16726 + }, + { + "epoch": 2.1278463299834627, + "ewc_loss": 0.04657488316297531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3287440853891894e-05, + "grad_norm": 30.05971908569336, + "learning_rate": 1e-06, + "loss": 0.4525, + 
"mean_token_accuracy": 0.8602662682533264, + "num_tokens": 638051848.0, + "step": 16727 + }, + { + "epoch": 2.127973540262053, + "ewc_loss": 0.04655658081173897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3278289518202655e-05, + "grad_norm": 29.975404739379883, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8718345165252686, + "num_tokens": 638092192.0, + "step": 16728 + }, + { + "epoch": 2.1281007505406437, + "ewc_loss": 0.0466206818819046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3310340111493133e-05, + "grad_norm": 30.206161499023438, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8801975846290588, + "num_tokens": 638134306.0, + "step": 16729 + }, + { + "epoch": 2.1282279608192343, + "ewc_loss": 0.04660777747631073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3303888156078756e-05, + "grad_norm": 30.10706901550293, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8566603660583496, + "num_tokens": 638177988.0, + "step": 16730 + }, + { + "epoch": 2.128355171097825, + "ewc_loss": 0.04662518948316574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3312593839364126e-05, + "grad_norm": 30.012170791625977, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8814889788627625, + "num_tokens": 638212237.0, + "step": 16731 + }, + { + "epoch": 2.1284823813764153, + "ewc_loss": 0.04648241400718689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.324120760022197e-05, + "grad_norm": 29.941598892211914, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8638336658477783, + "num_tokens": 638252492.0, + "step": 16732 + }, + { + "epoch": 2.128609591655006, + "ewc_loss": 0.04655155539512634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3275777493836358e-05, + "grad_norm": 30.035865783691406, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8698304891586304, + "num_tokens": 638294308.0, + "step": 16733 + }, + { + "epoch": 
2.1287368019335964, + "ewc_loss": 0.04661563038825989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330781535420101e-05, + "grad_norm": 29.981185913085938, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8747373819351196, + "num_tokens": 638330178.0, + "step": 16734 + }, + { + "epoch": 2.128864012212187, + "ewc_loss": 0.04652899503707886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.326449794054497e-05, + "grad_norm": 29.976665496826172, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8501945734024048, + "num_tokens": 638372731.0, + "step": 16735 + }, + { + "epoch": 2.1289912224907774, + "ewc_loss": 0.04658693075180054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3293465346796438e-05, + "grad_norm": 29.96222496032715, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8666703701019287, + "num_tokens": 638407522.0, + "step": 16736 + }, + { + "epoch": 2.129118432769368, + "ewc_loss": 0.046638861298561096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3319431420532055e-05, + "grad_norm": 30.0675048828125, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8721203804016113, + "num_tokens": 638443262.0, + "step": 16737 + }, + { + "epoch": 2.129245643047958, + "ewc_loss": 0.04658139497041702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329069684492424e-05, + "grad_norm": 29.871774673461914, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.871005117893219, + "num_tokens": 638482729.0, + "step": 16738 + }, + { + "epoch": 2.129372853326549, + "ewc_loss": 0.04662466421723366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3312331904890016e-05, + "grad_norm": 30.038877487182617, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.87865149974823, + "num_tokens": 638520034.0, + "step": 16739 + }, + { + "epoch": 2.129500063605139, + "ewc_loss": 0.04665917530655861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3329586838372052e-05, + "grad_norm": 29.959842681884766, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8780885934829712, + "num_tokens": 638558631.0, + "step": 16740 + }, + { + "epoch": 2.1296272738837296, + "ewc_loss": 0.04658128321170807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3290642275242135e-05, + "grad_norm": 29.87022590637207, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.881641149520874, + "num_tokens": 638593565.0, + "step": 16741 + }, + { + "epoch": 2.12975448416232, + "ewc_loss": 0.04666677489876747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333338670723606e-05, + "grad_norm": 30.066043853759766, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8742740154266357, + "num_tokens": 638625952.0, + "step": 16742 + }, + { + "epoch": 2.1298816944409107, + "ewc_loss": 0.04673284292221069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3366421373793855e-05, + "grad_norm": 29.947025299072266, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8675259351730347, + "num_tokens": 638662000.0, + "step": 16743 + }, + { + "epoch": 2.130008904719501, + "ewc_loss": 0.04661645367741585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3308226445806213e-05, + "grad_norm": 30.03882598876953, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8571053147315979, + "num_tokens": 638706020.0, + "step": 16744 + }, + { + "epoch": 2.1301361149980917, + "ewc_loss": 0.04670978710055351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3354894437943585e-05, + "grad_norm": 30.00559425354004, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8770508170127869, + "num_tokens": 638745354.0, + "step": 16745 + }, + { + "epoch": 2.1302633252766823, + "ewc_loss": 0.04663265869021416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.331633004359901e-05, + "grad_norm": 30.023561477661133, + "learning_rate": 1e-06, + "loss": 0.4562, + 
"mean_token_accuracy": 0.8639217615127563, + "num_tokens": 638785497.0, + "step": 16746 + }, + { + "epoch": 2.130390535555273, + "ewc_loss": 0.04665268212556839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3326341761276126e-05, + "grad_norm": 29.980527877807617, + "learning_rate": 1e-06, + "loss": 0.4666, + "mean_token_accuracy": 0.8507304191589355, + "num_tokens": 638821650.0, + "step": 16747 + }, + { + "epoch": 2.1305177458338633, + "ewc_loss": 0.0466017872095108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330089409952052e-05, + "grad_norm": 30.077816009521484, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8564382791519165, + "num_tokens": 638855589.0, + "step": 16748 + }, + { + "epoch": 2.130644956112454, + "ewc_loss": 0.04672527685761452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336263787583448e-05, + "grad_norm": 30.04378890991211, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8699657917022705, + "num_tokens": 638895766.0, + "step": 16749 + }, + { + "epoch": 2.1307721663910444, + "ewc_loss": 0.046590451151132584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329522612853907e-05, + "grad_norm": 29.987504959106445, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8604651093482971, + "num_tokens": 638932622.0, + "step": 16750 + }, + { + "epoch": 2.130899376669635, + "ewc_loss": 0.046814389526844025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3407194021274336e-05, + "grad_norm": 30.09636688232422, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8604080677032471, + "num_tokens": 638971588.0, + "step": 16751 + }, + { + "epoch": 2.1310265869482254, + "ewc_loss": 0.04662400856614113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3312004486797377e-05, + "grad_norm": 30.01548957824707, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8758866190910339, + "num_tokens": 639009921.0, + "step": 16752 + }, + { + "epoch": 
2.131153797226816, + "ewc_loss": 0.04662115499377251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3310576580115594e-05, + "grad_norm": 30.08557891845703, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8699254989624023, + "num_tokens": 639048160.0, + "step": 16753 + }, + { + "epoch": 2.1312810075054065, + "ewc_loss": 0.04663835093379021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3319174943026155e-05, + "grad_norm": 30.10797691345215, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.882168710231781, + "num_tokens": 639080260.0, + "step": 16754 + }, + { + "epoch": 2.131408217783997, + "ewc_loss": 0.04655737057328224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3278686057892628e-05, + "grad_norm": 29.985950469970703, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8627159595489502, + "num_tokens": 639118758.0, + "step": 16755 + }, + { + "epoch": 2.1315354280625876, + "ewc_loss": 0.046650372445583344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3325186703004874e-05, + "grad_norm": 29.993066787719727, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8712005615234375, + "num_tokens": 639159528.0, + "step": 16756 + }, + { + "epoch": 2.131662638341178, + "ewc_loss": 0.04660642519593239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330321331101004e-05, + "grad_norm": 30.094505310058594, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8737145662307739, + "num_tokens": 639200394.0, + "step": 16757 + }, + { + "epoch": 2.1317898486197686, + "ewc_loss": 0.04660917446017265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3304586648009717e-05, + "grad_norm": 29.895771026611328, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.861293613910675, + "num_tokens": 639240938.0, + "step": 16758 + }, + { + "epoch": 2.131917058898359, + "ewc_loss": 0.04661005735397339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.330502866243478e-05, + "grad_norm": 29.929088592529297, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8656736016273499, + "num_tokens": 639278097.0, + "step": 16759 + }, + { + "epoch": 2.1320442691769497, + "ewc_loss": 0.046655815094709396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.332790791115258e-05, + "grad_norm": 30.017488479614258, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.869920551776886, + "num_tokens": 639324512.0, + "step": 16760 + }, + { + "epoch": 2.13217147945554, + "ewc_loss": 0.046597398817539215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329869857931044e-05, + "grad_norm": 29.883451461791992, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8750121593475342, + "num_tokens": 639366833.0, + "step": 16761 + }, + { + "epoch": 2.1322986897341307, + "ewc_loss": 0.04663407430052757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.331703763047699e-05, + "grad_norm": 30.044095993041992, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8557376861572266, + "num_tokens": 639405765.0, + "step": 16762 + }, + { + "epoch": 2.132425900012721, + "ewc_loss": 0.046705495566129684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.33527480304474e-05, + "grad_norm": 29.889253616333008, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8665969371795654, + "num_tokens": 639439622.0, + "step": 16763 + }, + { + "epoch": 2.1325531102913113, + "ewc_loss": 0.046672333031892776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3336166123044677e-05, + "grad_norm": 30.055652618408203, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8810174465179443, + "num_tokens": 639479943.0, + "step": 16764 + }, + { + "epoch": 2.132680320569902, + "ewc_loss": 0.046689584851264954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334479177079629e-05, + "grad_norm": 29.95377540588379, + "learning_rate": 1e-06, + "loss": 0.4387, + 
"mean_token_accuracy": 0.8647500872612, + "num_tokens": 639520487.0, + "step": 16765 + }, + { + "epoch": 2.1328075308484924, + "ewc_loss": 0.04670998081564903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3354990844381973e-05, + "grad_norm": 30.008468627929688, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8692601323127747, + "num_tokens": 639558997.0, + "step": 16766 + }, + { + "epoch": 2.132934741127083, + "ewc_loss": 0.04674583673477173, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337291880394332e-05, + "grad_norm": 29.973451614379883, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8694612383842468, + "num_tokens": 639594569.0, + "step": 16767 + }, + { + "epoch": 2.1330619514056735, + "ewc_loss": 0.046567369252443314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3283684640773572e-05, + "grad_norm": 29.938396453857422, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8830693960189819, + "num_tokens": 639629827.0, + "step": 16768 + }, + { + "epoch": 2.133189161684264, + "ewc_loss": 0.04670194536447525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335097269678954e-05, + "grad_norm": 29.94261932373047, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8636921048164368, + "num_tokens": 639667634.0, + "step": 16769 + }, + { + "epoch": 2.1333163719628545, + "ewc_loss": 0.046669866889715195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3334932848229073e-05, + "grad_norm": 29.877756118774414, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8675481081008911, + "num_tokens": 639700536.0, + "step": 16770 + }, + { + "epoch": 2.133443582241445, + "ewc_loss": 0.046679265797138214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3339633116847835e-05, + "grad_norm": 29.876354217529297, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8721339106559753, + "num_tokens": 639744006.0, + "step": 16771 + }, + { + "epoch": 
2.1335707925200356, + "ewc_loss": 0.04672306776046753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3361533749266528e-05, + "grad_norm": 29.971460342407227, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8760350346565247, + "num_tokens": 639784950.0, + "step": 16772 + }, + { + "epoch": 2.133698002798626, + "ewc_loss": 0.046668972820043564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3334487195825204e-05, + "grad_norm": 29.95469856262207, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8842613101005554, + "num_tokens": 639823567.0, + "step": 16773 + }, + { + "epoch": 2.1338252130772166, + "ewc_loss": 0.046704500913619995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335224962735083e-05, + "grad_norm": 29.950260162353516, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8803868889808655, + "num_tokens": 639863386.0, + "step": 16774 + }, + { + "epoch": 2.133952423355807, + "ewc_loss": 0.04661165550351143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3305827198782936e-05, + "grad_norm": 29.893268585205078, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8593314290046692, + "num_tokens": 639901219.0, + "step": 16775 + }, + { + "epoch": 2.1340796336343977, + "ewc_loss": 0.04672449827194214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336224861210212e-05, + "grad_norm": 29.996755599975586, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8541326522827148, + "num_tokens": 639935240.0, + "step": 16776 + }, + { + "epoch": 2.134206843912988, + "ewc_loss": 0.04671167954802513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335584031243343e-05, + "grad_norm": 29.953405380249023, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8620818853378296, + "num_tokens": 639975370.0, + "step": 16777 + }, + { + "epoch": 2.1343340541915787, + "ewc_loss": 0.04668499529361725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.334249802515842e-05, + "grad_norm": 30.023601531982422, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8523401021957397, + "num_tokens": 640020463.0, + "step": 16778 + }, + { + "epoch": 2.1344612644701693, + "ewc_loss": 0.04672738164663315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3363691070699133e-05, + "grad_norm": 29.960107803344727, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8652058243751526, + "num_tokens": 640057142.0, + "step": 16779 + }, + { + "epoch": 2.13458847474876, + "ewc_loss": 0.04666812717914581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3334063371294178e-05, + "grad_norm": 30.020559310913086, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8669664859771729, + "num_tokens": 640098879.0, + "step": 16780 + }, + { + "epoch": 2.1347156850273503, + "ewc_loss": 0.04671541228890419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3357706595561467e-05, + "grad_norm": 30.029817581176758, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8720105886459351, + "num_tokens": 640136077.0, + "step": 16781 + }, + { + "epoch": 2.134842895305941, + "ewc_loss": 0.04664561152458191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3322805645875633e-05, + "grad_norm": 29.923824310302734, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8566253781318665, + "num_tokens": 640175160.0, + "step": 16782 + }, + { + "epoch": 2.1349701055845314, + "ewc_loss": 0.04667626693844795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3338134269579314e-05, + "grad_norm": 29.97704315185547, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8945497274398804, + "num_tokens": 640216670.0, + "step": 16783 + }, + { + "epoch": 2.135097315863122, + "ewc_loss": 0.0465836264193058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329181370441802e-05, + "grad_norm": 29.938282012939453, + "learning_rate": 1e-06, + "loss": 0.4339, + 
"mean_token_accuracy": 0.8677818775177002, + "num_tokens": 640254872.0, + "step": 16784 + }, + { + "epoch": 2.1352245261417124, + "ewc_loss": 0.04675403982400894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3377020625048317e-05, + "grad_norm": 30.070825576782227, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8683076500892639, + "num_tokens": 640289319.0, + "step": 16785 + }, + { + "epoch": 2.135351736420303, + "ewc_loss": 0.04659763351082802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.329881681362167e-05, + "grad_norm": 29.9886531829834, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8845431804656982, + "num_tokens": 640323897.0, + "step": 16786 + }, + { + "epoch": 2.1354789466988935, + "ewc_loss": 0.046726930886507034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3363465516013093e-05, + "grad_norm": 30.00657081604004, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.875908613204956, + "num_tokens": 640366203.0, + "step": 16787 + }, + { + "epoch": 2.1356061569774836, + "ewc_loss": 0.04665729030966759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3328644601861015e-05, + "grad_norm": 29.93075180053711, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8799164891242981, + "num_tokens": 640408089.0, + "step": 16788 + }, + { + "epoch": 2.135733367256074, + "ewc_loss": 0.04675177112221718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3375885575660504e-05, + "grad_norm": 30.033565521240234, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8788036108016968, + "num_tokens": 640443532.0, + "step": 16789 + }, + { + "epoch": 2.1358605775346646, + "ewc_loss": 0.046685051172971725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3342525309999473e-05, + "grad_norm": 30.06464195251465, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8637207746505737, + "num_tokens": 640480425.0, + "step": 16790 + }, + { + "epoch": 
2.135987787813255, + "ewc_loss": 0.0466729998588562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333650081709493e-05, + "grad_norm": 30.022029876708984, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8723164200782776, + "num_tokens": 640525658.0, + "step": 16791 + }, + { + "epoch": 2.1361149980918457, + "ewc_loss": 0.046655118465423584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3327558665187098e-05, + "grad_norm": 30.038713455200195, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8798909187316895, + "num_tokens": 640563276.0, + "step": 16792 + }, + { + "epoch": 2.136242208370436, + "ewc_loss": 0.046622030436992645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.331101495656185e-05, + "grad_norm": 29.956443786621094, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8628509044647217, + "num_tokens": 640608150.0, + "step": 16793 + }, + { + "epoch": 2.1363694186490267, + "ewc_loss": 0.04665428400039673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3327142116613686e-05, + "grad_norm": 30.013776779174805, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8717308044433594, + "num_tokens": 640646127.0, + "step": 16794 + }, + { + "epoch": 2.1364966289276173, + "ewc_loss": 0.04665002599358559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3325013899011537e-05, + "grad_norm": 29.998506546020508, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8761759996414185, + "num_tokens": 640683195.0, + "step": 16795 + }, + { + "epoch": 2.136623839206208, + "ewc_loss": 0.04664783179759979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3323915229411796e-05, + "grad_norm": 29.970348358154297, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8852752447128296, + "num_tokens": 640724102.0, + "step": 16796 + }, + { + "epoch": 2.1367510494847983, + "ewc_loss": 0.046749990433454514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.337499608984217e-05, + "grad_norm": 29.953460693359375, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8676242232322693, + "num_tokens": 640762624.0, + "step": 16797 + }, + { + "epoch": 2.136878259763389, + "ewc_loss": 0.046729978173971176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3364989829133265e-05, + "grad_norm": 30.0894775390625, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8664502501487732, + "num_tokens": 640799437.0, + "step": 16798 + }, + { + "epoch": 2.1370054700419794, + "ewc_loss": 0.04678083956241608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3390419300994836e-05, + "grad_norm": 30.130142211914062, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8758153319358826, + "num_tokens": 640837115.0, + "step": 16799 + }, + { + "epoch": 2.13713268032057, + "ewc_loss": 0.046622488647699356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3311244149226695e-05, + "grad_norm": 29.868968963623047, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8707940578460693, + "num_tokens": 640882312.0, + "step": 16800 + }, + { + "epoch": 2.1372598905991604, + "ewc_loss": 0.0466657392680645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3332870114245452e-05, + "grad_norm": 29.95779800415039, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8702071905136108, + "num_tokens": 640921032.0, + "step": 16801 + }, + { + "epoch": 2.137387100877751, + "ewc_loss": 0.04671672359108925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3358361431746744e-05, + "grad_norm": 29.947832107543945, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.871665358543396, + "num_tokens": 640957424.0, + "step": 16802 + }, + { + "epoch": 2.1375143111563415, + "ewc_loss": 0.04662410169839859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.331205178052187e-05, + "grad_norm": 29.85814094543457, + "learning_rate": 1e-06, + "loss": 0.4557, + 
"mean_token_accuracy": 0.8622303009033203, + "num_tokens": 640992985.0, + "step": 16803 + }, + { + "epoch": 2.137641521434932, + "ewc_loss": 0.046786874532699585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.339343700441532e-05, + "grad_norm": 30.086284637451172, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8791399598121643, + "num_tokens": 641030379.0, + "step": 16804 + }, + { + "epoch": 2.1377687317135226, + "ewc_loss": 0.04676622524857521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3383112420560792e-05, + "grad_norm": 29.874765396118164, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8627057075500488, + "num_tokens": 641068526.0, + "step": 16805 + }, + { + "epoch": 2.137895941992113, + "ewc_loss": 0.046732347458601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3366173991234973e-05, + "grad_norm": 30.11910629272461, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8639485239982605, + "num_tokens": 641103351.0, + "step": 16806 + }, + { + "epoch": 2.1380231522707036, + "ewc_loss": 0.04676859453320503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.33842965826625e-05, + "grad_norm": 29.943374633789062, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8721876740455627, + "num_tokens": 641143570.0, + "step": 16807 + }, + { + "epoch": 2.138150362549294, + "ewc_loss": 0.04655687138438225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.327843503735494e-05, + "grad_norm": 29.966773986816406, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8762074708938599, + "num_tokens": 641180625.0, + "step": 16808 + }, + { + "epoch": 2.1382775728278847, + "ewc_loss": 0.04680713266134262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3403566956403665e-05, + "grad_norm": 30.042057037353516, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8493639230728149, + "num_tokens": 641226770.0, + "step": 16809 + }, + { + "epoch": 
2.138404783106475, + "ewc_loss": 0.046672847121953964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333642441953998e-05, + "grad_norm": 30.015647888183594, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8821001052856445, + "num_tokens": 641269682.0, + "step": 16810 + }, + { + "epoch": 2.1385319933850653, + "ewc_loss": 0.04667414352297783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3337071979767643e-05, + "grad_norm": 29.899368286132812, + "learning_rate": 1e-06, + "loss": 0.3777, + "mean_token_accuracy": 0.8823597431182861, + "num_tokens": 641303803.0, + "step": 16811 + }, + { + "epoch": 2.1386592036636562, + "ewc_loss": 0.04668722301721573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334361124667339e-05, + "grad_norm": 29.940704345703125, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8643461465835571, + "num_tokens": 641343141.0, + "step": 16812 + }, + { + "epoch": 2.1387864139422463, + "ewc_loss": 0.04666955769062042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333477823412977e-05, + "grad_norm": 29.971277236938477, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8746239542961121, + "num_tokens": 641384006.0, + "step": 16813 + }, + { + "epoch": 2.138913624220837, + "ewc_loss": 0.046708181500434875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3354090444627218e-05, + "grad_norm": 29.95247459411621, + "learning_rate": 1e-06, + "loss": 0.4849, + "mean_token_accuracy": 0.8497125506401062, + "num_tokens": 641423998.0, + "step": 16814 + }, + { + "epoch": 2.1390408344994274, + "ewc_loss": 0.04666991904377937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3334960133070126e-05, + "grad_norm": 29.973209381103516, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8695663213729858, + "num_tokens": 641457886.0, + "step": 16815 + }, + { + "epoch": 2.139168044778018, + "ewc_loss": 0.04671793803572655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3358968974207528e-05, + "grad_norm": 29.941679000854492, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8705551624298096, + "num_tokens": 641503386.0, + "step": 16816 + }, + { + "epoch": 2.1392952550566084, + "ewc_loss": 0.046673666685819626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333683369215578e-05, + "grad_norm": 29.937978744506836, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8811498880386353, + "num_tokens": 641539533.0, + "step": 16817 + }, + { + "epoch": 2.139422465335199, + "ewc_loss": 0.046650391072034836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3325195797951892e-05, + "grad_norm": 29.992223739624023, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.864822506904602, + "num_tokens": 641574020.0, + "step": 16818 + }, + { + "epoch": 2.1395496756137895, + "ewc_loss": 0.046724993735551834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3362495994661003e-05, + "grad_norm": 29.946002960205078, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8743449449539185, + "num_tokens": 641605206.0, + "step": 16819 + }, + { + "epoch": 2.13967688589238, + "ewc_loss": 0.046760447323322296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.338022386538796e-05, + "grad_norm": 30.162466049194336, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.860630214214325, + "num_tokens": 641649857.0, + "step": 16820 + }, + { + "epoch": 2.1398040961709706, + "ewc_loss": 0.046747706830501556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3373853764496744e-05, + "grad_norm": 29.945316314697266, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8650771379470825, + "num_tokens": 641683360.0, + "step": 16821 + }, + { + "epoch": 2.139931306449561, + "ewc_loss": 0.0466354675590992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3317734303418547e-05, + "grad_norm": 30.086042404174805, + "learning_rate": 1e-06, + "loss": 0.4258, + 
"mean_token_accuracy": 0.8707343339920044, + "num_tokens": 641720105.0, + "step": 16822 + }, + { + "epoch": 2.1400585167281516, + "ewc_loss": 0.04680057242512703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340028549951967e-05, + "grad_norm": 30.05071258544922, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8735080361366272, + "num_tokens": 641750718.0, + "step": 16823 + }, + { + "epoch": 2.140185727006742, + "ewc_loss": 0.04665571078658104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3327855160459876e-05, + "grad_norm": 29.980655670166016, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8683638572692871, + "num_tokens": 641791620.0, + "step": 16824 + }, + { + "epoch": 2.1403129372853327, + "ewc_loss": 0.046761248260736465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.338062404305674e-05, + "grad_norm": 30.102249145507812, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8581861853599548, + "num_tokens": 641833566.0, + "step": 16825 + }, + { + "epoch": 2.140440147563923, + "ewc_loss": 0.046617958694696426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330897950741928e-05, + "grad_norm": 29.861597061157227, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8769993185997009, + "num_tokens": 641869080.0, + "step": 16826 + }, + { + "epoch": 2.1405673578425137, + "ewc_loss": 0.046632733196020126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.331636642338708e-05, + "grad_norm": 30.052040100097656, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8675955533981323, + "num_tokens": 641910195.0, + "step": 16827 + }, + { + "epoch": 2.1406945681211043, + "ewc_loss": 0.04672912508249283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336456236662343e-05, + "grad_norm": 29.891582489013672, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8570177555084229, + "num_tokens": 641947270.0, + "step": 16828 + }, + { + "epoch": 
2.140821778399695, + "ewc_loss": 0.04668298736214638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3341493943007663e-05, + "grad_norm": 30.04669952392578, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8718600273132324, + "num_tokens": 641986472.0, + "step": 16829 + }, + { + "epoch": 2.1409489886782853, + "ewc_loss": 0.04676095396280289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3380476704915054e-05, + "grad_norm": 29.977285385131836, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8563629984855652, + "num_tokens": 642028130.0, + "step": 16830 + }, + { + "epoch": 2.141076198956876, + "ewc_loss": 0.04668927192687988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3344635337707587e-05, + "grad_norm": 30.060529708862305, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8648831844329834, + "num_tokens": 642067097.0, + "step": 16831 + }, + { + "epoch": 2.1412034092354664, + "ewc_loss": 0.04668130353093147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334065175091382e-05, + "grad_norm": 29.95112419128418, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8813050389289856, + "num_tokens": 642099628.0, + "step": 16832 + }, + { + "epoch": 2.141330619514057, + "ewc_loss": 0.04672585427761078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3362927095149644e-05, + "grad_norm": 30.041454315185547, + "learning_rate": 1e-06, + "loss": 0.3481, + "mean_token_accuracy": 0.8930325508117676, + "num_tokens": 642135941.0, + "step": 16833 + }, + { + "epoch": 2.1414578297926474, + "ewc_loss": 0.046689607203006744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3344804503722116e-05, + "grad_norm": 29.96055793762207, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8629223108291626, + "num_tokens": 642169465.0, + "step": 16834 + }, + { + "epoch": 2.141585040071238, + "ewc_loss": 0.04667914658784866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.333957309019752e-05, + "grad_norm": 30.036766052246094, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8739997148513794, + "num_tokens": 642208961.0, + "step": 16835 + }, + { + "epoch": 2.141712250349828, + "ewc_loss": 0.046754140406847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3377069737762213e-05, + "grad_norm": 30.070213317871094, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.869401216506958, + "num_tokens": 642249691.0, + "step": 16836 + }, + { + "epoch": 2.141839460628419, + "ewc_loss": 0.04668319970369339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334159944439307e-05, + "grad_norm": 29.999378204345703, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8724111318588257, + "num_tokens": 642292425.0, + "step": 16837 + }, + { + "epoch": 2.141966670907009, + "ewc_loss": 0.046725623309612274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336281249881722e-05, + "grad_norm": 30.116403579711914, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8801203966140747, + "num_tokens": 642330982.0, + "step": 16838 + }, + { + "epoch": 2.1420938811855996, + "ewc_loss": 0.0467224158346653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3361208150163293e-05, + "grad_norm": 29.93654441833496, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8713302612304688, + "num_tokens": 642365414.0, + "step": 16839 + }, + { + "epoch": 2.14222109146419, + "ewc_loss": 0.0466126911342144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3306345610762946e-05, + "grad_norm": 30.07137680053711, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8640632629394531, + "num_tokens": 642398660.0, + "step": 16840 + }, + { + "epoch": 2.1423483017427807, + "ewc_loss": 0.04682626575231552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3413133021676913e-05, + "grad_norm": 30.148473739624023, + "learning_rate": 1e-06, + "loss": 0.4138, + 
"mean_token_accuracy": 0.8771737813949585, + "num_tokens": 642432605.0, + "step": 16841 + }, + { + "epoch": 2.142475512021371, + "ewc_loss": 0.04670425131917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335212593607139e-05, + "grad_norm": 30.018407821655273, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8793492913246155, + "num_tokens": 642472041.0, + "step": 16842 + }, + { + "epoch": 2.1426027222999617, + "ewc_loss": 0.04666662588715553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3333312128670514e-05, + "grad_norm": 30.094730377197266, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8799014091491699, + "num_tokens": 642512399.0, + "step": 16843 + }, + { + "epoch": 2.1427299325785523, + "ewc_loss": 0.04673527926206589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336764009669423e-05, + "grad_norm": 30.011192321777344, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8645187616348267, + "num_tokens": 642550829.0, + "step": 16844 + }, + { + "epoch": 2.142857142857143, + "ewc_loss": 0.046689845621585846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3344922738033347e-05, + "grad_norm": 30.208505630493164, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8703398704528809, + "num_tokens": 642591255.0, + "step": 16845 + }, + { + "epoch": 2.1429843531357333, + "ewc_loss": 0.046784576028585434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3392287403112277e-05, + "grad_norm": 30.17910385131836, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8723766803741455, + "num_tokens": 642625772.0, + "step": 16846 + }, + { + "epoch": 2.143111563414324, + "ewc_loss": 0.0465577207505703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.327886068087537e-05, + "grad_norm": 30.104936599731445, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8640359044075012, + "num_tokens": 642667596.0, + "step": 16847 + }, + { + "epoch": 
2.1432387736929144, + "ewc_loss": 0.046603765338659286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3301881810766645e-05, + "grad_norm": 30.02022361755371, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.872081458568573, + "num_tokens": 642702159.0, + "step": 16848 + }, + { + "epoch": 2.143365983971505, + "ewc_loss": 0.04669511690735817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3347558453679085e-05, + "grad_norm": 30.18992805480957, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8853122591972351, + "num_tokens": 642737307.0, + "step": 16849 + }, + { + "epoch": 2.1434931942500954, + "ewc_loss": 0.04668558016419411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334279088245239e-05, + "grad_norm": 30.11211585998535, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8608180284500122, + "num_tokens": 642776660.0, + "step": 16850 + }, + { + "epoch": 2.143620404528686, + "ewc_loss": 0.04664630442857742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.332315307285171e-05, + "grad_norm": 29.980751037597656, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8649277687072754, + "num_tokens": 642814131.0, + "step": 16851 + }, + { + "epoch": 2.1437476148072765, + "ewc_loss": 0.04675205424427986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337602745683398e-05, + "grad_norm": 30.1688289642334, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8509768843650818, + "num_tokens": 642854138.0, + "step": 16852 + }, + { + "epoch": 2.143874825085867, + "ewc_loss": 0.04675588756799698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337794467166532e-05, + "grad_norm": 30.11574935913086, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8773880004882812, + "num_tokens": 642893767.0, + "step": 16853 + }, + { + "epoch": 2.1440020353644575, + "ewc_loss": 0.04655155912041664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3275779312825762e-05, + "grad_norm": 30.01755714416504, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8851661086082458, + "num_tokens": 642928286.0, + "step": 16854 + }, + { + "epoch": 2.144129245643048, + "ewc_loss": 0.046741221100091934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337061050639022e-05, + "grad_norm": 30.131006240844727, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8782929182052612, + "num_tokens": 642971220.0, + "step": 16855 + }, + { + "epoch": 2.1442564559216386, + "ewc_loss": 0.04671092703938484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3355463781626895e-05, + "grad_norm": 30.08466911315918, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8738570213317871, + "num_tokens": 643013618.0, + "step": 16856 + }, + { + "epoch": 2.144383666200229, + "ewc_loss": 0.046681661158800125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334083001187537e-05, + "grad_norm": 30.06193733215332, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8729689717292786, + "num_tokens": 643058122.0, + "step": 16857 + }, + { + "epoch": 2.1445108764788197, + "ewc_loss": 0.04668104276061058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3340520783676766e-05, + "grad_norm": 30.111337661743164, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8579684495925903, + "num_tokens": 643096662.0, + "step": 16858 + }, + { + "epoch": 2.14463808675741, + "ewc_loss": 0.046560220420360565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3280110326595604e-05, + "grad_norm": 30.11458969116211, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.883239209651947, + "num_tokens": 643137923.0, + "step": 16859 + }, + { + "epoch": 2.1447652970360007, + "ewc_loss": 0.046665534377098083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333276643184945e-05, + "grad_norm": 30.013303756713867, + "learning_rate": 1e-06, + "loss": 0.3991, + 
"mean_token_accuracy": 0.8778327703475952, + "num_tokens": 643176918.0, + "step": 16860 + }, + { + "epoch": 2.144892507314591, + "ewc_loss": 0.04664190858602524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3320953914662823e-05, + "grad_norm": 29.993959426879883, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8886561989784241, + "num_tokens": 643217525.0, + "step": 16861 + }, + { + "epoch": 2.1450197175931813, + "ewc_loss": 0.04664556682109833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.332278381800279e-05, + "grad_norm": 30.03514289855957, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8876605033874512, + "num_tokens": 643255131.0, + "step": 16862 + }, + { + "epoch": 2.145146927871772, + "ewc_loss": 0.04672446474432945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336223224119749e-05, + "grad_norm": 30.129806518554688, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8789609670639038, + "num_tokens": 643292092.0, + "step": 16863 + }, + { + "epoch": 2.1452741381503624, + "ewc_loss": 0.046669505536556244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3334752768278122e-05, + "grad_norm": 30.072399139404297, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8679308891296387, + "num_tokens": 643330510.0, + "step": 16864 + }, + { + "epoch": 2.145401348428953, + "ewc_loss": 0.04661179333925247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330589632038027e-05, + "grad_norm": 30.018930435180664, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8659105896949768, + "num_tokens": 643372900.0, + "step": 16865 + }, + { + "epoch": 2.1455285587075434, + "ewc_loss": 0.04660015180706978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3300075554288924e-05, + "grad_norm": 29.956342697143555, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.878887414932251, + "num_tokens": 643412866.0, + "step": 16866 + }, + { + "epoch": 
2.145655768986134, + "ewc_loss": 0.046635180711746216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3317590603255667e-05, + "grad_norm": 30.136320114135742, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8545514345169067, + "num_tokens": 643452846.0, + "step": 16867 + }, + { + "epoch": 2.1457829792647245, + "ewc_loss": 0.04662231355905533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3311156837735325e-05, + "grad_norm": 30.075223922729492, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8765252232551575, + "num_tokens": 643488264.0, + "step": 16868 + }, + { + "epoch": 2.145910189543315, + "ewc_loss": 0.04661354422569275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3306771254283376e-05, + "grad_norm": 30.101152420043945, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.858936607837677, + "num_tokens": 643528131.0, + "step": 16869 + }, + { + "epoch": 2.1460373998219056, + "ewc_loss": 0.046635765582323074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3317883460549638e-05, + "grad_norm": 30.040498733520508, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8714118003845215, + "num_tokens": 643564041.0, + "step": 16870 + }, + { + "epoch": 2.146164610100496, + "ewc_loss": 0.046586405485868454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3293203412322327e-05, + "grad_norm": 30.049922943115234, + "learning_rate": 1e-06, + "loss": 0.4583, + "mean_token_accuracy": 0.8582166433334351, + "num_tokens": 643601507.0, + "step": 16871 + }, + { + "epoch": 2.1462918203790866, + "ewc_loss": 0.04662499949336052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3312499251915142e-05, + "grad_norm": 30.123205184936523, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8555623888969421, + "num_tokens": 643647161.0, + "step": 16872 + }, + { + "epoch": 2.146419030657677, + "ewc_loss": 0.04658406227827072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3292031983146444e-05, + "grad_norm": 29.989789962768555, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.876643717288971, + "num_tokens": 643687114.0, + "step": 16873 + }, + { + "epoch": 2.1465462409362677, + "ewc_loss": 0.04657594487071037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.328797199879773e-05, + "grad_norm": 30.052825927734375, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8556300401687622, + "num_tokens": 643722110.0, + "step": 16874 + }, + { + "epoch": 2.146673451214858, + "ewc_loss": 0.04663766175508499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3318831154028885e-05, + "grad_norm": 30.17057991027832, + "learning_rate": 1e-06, + "loss": 0.4998, + "mean_token_accuracy": 0.8489212393760681, + "num_tokens": 643765770.0, + "step": 16875 + }, + { + "epoch": 2.1468006614934487, + "ewc_loss": 0.04661912843585014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330956340301782e-05, + "grad_norm": 30.119014739990234, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.861930251121521, + "num_tokens": 643810097.0, + "step": 16876 + }, + { + "epoch": 2.1469278717720393, + "ewc_loss": 0.04666796699166298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3333983335760422e-05, + "grad_norm": 30.054929733276367, + "learning_rate": 1e-06, + "loss": 0.4948, + "mean_token_accuracy": 0.845914363861084, + "num_tokens": 643847904.0, + "step": 16877 + }, + { + "epoch": 2.14705508205063, + "ewc_loss": 0.04668280854821205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334140481252689e-05, + "grad_norm": 30.0037899017334, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8785021305084229, + "num_tokens": 643889157.0, + "step": 16878 + }, + { + "epoch": 2.1471822923292203, + "ewc_loss": 0.04673595726490021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336797842872329e-05, + "grad_norm": 30.118440628051758, + "learning_rate": 1e-06, + "loss": 0.4068, + 
"mean_token_accuracy": 0.8727689981460571, + "num_tokens": 643931181.0, + "step": 16879 + }, + { + "epoch": 2.147309502607811, + "ewc_loss": 0.04665611311793327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.332805706828367e-05, + "grad_norm": 29.975170135498047, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8685181140899658, + "num_tokens": 643965415.0, + "step": 16880 + }, + { + "epoch": 2.1474367128864014, + "ewc_loss": 0.04663301631808281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3316508304560557e-05, + "grad_norm": 29.991512298583984, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8554401397705078, + "num_tokens": 644010647.0, + "step": 16881 + }, + { + "epoch": 2.147563923164992, + "ewc_loss": 0.04668010026216507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3340049665421247e-05, + "grad_norm": 30.098119735717773, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8870742917060852, + "num_tokens": 644044347.0, + "step": 16882 + }, + { + "epoch": 2.1476911334435824, + "ewc_loss": 0.046727538108825684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3363769287243485e-05, + "grad_norm": 30.07717514038086, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8780558109283447, + "num_tokens": 644083456.0, + "step": 16883 + }, + { + "epoch": 2.147818343722173, + "ewc_loss": 0.04666991904377937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3334960133070126e-05, + "grad_norm": 30.014474868774414, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8615756034851074, + "num_tokens": 644123851.0, + "step": 16884 + }, + { + "epoch": 2.1479455540007635, + "ewc_loss": 0.04669869691133499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3349348339252174e-05, + "grad_norm": 30.168306350708008, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8795756697654724, + "num_tokens": 644168880.0, + "step": 16885 + }, + { + "epoch": 
2.1480727642793536, + "ewc_loss": 0.04673423618078232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3367118046735413e-05, + "grad_norm": 30.105873107910156, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8727868795394897, + "num_tokens": 644204532.0, + "step": 16886 + }, + { + "epoch": 2.148199974557944, + "ewc_loss": 0.04660347104072571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330173629161436e-05, + "grad_norm": 30.037012100219727, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.888421356678009, + "num_tokens": 644246160.0, + "step": 16887 + }, + { + "epoch": 2.1483271848365346, + "ewc_loss": 0.046672724187374115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333636257390026e-05, + "grad_norm": 30.037046432495117, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8832613229751587, + "num_tokens": 644284881.0, + "step": 16888 + }, + { + "epoch": 2.148454395115125, + "ewc_loss": 0.04666437953710556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3332189812208526e-05, + "grad_norm": 30.003582000732422, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8646870851516724, + "num_tokens": 644326661.0, + "step": 16889 + }, + { + "epoch": 2.1485816053937157, + "ewc_loss": 0.04664672538638115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3323362256633118e-05, + "grad_norm": 29.99897575378418, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8746699094772339, + "num_tokens": 644364420.0, + "step": 16890 + }, + { + "epoch": 2.148708815672306, + "ewc_loss": 0.0467262901365757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336314537387807e-05, + "grad_norm": 30.120161056518555, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.853780210018158, + "num_tokens": 644398626.0, + "step": 16891 + }, + { + "epoch": 2.1488360259508967, + "ewc_loss": 0.04668113589286804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3340568077401258e-05, + "grad_norm": 30.07656478881836, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8709940314292908, + "num_tokens": 644434772.0, + "step": 16892 + }, + { + "epoch": 2.1489632362294873, + "ewc_loss": 0.04667551815509796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3337759557762183e-05, + "grad_norm": 30.000394821166992, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8688004612922668, + "num_tokens": 644474425.0, + "step": 16893 + }, + { + "epoch": 2.149090446508078, + "ewc_loss": 0.046665869653224945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333293559786398e-05, + "grad_norm": 30.071125030517578, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8625656366348267, + "num_tokens": 644507796.0, + "step": 16894 + }, + { + "epoch": 2.1492176567866683, + "ewc_loss": 0.04668312892317772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.33415648835944e-05, + "grad_norm": 30.039316177368164, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8675402402877808, + "num_tokens": 644544172.0, + "step": 16895 + }, + { + "epoch": 2.149344867065259, + "ewc_loss": 0.046663254499435425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333162774448283e-05, + "grad_norm": 29.92868423461914, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8692449331283569, + "num_tokens": 644582217.0, + "step": 16896 + }, + { + "epoch": 2.1494720773438494, + "ewc_loss": 0.04669678956270218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3348395188804716e-05, + "grad_norm": 30.05420684814453, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8770495057106018, + "num_tokens": 644620028.0, + "step": 16897 + }, + { + "epoch": 2.14959928762244, + "ewc_loss": 0.04675685986876488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3378430341836065e-05, + "grad_norm": 29.99016571044922, + "learning_rate": 1e-06, + "loss": 0.4609, + 
"mean_token_accuracy": 0.8549017906188965, + "num_tokens": 644663743.0, + "step": 16898 + }, + { + "epoch": 2.1497264979010304, + "ewc_loss": 0.046799853444099426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.339992715860717e-05, + "grad_norm": 30.067920684814453, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8715959787368774, + "num_tokens": 644700914.0, + "step": 16899 + }, + { + "epoch": 2.149853708179621, + "ewc_loss": 0.04682689532637596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3413447706843726e-05, + "grad_norm": 30.171669006347656, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8674694299697876, + "num_tokens": 644744232.0, + "step": 16900 + }, + { + "epoch": 2.1499809184582115, + "ewc_loss": 0.046656928956508636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3328464521910064e-05, + "grad_norm": 29.899023056030273, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8631078004837036, + "num_tokens": 644780851.0, + "step": 16901 + }, + { + "epoch": 2.150108128736802, + "ewc_loss": 0.0467400997877121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337005025765393e-05, + "grad_norm": 30.157527923583984, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8782309293746948, + "num_tokens": 644822103.0, + "step": 16902 + }, + { + "epoch": 2.1502353390153925, + "ewc_loss": 0.046824779361486435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3412389055010863e-05, + "grad_norm": 30.02301788330078, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.867634654045105, + "num_tokens": 644861209.0, + "step": 16903 + }, + { + "epoch": 2.150362549293983, + "ewc_loss": 0.04661932960152626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3309665266424417e-05, + "grad_norm": 30.0063419342041, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.8571804761886597, + "num_tokens": 644899860.0, + "step": 16904 + }, + { + "epoch": 
2.1504897595725736, + "ewc_loss": 0.046822987496852875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3411494112224318e-05, + "grad_norm": 30.129680633544922, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8658127784729004, + "num_tokens": 644943101.0, + "step": 16905 + }, + { + "epoch": 2.150616969851164, + "ewc_loss": 0.04668870568275452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3344353394350037e-05, + "grad_norm": 30.069746017456055, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8591846823692322, + "num_tokens": 644981786.0, + "step": 16906 + }, + { + "epoch": 2.1507441801297547, + "ewc_loss": 0.046701278537511826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335063982172869e-05, + "grad_norm": 30.00486946105957, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.873091459274292, + "num_tokens": 645017949.0, + "step": 16907 + }, + { + "epoch": 2.150871390408345, + "ewc_loss": 0.04676038399338722, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3380191123578697e-05, + "grad_norm": 30.10157012939453, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8686556816101074, + "num_tokens": 645053443.0, + "step": 16908 + }, + { + "epoch": 2.1509986006869353, + "ewc_loss": 0.04678434133529663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.339217098779045e-05, + "grad_norm": 30.074108123779297, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8719822764396667, + "num_tokens": 645089916.0, + "step": 16909 + }, + { + "epoch": 2.1511258109655262, + "ewc_loss": 0.0467018224298954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335091085114982e-05, + "grad_norm": 30.011873245239258, + "learning_rate": 1e-06, + "loss": 0.4734, + "mean_token_accuracy": 0.854817271232605, + "num_tokens": 645134531.0, + "step": 16910 + }, + { + "epoch": 2.1512530212441163, + "ewc_loss": 0.04687057435512543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3435286493622698e-05, + "grad_norm": 30.13067626953125, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8652557134628296, + "num_tokens": 645172608.0, + "step": 16911 + }, + { + "epoch": 2.151380231522707, + "ewc_loss": 0.04677249491214752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.33862465393031e-05, + "grad_norm": 29.961210250854492, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8658699989318848, + "num_tokens": 645209895.0, + "step": 16912 + }, + { + "epoch": 2.1515074418012974, + "ewc_loss": 0.046783093363046646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3391547074425034e-05, + "grad_norm": 30.192052841186523, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8757942914962769, + "num_tokens": 645245259.0, + "step": 16913 + }, + { + "epoch": 2.151634652079888, + "ewc_loss": 0.04694211110472679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3471055101254024e-05, + "grad_norm": 30.03318214416504, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8654756546020508, + "num_tokens": 645284962.0, + "step": 16914 + }, + { + "epoch": 2.1517618623584784, + "ewc_loss": 0.04674067348241806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337033765797969e-05, + "grad_norm": 30.079044342041016, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8843751549720764, + "num_tokens": 645322352.0, + "step": 16915 + }, + { + "epoch": 2.151889072637069, + "ewc_loss": 0.0468733049929142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.343665255466476e-05, + "grad_norm": 30.110950469970703, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.868706464767456, + "num_tokens": 645358401.0, + "step": 16916 + }, + { + "epoch": 2.1520162829156595, + "ewc_loss": 0.04681617021560669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3408085326082073e-05, + "grad_norm": 30.116626739501953, + "learning_rate": 1e-06, + "loss": 0.3815, + 
"mean_token_accuracy": 0.8826305270195007, + "num_tokens": 645399147.0, + "step": 16917 + }, + { + "epoch": 2.15214349319425, + "ewc_loss": 0.04681454226374626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3407270418829285e-05, + "grad_norm": 30.182872772216797, + "learning_rate": 1e-06, + "loss": 0.4743, + "mean_token_accuracy": 0.8515610694885254, + "num_tokens": 645437286.0, + "step": 16918 + }, + { + "epoch": 2.1522707034728406, + "ewc_loss": 0.04672970622777939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3364853404928e-05, + "grad_norm": 30.054723739624023, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8635647892951965, + "num_tokens": 645477853.0, + "step": 16919 + }, + { + "epoch": 2.152397913751431, + "ewc_loss": 0.04677192121744156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3385960957966745e-05, + "grad_norm": 30.101776123046875, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8595820665359497, + "num_tokens": 645520476.0, + "step": 16920 + }, + { + "epoch": 2.1525251240300216, + "ewc_loss": 0.04675846919417381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3379234335152432e-05, + "grad_norm": 30.075456619262695, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8558357954025269, + "num_tokens": 645561539.0, + "step": 16921 + }, + { + "epoch": 2.152652334308612, + "ewc_loss": 0.04667189344763756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3335946025326848e-05, + "grad_norm": 30.104122161865234, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8656455874443054, + "num_tokens": 645602950.0, + "step": 16922 + }, + { + "epoch": 2.1527795445872027, + "ewc_loss": 0.04676004871726036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.338002377655357e-05, + "grad_norm": 30.164011001586914, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8601303100585938, + "num_tokens": 645639215.0, + "step": 16923 + }, + { + "epoch": 
2.152906754865793, + "ewc_loss": 0.046674974262714386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333748670935165e-05, + "grad_norm": 30.071462631225586, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8688473701477051, + "num_tokens": 645677287.0, + "step": 16924 + }, + { + "epoch": 2.1530339651443837, + "ewc_loss": 0.04676974564790726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3384873202303424e-05, + "grad_norm": 30.2611141204834, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8771422505378723, + "num_tokens": 645710784.0, + "step": 16925 + }, + { + "epoch": 2.1531611754229742, + "ewc_loss": 0.04667079076170921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3335394871537574e-05, + "grad_norm": 30.017276763916016, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.864899218082428, + "num_tokens": 645750568.0, + "step": 16926 + }, + { + "epoch": 2.1532883857015648, + "ewc_loss": 0.04667586460709572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.333793236175552e-05, + "grad_norm": 30.232421875, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8742128014564514, + "num_tokens": 645783911.0, + "step": 16927 + }, + { + "epoch": 2.1534155959801553, + "ewc_loss": 0.04675031080842018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3375156160909683e-05, + "grad_norm": 30.088539123535156, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.886075496673584, + "num_tokens": 645822524.0, + "step": 16928 + }, + { + "epoch": 2.153542806258746, + "ewc_loss": 0.04667108878493309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3335544028668664e-05, + "grad_norm": 30.06546401977539, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8654474020004272, + "num_tokens": 645856924.0, + "step": 16929 + }, + { + "epoch": 2.1536700165373364, + "ewc_loss": 0.04675571992993355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337785917916335e-05, 
+ "grad_norm": 30.103473663330078, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8630260229110718, + "num_tokens": 645896995.0, + "step": 16930 + }, + { + "epoch": 2.153797226815927, + "ewc_loss": 0.046695079654455185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334754026378505e-05, + "grad_norm": 30.105457305908203, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8764193058013916, + "num_tokens": 645941768.0, + "step": 16931 + }, + { + "epoch": 2.1539244370945174, + "ewc_loss": 0.04675363376736641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337681689823512e-05, + "grad_norm": 30.050708770751953, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8673654198646545, + "num_tokens": 645981909.0, + "step": 16932 + }, + { + "epoch": 2.154051647373108, + "ewc_loss": 0.0467122420668602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3356120436801575e-05, + "grad_norm": 29.989961624145508, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8601633906364441, + "num_tokens": 646022034.0, + "step": 16933 + }, + { + "epoch": 2.154178857651698, + "ewc_loss": 0.04681480675935745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3407403205055743e-05, + "grad_norm": 30.18940544128418, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8677974939346313, + "num_tokens": 646056714.0, + "step": 16934 + }, + { + "epoch": 2.1543060679302886, + "ewc_loss": 0.04678642004728317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3393209630739875e-05, + "grad_norm": 30.015029907226562, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8611495494842529, + "num_tokens": 646095668.0, + "step": 16935 + }, + { + "epoch": 2.154433278208879, + "ewc_loss": 0.04674600809812546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3373004296445288e-05, + "grad_norm": 30.113971710205078, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 
0.8539519906044006, + "num_tokens": 646136745.0, + "step": 16936 + }, + { + "epoch": 2.1545604884874696, + "ewc_loss": 0.04683839529752731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3419197532348335e-05, + "grad_norm": 30.146053314208984, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8664761185646057, + "num_tokens": 646180314.0, + "step": 16937 + }, + { + "epoch": 2.15468769876606, + "ewc_loss": 0.04682065546512604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3410328140016645e-05, + "grad_norm": 30.124141693115234, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.872069239616394, + "num_tokens": 646218191.0, + "step": 16938 + }, + { + "epoch": 2.1548149090446507, + "ewc_loss": 0.04674487188458443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337243677175138e-05, + "grad_norm": 30.061372756958008, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8852080702781677, + "num_tokens": 646256350.0, + "step": 16939 + }, + { + "epoch": 2.154942119323241, + "ewc_loss": 0.04674924910068512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3374625016003847e-05, + "grad_norm": 30.165794372558594, + "learning_rate": 1e-06, + "loss": 0.3432, + "mean_token_accuracy": 0.892539381980896, + "num_tokens": 646290915.0, + "step": 16940 + }, + { + "epoch": 2.1550693296018317, + "ewc_loss": 0.04671443998813629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335722092539072e-05, + "grad_norm": 29.958417892456055, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8628020882606506, + "num_tokens": 646325838.0, + "step": 16941 + }, + { + "epoch": 2.1551965398804223, + "ewc_loss": 0.046725936233997345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3362968931905925e-05, + "grad_norm": 30.18780517578125, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8506633043289185, + "num_tokens": 646362742.0, + "step": 16942 + }, + { + "epoch": 2.155323750159013, + "ewc_loss": 
0.046823859214782715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3411928850691766e-05, + "grad_norm": 30.079450607299805, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8554881811141968, + "num_tokens": 646403958.0, + "step": 16943 + }, + { + "epoch": 2.1554509604376033, + "ewc_loss": 0.04664476960897446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3322385459323414e-05, + "grad_norm": 30.102615356445312, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.863574743270874, + "num_tokens": 646442872.0, + "step": 16944 + }, + { + "epoch": 2.155578170716194, + "ewc_loss": 0.04677628353238106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3388141926261596e-05, + "grad_norm": 30.11366081237793, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8758093118667603, + "num_tokens": 646484642.0, + "step": 16945 + }, + { + "epoch": 2.1557053809947844, + "ewc_loss": 0.04670574888586998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3352873540716246e-05, + "grad_norm": 30.064104080200195, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.878991425037384, + "num_tokens": 646521688.0, + "step": 16946 + }, + { + "epoch": 2.155832591273375, + "ewc_loss": 0.04673124849796295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3365624656435102e-05, + "grad_norm": 30.10881233215332, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8777444958686829, + "num_tokens": 646560182.0, + "step": 16947 + }, + { + "epoch": 2.1559598015519654, + "ewc_loss": 0.04667799174785614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3338996470556594e-05, + "grad_norm": 30.02307891845703, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.873731255531311, + "num_tokens": 646593252.0, + "step": 16948 + }, + { + "epoch": 2.156087011830556, + "ewc_loss": 0.04671686887741089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3358434191322885e-05, + "grad_norm": 
30.051071166992188, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8680901527404785, + "num_tokens": 646635514.0, + "step": 16949 + }, + { + "epoch": 2.1562142221091465, + "ewc_loss": 0.046807970851659775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340398532396648e-05, + "grad_norm": 30.11536979675293, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8571449518203735, + "num_tokens": 646674785.0, + "step": 16950 + }, + { + "epoch": 2.156341432387737, + "ewc_loss": 0.04684656858444214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3423284801538102e-05, + "grad_norm": 30.216053009033203, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8686498999595642, + "num_tokens": 646713832.0, + "step": 16951 + }, + { + "epoch": 2.1564686426663275, + "ewc_loss": 0.0467449314892292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3372465875581838e-05, + "grad_norm": 30.15416717529297, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8727168440818787, + "num_tokens": 646752218.0, + "step": 16952 + }, + { + "epoch": 2.156595852944918, + "ewc_loss": 0.046748846769332886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3374423108180054e-05, + "grad_norm": 30.10793113708496, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8687591552734375, + "num_tokens": 646786196.0, + "step": 16953 + }, + { + "epoch": 2.1567230632235086, + "ewc_loss": 0.04673685505986214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3368427719105966e-05, + "grad_norm": 30.10183334350586, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8623517751693726, + "num_tokens": 646821462.0, + "step": 16954 + }, + { + "epoch": 2.156850273502099, + "ewc_loss": 0.046838536858558655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3419268472935073e-05, + "grad_norm": 30.206499099731445, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8759809732437134, + 
"num_tokens": 646856613.0, + "step": 16955 + }, + { + "epoch": 2.1569774837806897, + "ewc_loss": 0.046787988394498825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3393993615172803e-05, + "grad_norm": 30.080642700195312, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8811261057853699, + "num_tokens": 646893013.0, + "step": 16956 + }, + { + "epoch": 2.15710469405928, + "ewc_loss": 0.0467635840177536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3381791834253818e-05, + "grad_norm": 30.091625213623047, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8749721050262451, + "num_tokens": 646936180.0, + "step": 16957 + }, + { + "epoch": 2.1572319043378707, + "ewc_loss": 0.04675512760877609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3377564502879977e-05, + "grad_norm": 30.12710189819336, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8692772388458252, + "num_tokens": 646979314.0, + "step": 16958 + }, + { + "epoch": 2.157359114616461, + "ewc_loss": 0.04680844023823738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3404219973599538e-05, + "grad_norm": 30.089733123779297, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8800345659255981, + "num_tokens": 647020136.0, + "step": 16959 + }, + { + "epoch": 2.1574863248950513, + "ewc_loss": 0.04671185836195946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3355929442914203e-05, + "grad_norm": 30.100385665893555, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8744847774505615, + "num_tokens": 647052970.0, + "step": 16960 + }, + { + "epoch": 2.157613535173642, + "ewc_loss": 0.04677683860063553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3388418412650935e-05, + "grad_norm": 30.06852149963379, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8632502555847168, + "num_tokens": 647093195.0, + "step": 16961 + }, + { + "epoch": 2.1577407454522324, + "ewc_loss": 
0.04676719382405281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3383596271742135e-05, + "grad_norm": 30.021774291992188, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8702398538589478, + "num_tokens": 647133800.0, + "step": 16962 + }, + { + "epoch": 2.157867955730823, + "ewc_loss": 0.04691941291093826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34597064263653e-05, + "grad_norm": 30.229434967041016, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.854590654373169, + "num_tokens": 647167704.0, + "step": 16963 + }, + { + "epoch": 2.1579951660094134, + "ewc_loss": 0.04683508723974228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.341754407098051e-05, + "grad_norm": 30.059310913085938, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8570199012756348, + "num_tokens": 647208474.0, + "step": 16964 + }, + { + "epoch": 2.158122376288004, + "ewc_loss": 0.04673248901963234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336624493182171e-05, + "grad_norm": 30.109949111938477, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8832789063453674, + "num_tokens": 647242131.0, + "step": 16965 + }, + { + "epoch": 2.1582495865665945, + "ewc_loss": 0.046804580837488174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3402290025842376e-05, + "grad_norm": 30.171743392944336, + "learning_rate": 1e-06, + "loss": 0.4738, + "mean_token_accuracy": 0.8500123620033264, + "num_tokens": 647280952.0, + "step": 16966 + }, + { + "epoch": 2.158376796845185, + "ewc_loss": 0.04672806337475777, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3364031221717596e-05, + "grad_norm": 30.09451675415039, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8648057579994202, + "num_tokens": 647318411.0, + "step": 16967 + }, + { + "epoch": 2.1585040071237755, + "ewc_loss": 0.04679753631353378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.339876846235711e-05, + "grad_norm": 
30.215898513793945, + "learning_rate": 1e-06, + "loss": 0.4864, + "mean_token_accuracy": 0.8561071157455444, + "num_tokens": 647361002.0, + "step": 16968 + }, + { + "epoch": 2.158631217402366, + "ewc_loss": 0.04673019051551819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.336509533051867e-05, + "grad_norm": 30.161231994628906, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8724913597106934, + "num_tokens": 647402995.0, + "step": 16969 + }, + { + "epoch": 2.1587584276809566, + "ewc_loss": 0.04678727686405182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3393638912239112e-05, + "grad_norm": 30.034364700317383, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.871913731098175, + "num_tokens": 647441220.0, + "step": 16970 + }, + { + "epoch": 2.158885637959547, + "ewc_loss": 0.04673224687576294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3366123059531674e-05, + "grad_norm": 30.150279998779297, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8616344928741455, + "num_tokens": 647476832.0, + "step": 16971 + }, + { + "epoch": 2.1590128482381377, + "ewc_loss": 0.04679201915860176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3396009055431932e-05, + "grad_norm": 30.08986473083496, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8722047805786133, + "num_tokens": 647516855.0, + "step": 16972 + }, + { + "epoch": 2.159140058516728, + "ewc_loss": 0.046729300171136856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.33646496781148e-05, + "grad_norm": 30.123544692993164, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.866254448890686, + "num_tokens": 647556913.0, + "step": 16973 + }, + { + "epoch": 2.1592672687953187, + "ewc_loss": 0.046765223145484924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3382612198474817e-05, + "grad_norm": 30.038619995117188, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8868891000747681, + 
"num_tokens": 647591242.0, + "step": 16974 + }, + { + "epoch": 2.1593944790739092, + "ewc_loss": 0.04676338657736778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3381693608826026e-05, + "grad_norm": 30.08575439453125, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8736254572868347, + "num_tokens": 647632156.0, + "step": 16975 + }, + { + "epoch": 2.1595216893524998, + "ewc_loss": 0.046865709125995636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3432854504790157e-05, + "grad_norm": 30.200210571289062, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8758894205093384, + "num_tokens": 647669857.0, + "step": 16976 + }, + { + "epoch": 2.1596488996310903, + "ewc_loss": 0.046796150505542755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.339807542739436e-05, + "grad_norm": 30.120763778686523, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8573848009109497, + "num_tokens": 647715580.0, + "step": 16977 + }, + { + "epoch": 2.159776109909681, + "ewc_loss": 0.04682459309697151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3412296286551282e-05, + "grad_norm": 30.247039794921875, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8615013360977173, + "num_tokens": 647755917.0, + "step": 16978 + }, + { + "epoch": 2.1599033201882714, + "ewc_loss": 0.04682512208819389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34125618590042e-05, + "grad_norm": 30.090608596801758, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8853533267974854, + "num_tokens": 647802205.0, + "step": 16979 + }, + { + "epoch": 2.160030530466862, + "ewc_loss": 0.046689223498106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334461169084534e-05, + "grad_norm": 30.184072494506836, + "learning_rate": 1e-06, + "loss": 0.471, + "mean_token_accuracy": 0.8558928966522217, + "num_tokens": 647846627.0, + "step": 16980 + }, + { + "epoch": 2.1601577407454524, + "ewc_loss": 0.04684000834822655, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3420003344654106e-05, + "grad_norm": 30.280094146728516, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8611852526664734, + "num_tokens": 647885033.0, + "step": 16981 + }, + { + "epoch": 2.160284951024043, + "ewc_loss": 0.046740565448999405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3370283088297583e-05, + "grad_norm": 30.10775375366211, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8589362502098083, + "num_tokens": 647923370.0, + "step": 16982 + }, + { + "epoch": 2.1604121613026335, + "ewc_loss": 0.04668687283992767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.334343662369065e-05, + "grad_norm": 30.189435958862305, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8752521872520447, + "num_tokens": 647963431.0, + "step": 16983 + }, + { + "epoch": 2.1605393715812236, + "ewc_loss": 0.046648409217596054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.332420444872696e-05, + "grad_norm": 29.933176040649414, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8615051507949829, + "num_tokens": 648002854.0, + "step": 16984 + }, + { + "epoch": 2.160666581859814, + "ewc_loss": 0.04667507857084274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3337539460044354e-05, + "grad_norm": 30.191547393798828, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8573111295700073, + "num_tokens": 648041082.0, + "step": 16985 + }, + { + "epoch": 2.1607937921384046, + "ewc_loss": 0.04680296778678894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3401484213536605e-05, + "grad_norm": 30.049015045166016, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8612220883369446, + "num_tokens": 648082109.0, + "step": 16986 + }, + { + "epoch": 2.160921002416995, + "ewc_loss": 0.046695515513420105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3347758542513475e-05, + "grad_norm": 30.077131271362305, + 
"learning_rate": 1e-06, + "loss": 0.3706, + "mean_token_accuracy": 0.8838224411010742, + "num_tokens": 648117398.0, + "step": 16987 + }, + { + "epoch": 2.1610482126955857, + "ewc_loss": 0.046752698719501495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337634941795841e-05, + "grad_norm": 30.02103614807129, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.876934826374054, + "num_tokens": 648158997.0, + "step": 16988 + }, + { + "epoch": 2.161175422974176, + "ewc_loss": 0.04675542563199997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3377713660011068e-05, + "grad_norm": 30.167001724243164, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8661991357803345, + "num_tokens": 648194334.0, + "step": 16989 + }, + { + "epoch": 2.1613026332527667, + "ewc_loss": 0.04674907028675079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3374535885523073e-05, + "grad_norm": 30.0539493560791, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.865143895149231, + "num_tokens": 648230601.0, + "step": 16990 + }, + { + "epoch": 2.1614298435313573, + "ewc_loss": 0.04676886647939682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3384433006867766e-05, + "grad_norm": 29.97992515563965, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8633914589881897, + "num_tokens": 648274979.0, + "step": 16991 + }, + { + "epoch": 2.161557053809948, + "ewc_loss": 0.04675725847482681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.337862861168105e-05, + "grad_norm": 30.207622528076172, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8537256717681885, + "num_tokens": 648319229.0, + "step": 16992 + }, + { + "epoch": 2.1616842640885383, + "ewc_loss": 0.04690195992588997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.345098073419649e-05, + "grad_norm": 30.19149398803711, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8807967305183411, + "num_tokens": 648354550.0, + 
"step": 16993 + }, + { + "epoch": 2.161811474367129, + "ewc_loss": 0.04670427367091179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.335213685000781e-05, + "grad_norm": 30.08268928527832, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8813658952713013, + "num_tokens": 648389820.0, + "step": 16994 + }, + { + "epoch": 2.1619386846457194, + "ewc_loss": 0.04677126184105873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3385631720884703e-05, + "grad_norm": 30.07489776611328, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8840948939323425, + "num_tokens": 648423841.0, + "step": 16995 + }, + { + "epoch": 2.16206589492431, + "ewc_loss": 0.04687504097819328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3437520212610252e-05, + "grad_norm": 30.248119354248047, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8559192419052124, + "num_tokens": 648459865.0, + "step": 16996 + }, + { + "epoch": 2.1621931052029004, + "ewc_loss": 0.046804700046777725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3402350052492693e-05, + "grad_norm": 30.115440368652344, + "learning_rate": 1e-06, + "loss": 0.3587, + "mean_token_accuracy": 0.8881845474243164, + "num_tokens": 648492639.0, + "step": 16997 + }, + { + "epoch": 2.162320315481491, + "ewc_loss": 0.04666512832045555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3332564524025656e-05, + "grad_norm": 30.12868309020996, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8751279711723328, + "num_tokens": 648533456.0, + "step": 16998 + }, + { + "epoch": 2.1624475257600815, + "ewc_loss": 0.04678324609994888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3391623471979983e-05, + "grad_norm": 30.251773834228516, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8835671544075012, + "num_tokens": 648571955.0, + "step": 16999 + }, + { + "epoch": 2.162574736038672, + "ewc_loss": 0.04670185223221779, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.3350925403065048e-05, + "grad_norm": 30.098630905151367, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8866071701049805, + "num_tokens": 648614284.0, + "step": 17000 + }, + { + "epoch": 2.1627019463172625, + "ewc_loss": 0.04661160707473755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.330580355192069e-05, + "grad_norm": 30.16583251953125, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.87892085313797, + "num_tokens": 648654399.0, + "step": 17001 + }, + { + "epoch": 2.162829156595853, + "ewc_loss": 0.04678718373179436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.339359161851462e-05, + "grad_norm": 30.12236976623535, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8727513551712036, + "num_tokens": 648694490.0, + "step": 17002 + }, + { + "epoch": 2.1629563668744436, + "ewc_loss": 0.046619515866041183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3309758034884e-05, + "grad_norm": 30.041240692138672, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8754980564117432, + "num_tokens": 648730665.0, + "step": 17003 + }, + { + "epoch": 2.163083577153034, + "ewc_loss": 0.04674828052520752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.33741393458331e-05, + "grad_norm": 30.11580467224121, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.874277651309967, + "num_tokens": 648765289.0, + "step": 17004 + }, + { + "epoch": 2.1632107874316246, + "ewc_loss": 0.046730779111385345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3365390006802045e-05, + "grad_norm": 30.113393783569336, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8701225519180298, + "num_tokens": 648803982.0, + "step": 17005 + }, + { + "epoch": 2.163337997710215, + "ewc_loss": 0.046799641102552414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.339981983823236e-05, + "grad_norm": 30.0891170501709, + "learning_rate": 1e-06, + "loss": 0.4042, + 
"mean_token_accuracy": 0.8759058713912964, + "num_tokens": 648840992.0, + "step": 17006 + }, + { + "epoch": 2.1634652079888053, + "ewc_loss": 0.046819452196359634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340972605452407e-05, + "grad_norm": 30.162710189819336, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8660101890563965, + "num_tokens": 648883621.0, + "step": 17007 + }, + { + "epoch": 2.1635924182673962, + "ewc_loss": 0.04677392914891243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3386965040117502e-05, + "grad_norm": 30.052078247070312, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8554023504257202, + "num_tokens": 648922258.0, + "step": 17008 + }, + { + "epoch": 2.1637196285459863, + "ewc_loss": 0.04680940881371498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340470382478088e-05, + "grad_norm": 30.159461975097656, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8687928915023804, + "num_tokens": 648965010.0, + "step": 17009 + }, + { + "epoch": 2.163846838824577, + "ewc_loss": 0.04681256413459778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340628270758316e-05, + "grad_norm": 30.093456268310547, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.864969789981842, + "num_tokens": 649008124.0, + "step": 17010 + }, + { + "epoch": 2.1639740491031674, + "ewc_loss": 0.046749770641326904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3374885131488554e-05, + "grad_norm": 30.215797424316406, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8690399527549744, + "num_tokens": 649044743.0, + "step": 17011 + }, + { + "epoch": 2.164101259381758, + "ewc_loss": 0.04680495336651802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340247738175094e-05, + "grad_norm": 29.96617889404297, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8830254077911377, + "num_tokens": 649078410.0, + "step": 17012 + }, + { + "epoch": 
2.1642284696603484, + "ewc_loss": 0.04675953835248947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3379769118037075e-05, + "grad_norm": 30.11140251159668, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8801941275596619, + "num_tokens": 649115415.0, + "step": 17013 + }, + { + "epoch": 2.164355679938939, + "ewc_loss": 0.04687187448143959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3435937691829167e-05, + "grad_norm": 30.109785079956055, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8677197098731995, + "num_tokens": 649156266.0, + "step": 17014 + }, + { + "epoch": 2.1644828902175295, + "ewc_loss": 0.046849433332681656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3424716346198693e-05, + "grad_norm": 30.069400787353516, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8753980398178101, + "num_tokens": 649192552.0, + "step": 17015 + }, + { + "epoch": 2.16461010049612, + "ewc_loss": 0.04677760601043701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3388802219415084e-05, + "grad_norm": 30.012117385864258, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8722594976425171, + "num_tokens": 649224163.0, + "step": 17016 + }, + { + "epoch": 2.1647373107747105, + "ewc_loss": 0.04684433341026306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.342216612305492e-05, + "grad_norm": 30.165660858154297, + "learning_rate": 1e-06, + "loss": 0.354, + "mean_token_accuracy": 0.8893043994903564, + "num_tokens": 649259348.0, + "step": 17017 + }, + { + "epoch": 2.164864521053301, + "ewc_loss": 0.04686230048537254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3431150111719035e-05, + "grad_norm": 29.932645797729492, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8846135139465332, + "num_tokens": 649296875.0, + "step": 17018 + }, + { + "epoch": 2.1649917313318916, + "ewc_loss": 0.04684063419699669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3420316210831515e-05, + "grad_norm": 30.045124053955078, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8587528467178345, + "num_tokens": 649333455.0, + "step": 17019 + }, + { + "epoch": 2.165118941610482, + "ewc_loss": 0.0469813346862793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3490667445003055e-05, + "grad_norm": 30.09449005126953, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.870420515537262, + "num_tokens": 649364567.0, + "step": 17020 + }, + { + "epoch": 2.1652461518890727, + "ewc_loss": 0.04683893546462059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3419468561769463e-05, + "grad_norm": 29.934471130371094, + "learning_rate": 1e-06, + "loss": 0.4867, + "mean_token_accuracy": 0.8523082137107849, + "num_tokens": 649400579.0, + "step": 17021 + }, + { + "epoch": 2.165373362167663, + "ewc_loss": 0.04701275750994682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.350637805648148e-05, + "grad_norm": 30.205913543701172, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8822712898254395, + "num_tokens": 649432940.0, + "step": 17022 + }, + { + "epoch": 2.1655005724462537, + "ewc_loss": 0.04703962802886963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351981311221607e-05, + "grad_norm": 30.077486038208008, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8744475245475769, + "num_tokens": 649466695.0, + "step": 17023 + }, + { + "epoch": 2.1656277827248442, + "ewc_loss": 0.04691769555211067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3458847863366827e-05, + "grad_norm": 30.08293914794922, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8677818179130554, + "num_tokens": 649504578.0, + "step": 17024 + }, + { + "epoch": 2.1657549930034348, + "ewc_loss": 0.04698215052485466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349107489862945e-05, + "grad_norm": 30.154176712036133, + "learning_rate": 1e-06, + "loss": 0.4583, + 
"mean_token_accuracy": 0.8601875901222229, + "num_tokens": 649541536.0, + "step": 17025 + }, + { + "epoch": 2.1658822032820253, + "ewc_loss": 0.04694805294275284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3474027329939418e-05, + "grad_norm": 30.00629425048828, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8682133555412292, + "num_tokens": 649581280.0, + "step": 17026 + }, + { + "epoch": 2.166009413560616, + "ewc_loss": 0.04700079932808876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3500399038312025e-05, + "grad_norm": 30.056564331054688, + "learning_rate": 1e-06, + "loss": 0.3608, + "mean_token_accuracy": 0.8913686275482178, + "num_tokens": 649619837.0, + "step": 17027 + }, + { + "epoch": 2.1661366238392064, + "ewc_loss": 0.04696609079837799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3483045879402198e-05, + "grad_norm": 30.140417098999023, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8817877769470215, + "num_tokens": 649654708.0, + "step": 17028 + }, + { + "epoch": 2.166263834117797, + "ewc_loss": 0.047003090381622314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.350154500163626e-05, + "grad_norm": 30.010522842407227, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8496625423431396, + "num_tokens": 649691709.0, + "step": 17029 + }, + { + "epoch": 2.1663910443963874, + "ewc_loss": 0.046903494745492935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3451746528735384e-05, + "grad_norm": 30.101011276245117, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.866630494594574, + "num_tokens": 649734876.0, + "step": 17030 + }, + { + "epoch": 2.166518254674978, + "ewc_loss": 0.04701085388660431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3505426725023426e-05, + "grad_norm": 30.159351348876953, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8800090551376343, + "num_tokens": 649775477.0, + "step": 17031 + }, + { + "epoch": 
2.166645464953568, + "ewc_loss": 0.046939462423324585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3469730876968242e-05, + "grad_norm": 30.210302352905273, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8813945055007935, + "num_tokens": 649816136.0, + "step": 17032 + }, + { + "epoch": 2.1667726752321586, + "ewc_loss": 0.04690911993384361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.345456050534267e-05, + "grad_norm": 30.13871955871582, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8798781633377075, + "num_tokens": 649847942.0, + "step": 17033 + }, + { + "epoch": 2.166899885510749, + "ewc_loss": 0.04693568870425224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3467844584956765e-05, + "grad_norm": 30.129987716674805, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8669269680976868, + "num_tokens": 649887875.0, + "step": 17034 + }, + { + "epoch": 2.1670270957893396, + "ewc_loss": 0.046938247978687286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3469123334507458e-05, + "grad_norm": 30.144393920898438, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8569761514663696, + "num_tokens": 649920557.0, + "step": 17035 + }, + { + "epoch": 2.16715430606793, + "ewc_loss": 0.04696004092693329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34800209000241e-05, + "grad_norm": 30.1723575592041, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8785912394523621, + "num_tokens": 649959704.0, + "step": 17036 + }, + { + "epoch": 2.1672815163465207, + "ewc_loss": 0.04694266617298126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3471333406632766e-05, + "grad_norm": 30.236873626708984, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8752874135971069, + "num_tokens": 650000433.0, + "step": 17037 + }, + { + "epoch": 2.167408726625111, + "ewc_loss": 0.04690352827310562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.345176471862942e-05, + "grad_norm": 30.114665985107422, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8829610347747803, + "num_tokens": 650038793.0, + "step": 17038 + }, + { + "epoch": 2.1675359369037017, + "ewc_loss": 0.0469118133187294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3455906557501294e-05, + "grad_norm": 30.186471939086914, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8792581558227539, + "num_tokens": 650072877.0, + "step": 17039 + }, + { + "epoch": 2.1676631471822922, + "ewc_loss": 0.046880897134542465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.344044878554996e-05, + "grad_norm": 30.00818634033203, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8591164350509644, + "num_tokens": 650109245.0, + "step": 17040 + }, + { + "epoch": 2.1677903574608828, + "ewc_loss": 0.04695143550634384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347571717109531e-05, + "grad_norm": 30.169254302978516, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8830382823944092, + "num_tokens": 650150254.0, + "step": 17041 + }, + { + "epoch": 2.1679175677394733, + "ewc_loss": 0.046984247863292694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3492124455515295e-05, + "grad_norm": 30.144765853881836, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8506891131401062, + "num_tokens": 650189587.0, + "step": 17042 + }, + { + "epoch": 2.168044778018064, + "ewc_loss": 0.046846967190504074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.342348307138309e-05, + "grad_norm": 30.117483139038086, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8669589757919312, + "num_tokens": 650225443.0, + "step": 17043 + }, + { + "epoch": 2.1681719882966544, + "ewc_loss": 0.046872612088918686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3436305127688684e-05, + "grad_norm": 30.084623336791992, + "learning_rate": 1e-06, + "loss": 0.4385, + 
"mean_token_accuracy": 0.8668135404586792, + "num_tokens": 650262319.0, + "step": 17044 + }, + { + "epoch": 2.168299198575245, + "ewc_loss": 0.0468713715672493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3435686671291478e-05, + "grad_norm": 30.096660614013672, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8547502160072327, + "num_tokens": 650297137.0, + "step": 17045 + }, + { + "epoch": 2.1684264088538354, + "ewc_loss": 0.04694444313645363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3472221073461697e-05, + "grad_norm": 30.13326644897461, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.868499755859375, + "num_tokens": 650335521.0, + "step": 17046 + }, + { + "epoch": 2.168553619132426, + "ewc_loss": 0.04693111032247543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34655544772977e-05, + "grad_norm": 30.13332748413086, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8647468090057373, + "num_tokens": 650373050.0, + "step": 17047 + }, + { + "epoch": 2.1686808294110165, + "ewc_loss": 0.04686969518661499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3434848117176443e-05, + "grad_norm": 30.28143882751465, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8772765398025513, + "num_tokens": 650407438.0, + "step": 17048 + }, + { + "epoch": 2.168808039689607, + "ewc_loss": 0.046873047947883606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.343652340641711e-05, + "grad_norm": 30.05936622619629, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8655304908752441, + "num_tokens": 650447793.0, + "step": 17049 + }, + { + "epoch": 2.1689352499681975, + "ewc_loss": 0.04670010134577751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3350050469161943e-05, + "grad_norm": 29.987037658691406, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8666049242019653, + "num_tokens": 650487167.0, + "step": 17050 + }, + { + "epoch": 
2.169062460246788, + "ewc_loss": 0.04695628955960274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3478145521949045e-05, + "grad_norm": 30.2169132232666, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.858120322227478, + "num_tokens": 650522960.0, + "step": 17051 + }, + { + "epoch": 2.1691896705253786, + "ewc_loss": 0.04688180610537529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3440903532900847e-05, + "grad_norm": 30.11073875427246, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8647478222846985, + "num_tokens": 650564161.0, + "step": 17052 + }, + { + "epoch": 2.169316880803969, + "ewc_loss": 0.046786513179540634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3393256924464367e-05, + "grad_norm": 30.21116065979004, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8768715858459473, + "num_tokens": 650599201.0, + "step": 17053 + }, + { + "epoch": 2.1694440910825596, + "ewc_loss": 0.04679436236619949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3397180484607816e-05, + "grad_norm": 30.044076919555664, + "learning_rate": 1e-06, + "loss": 0.4748, + "mean_token_accuracy": 0.851038932800293, + "num_tokens": 650637385.0, + "step": 17054 + }, + { + "epoch": 2.16957130136115, + "ewc_loss": 0.04685797169804573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3428985514328815e-05, + "grad_norm": 30.09061622619629, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8697271943092346, + "num_tokens": 650671954.0, + "step": 17055 + }, + { + "epoch": 2.1696985116397407, + "ewc_loss": 0.04693371430039406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.346685687371064e-05, + "grad_norm": 30.111125946044922, + "learning_rate": 1e-06, + "loss": 0.3649, + "mean_token_accuracy": 0.8878486156463623, + "num_tokens": 650708475.0, + "step": 17056 + }, + { + "epoch": 2.169825721918331, + "ewc_loss": 0.04685976728796959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3429884095094167e-05, + "grad_norm": 30.140748977661133, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8741523623466492, + "num_tokens": 650748089.0, + "step": 17057 + }, + { + "epoch": 2.1699529321969213, + "ewc_loss": 0.04694958031177521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3474789486499503e-05, + "grad_norm": 30.327594757080078, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8725946545600891, + "num_tokens": 650790869.0, + "step": 17058 + }, + { + "epoch": 2.170080142475512, + "ewc_loss": 0.04677710309624672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3388551198877394e-05, + "grad_norm": 29.930438995361328, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8579825162887573, + "num_tokens": 650830697.0, + "step": 17059 + }, + { + "epoch": 2.1702073527541024, + "ewc_loss": 0.046772271394729614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3386135580949485e-05, + "grad_norm": 30.214569091796875, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8793624639511108, + "num_tokens": 650868755.0, + "step": 17060 + }, + { + "epoch": 2.170334563032693, + "ewc_loss": 0.046924494206905365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3462247554562055e-05, + "grad_norm": 30.107080459594727, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8812834024429321, + "num_tokens": 650908019.0, + "step": 17061 + }, + { + "epoch": 2.1704617733112834, + "ewc_loss": 0.046801310032606125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340065475436859e-05, + "grad_norm": 30.086933135986328, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8686288595199585, + "num_tokens": 650954195.0, + "step": 17062 + }, + { + "epoch": 2.170588983589874, + "ewc_loss": 0.04693718999624252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.346859582758043e-05, + "grad_norm": 30.1372013092041, + "learning_rate": 1e-06, + "loss": 0.3865, + 
"mean_token_accuracy": 0.8830809593200684, + "num_tokens": 650992740.0, + "step": 17063 + }, + { + "epoch": 2.1707161938684645, + "ewc_loss": 0.04685397073626518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3426984625984915e-05, + "grad_norm": 30.09264373779297, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8837581872940063, + "num_tokens": 651031524.0, + "step": 17064 + }, + { + "epoch": 2.170843404147055, + "ewc_loss": 0.04694122076034546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347061126783956e-05, + "grad_norm": 30.092182159423828, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.869488537311554, + "num_tokens": 651070004.0, + "step": 17065 + }, + { + "epoch": 2.1709706144256455, + "ewc_loss": 0.046869099140167236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3434549802914262e-05, + "grad_norm": 30.174488067626953, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8565233945846558, + "num_tokens": 651104474.0, + "step": 17066 + }, + { + "epoch": 2.171097824704236, + "ewc_loss": 0.04691283404827118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.345641769352369e-05, + "grad_norm": 30.077741622924805, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8632782101631165, + "num_tokens": 651148913.0, + "step": 17067 + }, + { + "epoch": 2.1712250349828266, + "ewc_loss": 0.04681192710995674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.340596438443754e-05, + "grad_norm": 30.14503288269043, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8784008026123047, + "num_tokens": 651192072.0, + "step": 17068 + }, + { + "epoch": 2.171352245261417, + "ewc_loss": 0.04693935066461563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3469674488296732e-05, + "grad_norm": 30.146238327026367, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8617523908615112, + "num_tokens": 651226132.0, + "step": 17069 + }, + { + "epoch": 
2.1714794555400077, + "ewc_loss": 0.046785276383161545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3392638468067162e-05, + "grad_norm": 30.16840934753418, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.872159481048584, + "num_tokens": 651263467.0, + "step": 17070 + }, + { + "epoch": 2.171606665818598, + "ewc_loss": 0.046884626150131226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3442313249688596e-05, + "grad_norm": 30.114282608032227, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8626169562339783, + "num_tokens": 651309521.0, + "step": 17071 + }, + { + "epoch": 2.1717338760971887, + "ewc_loss": 0.046847935765981674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3423968741553836e-05, + "grad_norm": 30.185527801513672, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8731123208999634, + "num_tokens": 651347026.0, + "step": 17072 + }, + { + "epoch": 2.1718610863757792, + "ewc_loss": 0.0468301922082901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.341509571124334e-05, + "grad_norm": 30.1825008392334, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8783851861953735, + "num_tokens": 651380545.0, + "step": 17073 + }, + { + "epoch": 2.1719882966543698, + "ewc_loss": 0.046847037971019745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.342351945117116e-05, + "grad_norm": 30.108816146850586, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8764278888702393, + "num_tokens": 651409428.0, + "step": 17074 + }, + { + "epoch": 2.1721155069329603, + "ewc_loss": 0.046774864196777344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3387432520394213e-05, + "grad_norm": 30.130767822265625, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8576414585113525, + "num_tokens": 651447993.0, + "step": 17075 + }, + { + "epoch": 2.172242717211551, + "ewc_loss": 0.04689852520823479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.344926178921014e-05, + "grad_norm": 30.1697940826416, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8649678230285645, + "num_tokens": 651488210.0, + "step": 17076 + }, + { + "epoch": 2.1723699274901414, + "ewc_loss": 0.04691203311085701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3456015696865506e-05, + "grad_norm": 30.1806640625, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8662171959877014, + "num_tokens": 651525742.0, + "step": 17077 + }, + { + "epoch": 2.172497137768732, + "ewc_loss": 0.0467657595872879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3382879589917138e-05, + "grad_norm": 30.044111251831055, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8765789270401001, + "num_tokens": 651565569.0, + "step": 17078 + }, + { + "epoch": 2.1726243480473224, + "ewc_loss": 0.04690433293581009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3452166715287603e-05, + "grad_norm": 30.124784469604492, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8652566075325012, + "num_tokens": 651605291.0, + "step": 17079 + }, + { + "epoch": 2.172751558325913, + "ewc_loss": 0.04692709445953369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.346354813198559e-05, + "grad_norm": 30.05595588684082, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8712858557701111, + "num_tokens": 651641998.0, + "step": 17080 + }, + { + "epoch": 2.1728787686045035, + "ewc_loss": 0.04684509336948395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3422546291840263e-05, + "grad_norm": 30.00807762145996, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8678399324417114, + "num_tokens": 651683040.0, + "step": 17081 + }, + { + "epoch": 2.1730059788830935, + "ewc_loss": 0.0469062402844429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.345311986573506e-05, + "grad_norm": 30.168834686279297, + "learning_rate": 1e-06, + "loss": 0.3907, + 
"mean_token_accuracy": 0.8793231844902039, + "num_tokens": 651718067.0, + "step": 17082 + }, + { + "epoch": 2.173133189161684, + "ewc_loss": 0.04702291265130043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351145667489618e-05, + "grad_norm": 30.008935928344727, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8851528167724609, + "num_tokens": 651761995.0, + "step": 17083 + }, + { + "epoch": 2.1732603994402746, + "ewc_loss": 0.04691403731703758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.345701796002686e-05, + "grad_norm": 30.140535354614258, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8835279941558838, + "num_tokens": 651797780.0, + "step": 17084 + }, + { + "epoch": 2.173387609718865, + "ewc_loss": 0.047106191515922546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.355309516133275e-05, + "grad_norm": 30.150997161865234, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8597790598869324, + "num_tokens": 651832931.0, + "step": 17085 + }, + { + "epoch": 2.1735148199974557, + "ewc_loss": 0.04691779240965843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3458896976080723e-05, + "grad_norm": 30.15808868408203, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8652485609054565, + "num_tokens": 651868440.0, + "step": 17086 + }, + { + "epoch": 2.173642030276046, + "ewc_loss": 0.046990927308797836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3495464120060205e-05, + "grad_norm": 30.10708236694336, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.873570442199707, + "num_tokens": 651900029.0, + "step": 17087 + }, + { + "epoch": 2.1737692405546367, + "ewc_loss": 0.04695657268166542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347828558413312e-05, + "grad_norm": 30.1773624420166, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8670194745063782, + "num_tokens": 651944892.0, + "step": 17088 + }, + { + "epoch": 
2.1738964508332272, + "ewc_loss": 0.04696393758058548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34819690376753e-05, + "grad_norm": 30.09925651550293, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.878424346446991, + "num_tokens": 651979330.0, + "step": 17089 + }, + { + "epoch": 2.1740236611118178, + "ewc_loss": 0.04694866016507149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3474329282180406e-05, + "grad_norm": 30.10262680053711, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8617846965789795, + "num_tokens": 652017389.0, + "step": 17090 + }, + { + "epoch": 2.1741508713904083, + "ewc_loss": 0.046964578330516815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3482289179810323e-05, + "grad_norm": 30.063344955444336, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.866574227809906, + "num_tokens": 652052260.0, + "step": 17091 + }, + { + "epoch": 2.174278081668999, + "ewc_loss": 0.04693102464079857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.346551264054142e-05, + "grad_norm": 29.989835739135742, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.85608971118927, + "num_tokens": 652089917.0, + "step": 17092 + }, + { + "epoch": 2.1744052919475894, + "ewc_loss": 0.04703250899910927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3516255168942735e-05, + "grad_norm": 30.14375877380371, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8738560676574707, + "num_tokens": 652129849.0, + "step": 17093 + }, + { + "epoch": 2.17453250222618, + "ewc_loss": 0.04703547805547714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3517739464296028e-05, + "grad_norm": 30.11760139465332, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8620914816856384, + "num_tokens": 652163178.0, + "step": 17094 + }, + { + "epoch": 2.1746597125047704, + "ewc_loss": 0.04697947949171066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3489739760407247e-05, + "grad_norm": 30.131534576416016, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8767445683479309, + "num_tokens": 652201521.0, + "step": 17095 + }, + { + "epoch": 2.174786922783361, + "ewc_loss": 0.04698009043931961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3490045350627042e-05, + "grad_norm": 30.054048538208008, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8703189492225647, + "num_tokens": 652234820.0, + "step": 17096 + }, + { + "epoch": 2.1749141330619515, + "ewc_loss": 0.04706454649567604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353227318963036e-05, + "grad_norm": 30.119020462036133, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8670557737350464, + "num_tokens": 652275686.0, + "step": 17097 + }, + { + "epoch": 2.175041343340542, + "ewc_loss": 0.047005146741867065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3502572730649263e-05, + "grad_norm": 30.078248977661133, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8617361187934875, + "num_tokens": 652313243.0, + "step": 17098 + }, + { + "epoch": 2.1751685536191325, + "ewc_loss": 0.0470714196562767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3535709260613658e-05, + "grad_norm": 30.147668838500977, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8853935599327087, + "num_tokens": 652351550.0, + "step": 17099 + }, + { + "epoch": 2.175295763897723, + "ewc_loss": 0.04701392725110054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3506963771069422e-05, + "grad_norm": 30.026412963867188, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8672138452529907, + "num_tokens": 652399834.0, + "step": 17100 + }, + { + "epoch": 2.1754229741763136, + "ewc_loss": 0.046922165900468826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3461083401343785e-05, + "grad_norm": 30.064891815185547, + "learning_rate": 1e-06, + "loss": 0.4697, + 
"mean_token_accuracy": 0.8572344183921814, + "num_tokens": 652435839.0, + "step": 17101 + }, + { + "epoch": 2.175550184454904, + "ewc_loss": 0.04703537002205849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351768489461392e-05, + "grad_norm": 30.045259475708008, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8664608597755432, + "num_tokens": 652475747.0, + "step": 17102 + }, + { + "epoch": 2.1756773947334946, + "ewc_loss": 0.047076962888240814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353848140046466e-05, + "grad_norm": 30.19049644470215, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8828847408294678, + "num_tokens": 652517147.0, + "step": 17103 + }, + { + "epoch": 2.175804605012085, + "ewc_loss": 0.047043733298778534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3521866751252674e-05, + "grad_norm": 30.08133316040039, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8820425271987915, + "num_tokens": 652552811.0, + "step": 17104 + }, + { + "epoch": 2.1759318152906753, + "ewc_loss": 0.046995609998703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349780515942257e-05, + "grad_norm": 30.12065315246582, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8779422044754028, + "num_tokens": 652587709.0, + "step": 17105 + }, + { + "epoch": 2.1760590255692662, + "ewc_loss": 0.04699172452092171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3495862478739582e-05, + "grad_norm": 30.057424545288086, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8692203760147095, + "num_tokens": 652626213.0, + "step": 17106 + }, + { + "epoch": 2.1761862358478563, + "ewc_loss": 0.0469973050057888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349865280848462e-05, + "grad_norm": 30.0825138092041, + "learning_rate": 1e-06, + "loss": 0.5192, + "mean_token_accuracy": 0.84100341796875, + "num_tokens": 652665692.0, + "step": 17107 + }, + { + "epoch": 
2.176313446126447, + "ewc_loss": 0.046952106058597565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3476053684134968e-05, + "grad_norm": 30.071044921875, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8625745177268982, + "num_tokens": 652705515.0, + "step": 17108 + }, + { + "epoch": 2.1764406564050374, + "ewc_loss": 0.04708376154303551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354188109165989e-05, + "grad_norm": 30.057750701904297, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8826866149902344, + "num_tokens": 652743343.0, + "step": 17109 + }, + { + "epoch": 2.176567866683628, + "ewc_loss": 0.046926505863666534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3463253455702215e-05, + "grad_norm": 29.97591781616211, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8645814657211304, + "num_tokens": 652784587.0, + "step": 17110 + }, + { + "epoch": 2.1766950769622184, + "ewc_loss": 0.04702138528227806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351069269934669e-05, + "grad_norm": 30.064197540283203, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8613961338996887, + "num_tokens": 652821985.0, + "step": 17111 + }, + { + "epoch": 2.176822287240809, + "ewc_loss": 0.047025978565216064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351299008296337e-05, + "grad_norm": 30.130573272705078, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8799570798873901, + "num_tokens": 652855772.0, + "step": 17112 + }, + { + "epoch": 2.1769494975193995, + "ewc_loss": 0.04709675908088684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3548380340798758e-05, + "grad_norm": 30.209074020385742, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8653626441955566, + "num_tokens": 652890497.0, + "step": 17113 + }, + { + "epoch": 2.17707670779799, + "ewc_loss": 0.046997055411338806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3498527298215777e-05, + "grad_norm": 30.061630249023438, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8806434869766235, + "num_tokens": 652931611.0, + "step": 17114 + }, + { + "epoch": 2.1772039180765805, + "ewc_loss": 0.047030434012413025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351521652599331e-05, + "grad_norm": 30.244369506835938, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8707665205001831, + "num_tokens": 652972129.0, + "step": 17115 + }, + { + "epoch": 2.177331128355171, + "ewc_loss": 0.047006756067276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3503378542955033e-05, + "grad_norm": 30.2553768157959, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8635618686676025, + "num_tokens": 653009848.0, + "step": 17116 + }, + { + "epoch": 2.1774583386337616, + "ewc_loss": 0.04700808972120285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.350404429307673e-05, + "grad_norm": 30.228425979614258, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8672878742218018, + "num_tokens": 653044614.0, + "step": 17117 + }, + { + "epoch": 2.177585548912352, + "ewc_loss": 0.04698392376303673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349196256545838e-05, + "grad_norm": 30.192869186401367, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8829177618026733, + "num_tokens": 653087614.0, + "step": 17118 + }, + { + "epoch": 2.1777127591909426, + "ewc_loss": 0.04692860320210457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3464301193598658e-05, + "grad_norm": 30.134632110595703, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8835550546646118, + "num_tokens": 653120434.0, + "step": 17119 + }, + { + "epoch": 2.177839969469533, + "ewc_loss": 0.04691912233829498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3459560907213017e-05, + "grad_norm": 30.163061141967773, + "learning_rate": 1e-06, + "loss": 0.3933, + 
"mean_token_accuracy": 0.8805058598518372, + "num_tokens": 653159390.0, + "step": 17120 + }, + { + "epoch": 2.1779671797481237, + "ewc_loss": 0.04697452858090401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.348726411582902e-05, + "grad_norm": 30.132104873657227, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8623044490814209, + "num_tokens": 653201045.0, + "step": 17121 + }, + { + "epoch": 2.1780943900267142, + "ewc_loss": 0.046963904052972794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3481952666770667e-05, + "grad_norm": 30.19869041442871, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8738274574279785, + "num_tokens": 653244968.0, + "step": 17122 + }, + { + "epoch": 2.1782216003053048, + "ewc_loss": 0.04703674837946892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3518374291597866e-05, + "grad_norm": 30.240352630615234, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8631529211997986, + "num_tokens": 653281157.0, + "step": 17123 + }, + { + "epoch": 2.1783488105838953, + "ewc_loss": 0.046953003853559494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347650115552824e-05, + "grad_norm": 30.186527252197266, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8842324018478394, + "num_tokens": 653321055.0, + "step": 17124 + }, + { + "epoch": 2.178476020862486, + "ewc_loss": 0.04694720730185509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347360350540839e-05, + "grad_norm": 30.197778701782227, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 0.8578680753707886, + "num_tokens": 653359320.0, + "step": 17125 + }, + { + "epoch": 2.1786032311410763, + "ewc_loss": 0.04692446440458298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3462233002646826e-05, + "grad_norm": 30.09877586364746, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8903224468231201, + "num_tokens": 653404186.0, + "step": 17126 + }, + { + "epoch": 
2.178730441419667, + "ewc_loss": 0.04693460091948509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3467300707125105e-05, + "grad_norm": 30.184377670288086, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8808239698410034, + "num_tokens": 653438122.0, + "step": 17127 + }, + { + "epoch": 2.1788576516982574, + "ewc_loss": 0.04694712162017822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347356166865211e-05, + "grad_norm": 30.08316421508789, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8687125444412231, + "num_tokens": 653474727.0, + "step": 17128 + }, + { + "epoch": 2.178984861976848, + "ewc_loss": 0.047025635838508606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3512817278970033e-05, + "grad_norm": 30.229076385498047, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8759186267852783, + "num_tokens": 653513531.0, + "step": 17129 + }, + { + "epoch": 2.179112072255438, + "ewc_loss": 0.04694255068898201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3471275198971853e-05, + "grad_norm": 30.095314025878906, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8816413879394531, + "num_tokens": 653555871.0, + "step": 17130 + }, + { + "epoch": 2.1792392825340285, + "ewc_loss": 0.04689060524106026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.344530184927862e-05, + "grad_norm": 30.052974700927734, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8727244734764099, + "num_tokens": 653598825.0, + "step": 17131 + }, + { + "epoch": 2.179366492812619, + "ewc_loss": 0.0469839833676815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3491991669288836e-05, + "grad_norm": 30.223007202148438, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8795981407165527, + "num_tokens": 653639838.0, + "step": 17132 + }, + { + "epoch": 2.1794937030912096, + "ewc_loss": 0.046913642436265945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3456821509171277e-05, + "grad_norm": 30.09690284729004, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8628268241882324, + "num_tokens": 653678958.0, + "step": 17133 + }, + { + "epoch": 2.1796209133698, + "ewc_loss": 0.04695623368024826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347811641811859e-05, + "grad_norm": 30.27703094482422, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8792096376419067, + "num_tokens": 653718043.0, + "step": 17134 + }, + { + "epoch": 2.1797481236483907, + "ewc_loss": 0.047009725123643875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3504862838308327e-05, + "grad_norm": 30.181547164916992, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8858284950256348, + "num_tokens": 653756308.0, + "step": 17135 + }, + { + "epoch": 2.179875333926981, + "ewc_loss": 0.04685112461447716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3425562176271342e-05, + "grad_norm": 30.091983795166016, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8795724511146545, + "num_tokens": 653793140.0, + "step": 17136 + }, + { + "epoch": 2.1800025442055717, + "ewc_loss": 0.04697023332118988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3485115889343433e-05, + "grad_norm": 30.145891189575195, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8764630556106567, + "num_tokens": 653823441.0, + "step": 17137 + }, + { + "epoch": 2.1801297544841622, + "ewc_loss": 0.046928782016038895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.346439032407943e-05, + "grad_norm": 30.15055274963379, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8615251779556274, + "num_tokens": 653860743.0, + "step": 17138 + }, + { + "epoch": 2.1802569647627528, + "ewc_loss": 0.04697675630450249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.348837733734399e-05, + "grad_norm": 30.149497985839844, + "learning_rate": 1e-06, + "loss": 0.3715, + 
"mean_token_accuracy": 0.8833214044570923, + "num_tokens": 653894785.0, + "step": 17139 + }, + { + "epoch": 2.1803841750413433, + "ewc_loss": 0.04697470739483833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3487353246309794e-05, + "grad_norm": 30.192649841308594, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8677283525466919, + "num_tokens": 653932208.0, + "step": 17140 + }, + { + "epoch": 2.180511385319934, + "ewc_loss": 0.04693859443068504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34692979574902e-05, + "grad_norm": 30.152097702026367, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8569431304931641, + "num_tokens": 653968818.0, + "step": 17141 + }, + { + "epoch": 2.1806385955985244, + "ewc_loss": 0.04683602601289749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3418013370246626e-05, + "grad_norm": 30.127479553222656, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8683189153671265, + "num_tokens": 654010696.0, + "step": 17142 + }, + { + "epoch": 2.180765805877115, + "ewc_loss": 0.04692329466342926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3461647288058884e-05, + "grad_norm": 30.15184783935547, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8754540681838989, + "num_tokens": 654047632.0, + "step": 17143 + }, + { + "epoch": 2.1808930161557054, + "ewc_loss": 0.046912822872400284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.345641223655548e-05, + "grad_norm": 30.061941146850586, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.87549889087677, + "num_tokens": 654088200.0, + "step": 17144 + }, + { + "epoch": 2.181020226434296, + "ewc_loss": 0.046953845769166946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3476923161069863e-05, + "grad_norm": 30.07648468017578, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8683995008468628, + "num_tokens": 654124225.0, + "step": 17145 + }, + { + "epoch": 
2.1811474367128865, + "ewc_loss": 0.04694814234972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3474070985685103e-05, + "grad_norm": 30.00143814086914, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8740041851997375, + "num_tokens": 654163317.0, + "step": 17146 + }, + { + "epoch": 2.181274646991477, + "ewc_loss": 0.04704177752137184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3520888134953566e-05, + "grad_norm": 30.135820388793945, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8633930087089539, + "num_tokens": 654199208.0, + "step": 17147 + }, + { + "epoch": 2.1814018572700675, + "ewc_loss": 0.04707881435751915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3539407266071066e-05, + "grad_norm": 30.1256103515625, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8598464727401733, + "num_tokens": 654239988.0, + "step": 17148 + }, + { + "epoch": 2.181529067548658, + "ewc_loss": 0.047032855451107025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351642797293607e-05, + "grad_norm": 30.086626052856445, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.865658164024353, + "num_tokens": 654276361.0, + "step": 17149 + }, + { + "epoch": 2.1816562778272486, + "ewc_loss": 0.047000203281641006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3500102543039247e-05, + "grad_norm": 29.982179641723633, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8734076023101807, + "num_tokens": 654316690.0, + "step": 17150 + }, + { + "epoch": 2.181783488105839, + "ewc_loss": 0.04706617817282677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353308991587255e-05, + "grad_norm": 30.142715454101562, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8802355527877808, + "num_tokens": 654354112.0, + "step": 17151 + }, + { + "epoch": 2.1819106983844296, + "ewc_loss": 0.047114837914705276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3557418899144977e-05, + "grad_norm": 30.121337890625, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8821725845336914, + "num_tokens": 654398896.0, + "step": 17152 + }, + { + "epoch": 2.18203790866302, + "ewc_loss": 0.0469806008040905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349030000914354e-05, + "grad_norm": 30.112119674682617, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8837403059005737, + "num_tokens": 654435180.0, + "step": 17153 + }, + { + "epoch": 2.1821651189416107, + "ewc_loss": 0.047051843255758286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3525921278633177e-05, + "grad_norm": 30.16400146484375, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8591821193695068, + "num_tokens": 654473958.0, + "step": 17154 + }, + { + "epoch": 2.1822923292202008, + "ewc_loss": 0.0470409020781517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3520451577496715e-05, + "grad_norm": 30.123048782348633, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8563655018806458, + "num_tokens": 654512918.0, + "step": 17155 + }, + { + "epoch": 2.1824195394987913, + "ewc_loss": 0.047094494104385376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354724711040035e-05, + "grad_norm": 30.10441780090332, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8779780864715576, + "num_tokens": 654552296.0, + "step": 17156 + }, + { + "epoch": 2.182546749777382, + "ewc_loss": 0.047078248113393784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3539123503724113e-05, + "grad_norm": 30.237485885620117, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8772803544998169, + "num_tokens": 654592088.0, + "step": 17157 + }, + { + "epoch": 2.1826739600559724, + "ewc_loss": 0.047068532556295395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3534266802016646e-05, + "grad_norm": 30.043670654296875, + "learning_rate": 1e-06, + "loss": 0.4547, + 
"mean_token_accuracy": 0.8627308011054993, + "num_tokens": 654635119.0, + "step": 17158 + }, + { + "epoch": 2.182801170334563, + "ewc_loss": 0.04691390320658684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3456952476408333e-05, + "grad_norm": 30.064327239990234, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8585422039031982, + "num_tokens": 654671204.0, + "step": 17159 + }, + { + "epoch": 2.1829283806131534, + "ewc_loss": 0.0471322126686573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3566106392536312e-05, + "grad_norm": 30.155065536499023, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8698964715003967, + "num_tokens": 654702986.0, + "step": 17160 + }, + { + "epoch": 2.183055590891744, + "ewc_loss": 0.04695086181163788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3475431589758955e-05, + "grad_norm": 30.036500930786133, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.847522497177124, + "num_tokens": 654745813.0, + "step": 17161 + }, + { + "epoch": 2.1831828011703345, + "ewc_loss": 0.047147367149591446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3573684302391484e-05, + "grad_norm": 30.17633628845215, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8714556694030762, + "num_tokens": 654783305.0, + "step": 17162 + }, + { + "epoch": 2.183310011448925, + "ewc_loss": 0.04717109352350235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3585545932292007e-05, + "grad_norm": 30.086196899414062, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8789862990379333, + "num_tokens": 654823427.0, + "step": 17163 + }, + { + "epoch": 2.1834372217275155, + "ewc_loss": 0.04698631912469864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3493159460485913e-05, + "grad_norm": 30.109800338745117, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8848874568939209, + "num_tokens": 654862400.0, + "step": 17164 + }, + { + "epoch": 
2.183564432006106, + "ewc_loss": 0.04712885618209839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.356442746531684e-05, + "grad_norm": 30.205034255981445, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8744821548461914, + "num_tokens": 654902484.0, + "step": 17165 + }, + { + "epoch": 2.1836916422846966, + "ewc_loss": 0.047050412744283676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3525206415797584e-05, + "grad_norm": 30.008594512939453, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.877301812171936, + "num_tokens": 654946882.0, + "step": 17166 + }, + { + "epoch": 2.183818852563287, + "ewc_loss": 0.047066524624824524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353326271986589e-05, + "grad_norm": 30.235397338867188, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8612486124038696, + "num_tokens": 654977850.0, + "step": 17167 + }, + { + "epoch": 2.1839460628418776, + "ewc_loss": 0.0470660962164402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353304807911627e-05, + "grad_norm": 30.106830596923828, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8575750589370728, + "num_tokens": 655015352.0, + "step": 17168 + }, + { + "epoch": 2.184073273120468, + "ewc_loss": 0.04703991487622261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351995681237895e-05, + "grad_norm": 30.15633201599121, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8716585040092468, + "num_tokens": 655047488.0, + "step": 17169 + }, + { + "epoch": 2.1842004833990587, + "ewc_loss": 0.04708840698003769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3544203941128217e-05, + "grad_norm": 30.18794822692871, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.867385745048523, + "num_tokens": 655083410.0, + "step": 17170 + }, + { + "epoch": 2.1843276936776492, + "ewc_loss": 0.047024864703416824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.351243165321648e-05, + "grad_norm": 30.09840965270996, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8779483437538147, + "num_tokens": 655120594.0, + "step": 17171 + }, + { + "epoch": 2.1844549039562398, + "ewc_loss": 0.047040633857250214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3520316972280852e-05, + "grad_norm": 30.110347747802734, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.869196891784668, + "num_tokens": 655157310.0, + "step": 17172 + }, + { + "epoch": 2.1845821142348303, + "ewc_loss": 0.047055765986442566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35278821492102e-05, + "grad_norm": 30.147647857666016, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8792873620986938, + "num_tokens": 655193749.0, + "step": 17173 + }, + { + "epoch": 2.184709324513421, + "ewc_loss": 0.04710526391863823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3552631319034845e-05, + "grad_norm": 30.249116897583008, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8853636980056763, + "num_tokens": 655235532.0, + "step": 17174 + }, + { + "epoch": 2.1848365347920113, + "ewc_loss": 0.04705614969134331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3528074962086976e-05, + "grad_norm": 30.063613891601562, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.875848650932312, + "num_tokens": 655282154.0, + "step": 17175 + }, + { + "epoch": 2.184963745070602, + "ewc_loss": 0.047020699828863144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3510350729338825e-05, + "grad_norm": 30.178752899169922, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8647565841674805, + "num_tokens": 655321342.0, + "step": 17176 + }, + { + "epoch": 2.1850909553491924, + "ewc_loss": 0.04708138480782509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3540693291579373e-05, + "grad_norm": 30.11038589477539, + "learning_rate": 1e-06, + "loss": 0.3742, + 
"mean_token_accuracy": 0.8880763053894043, + "num_tokens": 655358107.0, + "step": 17177 + }, + { + "epoch": 2.185218165627783, + "ewc_loss": 0.04700983688235283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3504919226979837e-05, + "grad_norm": 30.193477630615234, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8657703995704651, + "num_tokens": 655393462.0, + "step": 17178 + }, + { + "epoch": 2.1853453759063735, + "ewc_loss": 0.04704640433192253, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3523201889474876e-05, + "grad_norm": 30.100051879882812, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8721678256988525, + "num_tokens": 655436114.0, + "step": 17179 + }, + { + "epoch": 2.1854725861849635, + "ewc_loss": 0.04690060392022133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.345030225114897e-05, + "grad_norm": 30.146812438964844, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8732188940048218, + "num_tokens": 655473119.0, + "step": 17180 + }, + { + "epoch": 2.185599796463554, + "ewc_loss": 0.04710860177874565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35543011513073e-05, + "grad_norm": 30.10334587097168, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8660749197006226, + "num_tokens": 655515537.0, + "step": 17181 + }, + { + "epoch": 2.1857270067421446, + "ewc_loss": 0.047004081308841705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3502039766754024e-05, + "grad_norm": 30.1635684967041, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8651560544967651, + "num_tokens": 655555944.0, + "step": 17182 + }, + { + "epoch": 2.185854217020735, + "ewc_loss": 0.04706687107682228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3533435523859225e-05, + "grad_norm": 30.13713836669922, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8541170358657837, + "num_tokens": 655589952.0, + "step": 17183 + }, + { + "epoch": 
2.1859814272993257, + "ewc_loss": 0.04702384024858475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3511920517194085e-05, + "grad_norm": 30.112085342407227, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8841841220855713, + "num_tokens": 655628269.0, + "step": 17184 + }, + { + "epoch": 2.186108637577916, + "ewc_loss": 0.04707462340593338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353731179027818e-05, + "grad_norm": 30.23208999633789, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8744769096374512, + "num_tokens": 655657315.0, + "step": 17185 + }, + { + "epoch": 2.1862358478565067, + "ewc_loss": 0.04703937843441963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351968942093663e-05, + "grad_norm": 29.99201202392578, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8712929487228394, + "num_tokens": 655696117.0, + "step": 17186 + }, + { + "epoch": 2.1863630581350972, + "ewc_loss": 0.04699984937906265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34999242820777e-05, + "grad_norm": 30.218704223632812, + "learning_rate": 1e-06, + "loss": 0.4737, + "mean_token_accuracy": 0.8531182408332825, + "num_tokens": 655738823.0, + "step": 17187 + }, + { + "epoch": 2.1864902684136878, + "ewc_loss": 0.0471947118639946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.359735663048923e-05, + "grad_norm": 30.148765563964844, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8614436388015747, + "num_tokens": 655781457.0, + "step": 17188 + }, + { + "epoch": 2.1866174786922783, + "ewc_loss": 0.0470559187233448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.352795854676515e-05, + "grad_norm": 30.117008209228516, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8655326962471008, + "num_tokens": 655824704.0, + "step": 17189 + }, + { + "epoch": 2.186744688970869, + "ewc_loss": 0.04716261103749275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.358130586799234e-05, + "grad_norm": 30.205501556396484, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8801037669181824, + "num_tokens": 655860013.0, + "step": 17190 + }, + { + "epoch": 2.1868718992494594, + "ewc_loss": 0.04708658531308174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354329262743704e-05, + "grad_norm": 30.231966018676758, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8689857721328735, + "num_tokens": 655897146.0, + "step": 17191 + }, + { + "epoch": 2.18699910952805, + "ewc_loss": 0.04706788435578346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3533943021902815e-05, + "grad_norm": 30.040416717529297, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8766878843307495, + "num_tokens": 655934299.0, + "step": 17192 + }, + { + "epoch": 2.1871263198066404, + "ewc_loss": 0.0470048151910305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.350240720261354e-05, + "grad_norm": 30.189802169799805, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8752768039703369, + "num_tokens": 655976554.0, + "step": 17193 + }, + { + "epoch": 2.187253530085231, + "ewc_loss": 0.047120045870542526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3560023691970855e-05, + "grad_norm": 30.08988380432129, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8825361728668213, + "num_tokens": 656011756.0, + "step": 17194 + }, + { + "epoch": 2.1873807403638215, + "ewc_loss": 0.04698929190635681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349464557482861e-05, + "grad_norm": 30.188793182373047, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8683229088783264, + "num_tokens": 656054405.0, + "step": 17195 + }, + { + "epoch": 2.187507950642412, + "ewc_loss": 0.04701966419816017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3509832317358814e-05, + "grad_norm": 30.02147674560547, + "learning_rate": 1e-06, + "loss": 0.4015, + 
"mean_token_accuracy": 0.8754240274429321, + "num_tokens": 656094668.0, + "step": 17196 + }, + { + "epoch": 2.1876351609210025, + "ewc_loss": 0.04710350185632706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.355175092816353e-05, + "grad_norm": 30.317237854003906, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8711397647857666, + "num_tokens": 656137664.0, + "step": 17197 + }, + { + "epoch": 2.187762371199593, + "ewc_loss": 0.047059111297130585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3529555619461462e-05, + "grad_norm": 30.3001708984375, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8634382486343384, + "num_tokens": 656180611.0, + "step": 17198 + }, + { + "epoch": 2.1878895814781836, + "ewc_loss": 0.04698807746171951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3494038032367826e-05, + "grad_norm": 30.1227970123291, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8627949953079224, + "num_tokens": 656217789.0, + "step": 17199 + }, + { + "epoch": 2.188016791756774, + "ewc_loss": 0.04698902368545532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3494510969612747e-05, + "grad_norm": 30.199724197387695, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8678913116455078, + "num_tokens": 656261940.0, + "step": 17200 + }, + { + "epoch": 2.1881440020353646, + "ewc_loss": 0.04701028764247894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3505142962676473e-05, + "grad_norm": 29.9868221282959, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8793003559112549, + "num_tokens": 656296000.0, + "step": 17201 + }, + { + "epoch": 2.188271212313955, + "ewc_loss": 0.047025326639413834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351266266487073e-05, + "grad_norm": 30.146512985229492, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8808585405349731, + "num_tokens": 656340862.0, + "step": 17202 + }, + { + "epoch": 
2.1883984225925452, + "ewc_loss": 0.047095078974962234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354753996769432e-05, + "grad_norm": 30.227323532104492, + "learning_rate": 1e-06, + "loss": 0.3467, + "mean_token_accuracy": 0.8951171636581421, + "num_tokens": 656376923.0, + "step": 17203 + }, + { + "epoch": 2.188525632871136, + "ewc_loss": 0.0471031628549099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3551581762149e-05, + "grad_norm": 30.16029930114746, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8682917952537537, + "num_tokens": 656414058.0, + "step": 17204 + }, + { + "epoch": 2.1886528431497263, + "ewc_loss": 0.046986356377601624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349317765037995e-05, + "grad_norm": 30.199615478515625, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8619794845581055, + "num_tokens": 656450077.0, + "step": 17205 + }, + { + "epoch": 2.188780053428317, + "ewc_loss": 0.04705601930618286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3528009478468448e-05, + "grad_norm": 30.199481964111328, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8627546429634094, + "num_tokens": 656487258.0, + "step": 17206 + }, + { + "epoch": 2.1889072637069074, + "ewc_loss": 0.0470396913588047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3519845854025334e-05, + "grad_norm": 30.15130615234375, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8847408294677734, + "num_tokens": 656525141.0, + "step": 17207 + }, + { + "epoch": 2.189034473985498, + "ewc_loss": 0.04701932147145271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.350966133235488e-05, + "grad_norm": 30.1168270111084, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8743898868560791, + "num_tokens": 656561517.0, + "step": 17208 + }, + { + "epoch": 2.1891616842640884, + "ewc_loss": 0.04701375216245651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3506876459578052e-05, + "grad_norm": 30.19242286682129, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8771629333496094, + "num_tokens": 656595859.0, + "step": 17209 + }, + { + "epoch": 2.189288894542679, + "ewc_loss": 0.04709333926439285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3546670490759425e-05, + "grad_norm": 30.154071807861328, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8680272102355957, + "num_tokens": 656627652.0, + "step": 17210 + }, + { + "epoch": 2.1894161048212695, + "ewc_loss": 0.04710882157087326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3554410290671512e-05, + "grad_norm": 30.35747528076172, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.877532422542572, + "num_tokens": 656669951.0, + "step": 17211 + }, + { + "epoch": 2.18954331509986, + "ewc_loss": 0.046995148062705994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349757414776832e-05, + "grad_norm": 30.200014114379883, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.882776141166687, + "num_tokens": 656702216.0, + "step": 17212 + }, + { + "epoch": 2.1896705253784505, + "ewc_loss": 0.04694771766662598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3473858163924888e-05, + "grad_norm": 30.119789123535156, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.860100507736206, + "num_tokens": 656744816.0, + "step": 17213 + }, + { + "epoch": 2.189797735657041, + "ewc_loss": 0.04704948514699936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.352474257349968e-05, + "grad_norm": 30.22658348083496, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8668021559715271, + "num_tokens": 656785533.0, + "step": 17214 + }, + { + "epoch": 2.1899249459356316, + "ewc_loss": 0.04703044146299362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3515220163972117e-05, + "grad_norm": 30.127302169799805, + "learning_rate": 1e-06, + "loss": 0.4231, + 
"mean_token_accuracy": 0.869637131690979, + "num_tokens": 656825375.0, + "step": 17215 + }, + { + "epoch": 2.190052156214222, + "ewc_loss": 0.047054387629032135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.352719457121566e-05, + "grad_norm": 30.24901008605957, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8785808086395264, + "num_tokens": 656857951.0, + "step": 17216 + }, + { + "epoch": 2.1901793664928126, + "ewc_loss": 0.047110848128795624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3555423467769288e-05, + "grad_norm": 30.17601203918457, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8796746730804443, + "num_tokens": 656896911.0, + "step": 17217 + }, + { + "epoch": 2.190306576771403, + "ewc_loss": 0.04702410846948624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3512053303420544e-05, + "grad_norm": 30.164142608642578, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8786046504974365, + "num_tokens": 656935723.0, + "step": 17218 + }, + { + "epoch": 2.1904337870499937, + "ewc_loss": 0.04706618934869766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3533095372840762e-05, + "grad_norm": 30.110151290893555, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8825463056564331, + "num_tokens": 656970881.0, + "step": 17219 + }, + { + "epoch": 2.1905609973285842, + "ewc_loss": 0.04701464995741844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3507325749960728e-05, + "grad_norm": 30.20785140991211, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8666319251060486, + "num_tokens": 657007721.0, + "step": 17220 + }, + { + "epoch": 2.1906882076071748, + "ewc_loss": 0.047095637768507004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3547818273073062e-05, + "grad_norm": 30.201553344726562, + "learning_rate": 1e-06, + "loss": 0.3687, + "mean_token_accuracy": 0.8811221122741699, + "num_tokens": 657042359.0, + "step": 17221 + }, + { + "epoch": 
2.1908154178857653, + "ewc_loss": 0.04704475775361061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3522379706264473e-05, + "grad_norm": 30.11875343322754, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8760945796966553, + "num_tokens": 657084662.0, + "step": 17222 + }, + { + "epoch": 2.190942628164356, + "ewc_loss": 0.047039855271577835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3519927708548494e-05, + "grad_norm": 30.161588668823242, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8735930919647217, + "num_tokens": 657122002.0, + "step": 17223 + }, + { + "epoch": 2.1910698384429463, + "ewc_loss": 0.04708633944392204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35431689361576e-05, + "grad_norm": 30.166677474975586, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8722950220108032, + "num_tokens": 657158768.0, + "step": 17224 + }, + { + "epoch": 2.191197048721537, + "ewc_loss": 0.0470573790371418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3528689780505374e-05, + "grad_norm": 30.15300178527832, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8594862818717957, + "num_tokens": 657193445.0, + "step": 17225 + }, + { + "epoch": 2.1913242590001274, + "ewc_loss": 0.0470045804977417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3502290787291713e-05, + "grad_norm": 30.030370712280273, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8708888292312622, + "num_tokens": 657234922.0, + "step": 17226 + }, + { + "epoch": 2.191451469278718, + "ewc_loss": 0.04701122269034386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3505610442953184e-05, + "grad_norm": 30.05401039123535, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8623837232589722, + "num_tokens": 657270774.0, + "step": 17227 + }, + { + "epoch": 2.191578679557308, + "ewc_loss": 0.04714352637529373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.357176344958134e-05, + "grad_norm": 30.233478546142578, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8867859244346619, + "num_tokens": 657307046.0, + "step": 17228 + }, + { + "epoch": 2.1917058898358985, + "ewc_loss": 0.04707087576389313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353543823119253e-05, + "grad_norm": 30.040966033935547, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.861495852470398, + "num_tokens": 657352865.0, + "step": 17229 + }, + { + "epoch": 2.191833100114489, + "ewc_loss": 0.0470946803689003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354733987885993e-05, + "grad_norm": 30.197919845581055, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8512841463088989, + "num_tokens": 657393372.0, + "step": 17230 + }, + { + "epoch": 2.1919603103930796, + "ewc_loss": 0.047097012400627136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3548505851067603e-05, + "grad_norm": 30.08578872680664, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.8517336845397949, + "num_tokens": 657432445.0, + "step": 17231 + }, + { + "epoch": 2.19208752067167, + "ewc_loss": 0.04708971828222275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3544858777313493e-05, + "grad_norm": 30.15118980407715, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8692509531974792, + "num_tokens": 657469145.0, + "step": 17232 + }, + { + "epoch": 2.1922147309502606, + "ewc_loss": 0.047171495854854584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35857478401158e-05, + "grad_norm": 30.192359924316406, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8856546878814697, + "num_tokens": 657510369.0, + "step": 17233 + }, + { + "epoch": 2.192341941228851, + "ewc_loss": 0.04700518026947975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3502590920543298e-05, + "grad_norm": 30.055246353149414, + "learning_rate": 1e-06, + "loss": 0.3963, + 
"mean_token_accuracy": 0.878241777420044, + "num_tokens": 657550257.0, + "step": 17234 + }, + { + "epoch": 2.1924691515074417, + "ewc_loss": 0.047123976051807404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3561988200526685e-05, + "grad_norm": 30.211517333984375, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8735727071762085, + "num_tokens": 657587095.0, + "step": 17235 + }, + { + "epoch": 2.1925963617860322, + "ewc_loss": 0.047089073807001114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3544536816189066e-05, + "grad_norm": 30.177547454833984, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8751249313354492, + "num_tokens": 657619710.0, + "step": 17236 + }, + { + "epoch": 2.1927235720646228, + "ewc_loss": 0.04705791920423508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35289589909371e-05, + "grad_norm": 30.306133270263672, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8664877414703369, + "num_tokens": 657653316.0, + "step": 17237 + }, + { + "epoch": 2.1928507823432133, + "ewc_loss": 0.047035183757543564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.351759212615434e-05, + "grad_norm": 30.119783401489258, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8667007684707642, + "num_tokens": 657693570.0, + "step": 17238 + }, + { + "epoch": 2.192977992621804, + "ewc_loss": 0.04707668721675873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3538343157269992e-05, + "grad_norm": 30.264780044555664, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8722413778305054, + "num_tokens": 657736781.0, + "step": 17239 + }, + { + "epoch": 2.1931052029003943, + "ewc_loss": 0.04707346111536026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3536729713669047e-05, + "grad_norm": 30.178239822387695, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.863366425037384, + "num_tokens": 657775603.0, + "step": 17240 + }, + { + "epoch": 
2.193232413178985, + "ewc_loss": 0.04707080125808716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3535400032415055e-05, + "grad_norm": 30.279327392578125, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8690215945243835, + "num_tokens": 657809065.0, + "step": 17241 + }, + { + "epoch": 2.1933596234575754, + "ewc_loss": 0.04711437225341797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3557186068501323e-05, + "grad_norm": 30.260345458984375, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8693521022796631, + "num_tokens": 657851369.0, + "step": 17242 + }, + { + "epoch": 2.193486833736166, + "ewc_loss": 0.04701010882854462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35050538321957e-05, + "grad_norm": 30.151063919067383, + "learning_rate": 1e-06, + "loss": 0.3581, + "mean_token_accuracy": 0.8930461406707764, + "num_tokens": 657889008.0, + "step": 17243 + }, + { + "epoch": 2.1936140440147565, + "ewc_loss": 0.0471000000834465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3549999241367914e-05, + "grad_norm": 30.308324813842773, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8697161078453064, + "num_tokens": 657925867.0, + "step": 17244 + }, + { + "epoch": 2.193741254293347, + "ewc_loss": 0.04698166623711586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3490832973038778e-05, + "grad_norm": 30.16461753845215, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8613491058349609, + "num_tokens": 657959502.0, + "step": 17245 + }, + { + "epoch": 2.1938684645719375, + "ewc_loss": 0.04692792519927025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.34639628615696e-05, + "grad_norm": 30.274805068969727, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.874326229095459, + "num_tokens": 657999063.0, + "step": 17246 + }, + { + "epoch": 2.193995674850528, + "ewc_loss": 0.047050464898347855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3525231881649233e-05, + "grad_norm": 30.24197769165039, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8613802194595337, + "num_tokens": 658033016.0, + "step": 17247 + }, + { + "epoch": 2.1941228851291186, + "ewc_loss": 0.04696090891957283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3480453819502145e-05, + "grad_norm": 30.241783142089844, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8773198127746582, + "num_tokens": 658071259.0, + "step": 17248 + }, + { + "epoch": 2.194250095407709, + "ewc_loss": 0.04702300950884819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3511503968620673e-05, + "grad_norm": 30.283124923706055, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.872843861579895, + "num_tokens": 658103190.0, + "step": 17249 + }, + { + "epoch": 2.1943773056862996, + "ewc_loss": 0.04696136713027954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3480683012166992e-05, + "grad_norm": 30.074831008911133, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8641737699508667, + "num_tokens": 658143346.0, + "step": 17250 + }, + { + "epoch": 2.19450451596489, + "ewc_loss": 0.04703172668814659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3515864086220972e-05, + "grad_norm": 30.190765380859375, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8813075423240662, + "num_tokens": 658181747.0, + "step": 17251 + }, + { + "epoch": 2.1946317262434807, + "ewc_loss": 0.04705718159675598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3528591555077583e-05, + "grad_norm": 30.075885772705078, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.872260332107544, + "num_tokens": 658221770.0, + "step": 17252 + }, + { + "epoch": 2.1947589365220708, + "ewc_loss": 0.04700328782200813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.350164322706405e-05, + "grad_norm": 30.11150550842285, + "learning_rate": 1e-06, + "loss": 0.4053, + 
"mean_token_accuracy": 0.875668466091156, + "num_tokens": 658260157.0, + "step": 17253 + }, + { + "epoch": 2.1948861468006613, + "ewc_loss": 0.04712711274623871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.356355616939254e-05, + "grad_norm": 30.171871185302734, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.861195981502533, + "num_tokens": 658301815.0, + "step": 17254 + }, + { + "epoch": 2.195013357079252, + "ewc_loss": 0.047084275633096695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354213756916579e-05, + "grad_norm": 30.171884536743164, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8733551502227783, + "num_tokens": 658340732.0, + "step": 17255 + }, + { + "epoch": 2.1951405673578424, + "ewc_loss": 0.047062527388334274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353126365051139e-05, + "grad_norm": 30.181982040405273, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8659647703170776, + "num_tokens": 658380569.0, + "step": 17256 + }, + { + "epoch": 2.195267777636433, + "ewc_loss": 0.04720109701156616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3600548956892453e-05, + "grad_norm": 30.220754623413086, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8836438059806824, + "num_tokens": 658415842.0, + "step": 17257 + }, + { + "epoch": 2.1953949879150234, + "ewc_loss": 0.04708435386419296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3542177586932667e-05, + "grad_norm": 30.185543060302734, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8753771781921387, + "num_tokens": 658452861.0, + "step": 17258 + }, + { + "epoch": 2.195522198193614, + "ewc_loss": 0.04720963165163994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3604816306033172e-05, + "grad_norm": 30.26187515258789, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8738936185836792, + "num_tokens": 658487019.0, + "step": 17259 + }, + { + "epoch": 
2.1956494084722045, + "ewc_loss": 0.04704798758029938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.352399314986542e-05, + "grad_norm": 30.131961822509766, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8697556257247925, + "num_tokens": 658529290.0, + "step": 17260 + }, + { + "epoch": 2.195776618750795, + "ewc_loss": 0.04707163944840431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3535820218967274e-05, + "grad_norm": 30.195907592773438, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8658236861228943, + "num_tokens": 658572118.0, + "step": 17261 + }, + { + "epoch": 2.1959038290293855, + "ewc_loss": 0.0471356026828289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3567801690660417e-05, + "grad_norm": 30.262758255004883, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8685397505760193, + "num_tokens": 658615972.0, + "step": 17262 + }, + { + "epoch": 2.196031039307976, + "ewc_loss": 0.04707914590835571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353957279410679e-05, + "grad_norm": 30.107666015625, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8786462545394897, + "num_tokens": 658653848.0, + "step": 17263 + }, + { + "epoch": 2.1961582495865666, + "ewc_loss": 0.04712517559528351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3562588467029855e-05, + "grad_norm": 30.220073699951172, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8714179992675781, + "num_tokens": 658686247.0, + "step": 17264 + }, + { + "epoch": 2.196285459865157, + "ewc_loss": 0.04706219956278801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3531099941465072e-05, + "grad_norm": 30.09038543701172, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8677502274513245, + "num_tokens": 658725787.0, + "step": 17265 + }, + { + "epoch": 2.1964126701437476, + "ewc_loss": 0.047138139605522156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.356906952627469e-05, + "grad_norm": 30.203662872314453, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8680297136306763, + "num_tokens": 658765957.0, + "step": 17266 + }, + { + "epoch": 2.196539880422338, + "ewc_loss": 0.04714757576584816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3573787984787486e-05, + "grad_norm": 30.218463897705078, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8814714550971985, + "num_tokens": 658800996.0, + "step": 17267 + }, + { + "epoch": 2.1966670907009287, + "ewc_loss": 0.047110170125961304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3555085135740228e-05, + "grad_norm": 30.225725173950195, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8687582015991211, + "num_tokens": 658833779.0, + "step": 17268 + }, + { + "epoch": 2.196794300979519, + "ewc_loss": 0.04707454517483711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3537271772511303e-05, + "grad_norm": 30.155351638793945, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8751227855682373, + "num_tokens": 658874425.0, + "step": 17269 + }, + { + "epoch": 2.1969215112581097, + "ewc_loss": 0.04711871221661568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3559356122859754e-05, + "grad_norm": 30.143476486206055, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8800392746925354, + "num_tokens": 658912007.0, + "step": 17270 + }, + { + "epoch": 2.1970487215367003, + "ewc_loss": 0.04713050648570061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.356525328650605e-05, + "grad_norm": 30.22031593322754, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.862568736076355, + "num_tokens": 658956775.0, + "step": 17271 + }, + { + "epoch": 2.197175931815291, + "ewc_loss": 0.04712624475359917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3563123249914497e-05, + "grad_norm": 30.14238929748535, + "learning_rate": 1e-06, + "loss": 0.3821, + 
"mean_token_accuracy": 0.8840396404266357, + "num_tokens": 658997132.0, + "step": 17272 + }, + { + "epoch": 2.1973031420938813, + "ewc_loss": 0.04707220941781998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3536103981314227e-05, + "grad_norm": 30.236045837402344, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8782739639282227, + "num_tokens": 659040437.0, + "step": 17273 + }, + { + "epoch": 2.197430352372472, + "ewc_loss": 0.04718070104718208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.359034988330677e-05, + "grad_norm": 30.237564086914062, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.8866199254989624, + "num_tokens": 659074461.0, + "step": 17274 + }, + { + "epoch": 2.1975575626510624, + "ewc_loss": 0.04708125442266464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3540627807960846e-05, + "grad_norm": 30.132078170776367, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8689067363739014, + "num_tokens": 659113818.0, + "step": 17275 + }, + { + "epoch": 2.197684772929653, + "ewc_loss": 0.04707717150449753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3538585082860664e-05, + "grad_norm": 30.299558639526367, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8581302762031555, + "num_tokens": 659149297.0, + "step": 17276 + }, + { + "epoch": 2.1978119832082434, + "ewc_loss": 0.04706158861517906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3530794351245277e-05, + "grad_norm": 30.135902404785156, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8746469020843506, + "num_tokens": 659187500.0, + "step": 17277 + }, + { + "epoch": 2.1979391934868335, + "ewc_loss": 0.047079961746931076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3539980247733183e-05, + "grad_norm": 30.287954330444336, + "learning_rate": 1e-06, + "loss": 0.507, + "mean_token_accuracy": 0.8457885980606079, + "num_tokens": 659228624.0, + "step": 17278 + }, + { + "epoch": 
2.198066403765424, + "ewc_loss": 0.047095704823732376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354785283387173e-05, + "grad_norm": 30.145784378051758, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8793085813522339, + "num_tokens": 659270317.0, + "step": 17279 + }, + { + "epoch": 2.1981936140440146, + "ewc_loss": 0.04694822058081627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.347411100345198e-05, + "grad_norm": 30.24292755126953, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8709824681282043, + "num_tokens": 659302133.0, + "step": 17280 + }, + { + "epoch": 2.198320824322605, + "ewc_loss": 0.047017406672239304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3508702724939212e-05, + "grad_norm": 30.194461822509766, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8776251077651978, + "num_tokens": 659342469.0, + "step": 17281 + }, + { + "epoch": 2.1984480346011956, + "ewc_loss": 0.04710334539413452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3551672711619176e-05, + "grad_norm": 30.267274856567383, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8831122517585754, + "num_tokens": 659373786.0, + "step": 17282 + }, + { + "epoch": 2.198575244879786, + "ewc_loss": 0.04706989973783493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353495074203238e-05, + "grad_norm": 30.209768295288086, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8642175793647766, + "num_tokens": 659412498.0, + "step": 17283 + }, + { + "epoch": 2.1987024551583767, + "ewc_loss": 0.04698225110769272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349112583033275e-05, + "grad_norm": 30.300424575805664, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8568082451820374, + "num_tokens": 659446415.0, + "step": 17284 + }, + { + "epoch": 2.1988296654369672, + "ewc_loss": 0.04710361361503601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.355180731683504e-05, + "grad_norm": 30.20461654663086, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.862969160079956, + "num_tokens": 659488824.0, + "step": 17285 + }, + { + "epoch": 2.1989568757155578, + "ewc_loss": 0.04699090123176575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.349545138713438e-05, + "grad_norm": 30.211610794067383, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8750023245811462, + "num_tokens": 659523036.0, + "step": 17286 + }, + { + "epoch": 2.1990840859941483, + "ewc_loss": 0.04703376069664955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3516880901297554e-05, + "grad_norm": 30.02452850341797, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8653205633163452, + "num_tokens": 659562780.0, + "step": 17287 + }, + { + "epoch": 2.199211296272739, + "ewc_loss": 0.047064200043678284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3532100385637023e-05, + "grad_norm": 30.3061580657959, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.878269374370575, + "num_tokens": 659599364.0, + "step": 17288 + }, + { + "epoch": 2.1993385065513293, + "ewc_loss": 0.047061946243047714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3530972612206824e-05, + "grad_norm": 30.11542510986328, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8653260469436646, + "num_tokens": 659636275.0, + "step": 17289 + }, + { + "epoch": 2.19946571682992, + "ewc_loss": 0.04710346460342407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3551732738269493e-05, + "grad_norm": 30.29789161682129, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8722688555717468, + "num_tokens": 659670315.0, + "step": 17290 + }, + { + "epoch": 2.1995929271085104, + "ewc_loss": 0.047176919877529144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3588459953316487e-05, + "grad_norm": 30.18328285217285, + "learning_rate": 1e-06, + "loss": 0.4393, + 
"mean_token_accuracy": 0.8672202825546265, + "num_tokens": 659710968.0, + "step": 17291 + }, + { + "epoch": 2.199720137387101, + "ewc_loss": 0.04714395850896835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3571979909320362e-05, + "grad_norm": 30.25105094909668, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.880072832107544, + "num_tokens": 659751334.0, + "step": 17292 + }, + { + "epoch": 2.1998473476656915, + "ewc_loss": 0.04712453484535217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3562266505905427e-05, + "grad_norm": 30.176586151123047, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8713973164558411, + "num_tokens": 659793867.0, + "step": 17293 + }, + { + "epoch": 2.199974557944282, + "ewc_loss": 0.04718024656176567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3590122509631328e-05, + "grad_norm": 30.255069732666016, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8717761039733887, + "num_tokens": 659838724.0, + "step": 17294 + }, + { + "epoch": 2.2001017682228725, + "ewc_loss": 0.04715642333030701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.357821176701691e-05, + "grad_norm": 30.194772720336914, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8688154220581055, + "num_tokens": 659876589.0, + "step": 17295 + }, + { + "epoch": 2.200228978501463, + "ewc_loss": 0.0471138060092926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.355690230615437e-05, + "grad_norm": 30.105072021484375, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8627465963363647, + "num_tokens": 659914864.0, + "step": 17296 + }, + { + "epoch": 2.2003561887800536, + "ewc_loss": 0.047209225594997406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3604612579219975e-05, + "grad_norm": 30.29530906677246, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8783762454986572, + "num_tokens": 659952482.0, + "step": 17297 + }, + { + "epoch": 
2.200483399058644, + "ewc_loss": 0.047143593430519104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3571796191390604e-05, + "grad_norm": 30.131914138793945, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8712462186813354, + "num_tokens": 659985192.0, + "step": 17298 + }, + { + "epoch": 2.2006106093372346, + "ewc_loss": 0.047074321657419205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3537160814157687e-05, + "grad_norm": 30.22146224975586, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8788036704063416, + "num_tokens": 660019505.0, + "step": 17299 + }, + { + "epoch": 2.200737819615825, + "ewc_loss": 0.04710742086172104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3553709979751147e-05, + "grad_norm": 30.13129425048828, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8645974397659302, + "num_tokens": 660054942.0, + "step": 17300 + }, + { + "epoch": 2.2008650298944152, + "ewc_loss": 0.04718264564871788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.359132304263767e-05, + "grad_norm": 30.16473388671875, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8708289861679077, + "num_tokens": 660092836.0, + "step": 17301 + }, + { + "epoch": 2.200992240173006, + "ewc_loss": 0.047172870486974716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.358643541811034e-05, + "grad_norm": 30.194910049438477, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8658016324043274, + "num_tokens": 660129419.0, + "step": 17302 + }, + { + "epoch": 2.2011194504515963, + "ewc_loss": 0.047142162919044495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.357108132855501e-05, + "grad_norm": 30.232141494750977, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8744343519210815, + "num_tokens": 660168116.0, + "step": 17303 + }, + { + "epoch": 2.201246660730187, + "ewc_loss": 0.04711947962641716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3559739929623902e-05, + "grad_norm": 30.119356155395508, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8651082515716553, + "num_tokens": 660208006.0, + "step": 17304 + }, + { + "epoch": 2.2013738710087773, + "ewc_loss": 0.04717203229665756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3586017050547525e-05, + "grad_norm": 30.267955780029297, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8714184761047363, + "num_tokens": 660254046.0, + "step": 17305 + }, + { + "epoch": 2.201501081287368, + "ewc_loss": 0.047176800668239594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.358839992666617e-05, + "grad_norm": 30.21048355102539, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8706194162368774, + "num_tokens": 660289321.0, + "step": 17306 + }, + { + "epoch": 2.2016282915659584, + "ewc_loss": 0.047144919633865356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3572460122522898e-05, + "grad_norm": 30.113435745239258, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8681991696357727, + "num_tokens": 660325052.0, + "step": 17307 + }, + { + "epoch": 2.201755501844549, + "ewc_loss": 0.04731258004903793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3656290068174712e-05, + "grad_norm": 30.258840560913086, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8623665571212769, + "num_tokens": 660357942.0, + "step": 17308 + }, + { + "epoch": 2.2018827121231395, + "ewc_loss": 0.04715374857187271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3576874809805304e-05, + "grad_norm": 30.21966552734375, + "learning_rate": 1e-06, + "loss": 0.5004, + "mean_token_accuracy": 0.8458422422409058, + "num_tokens": 660390654.0, + "step": 17309 + }, + { + "epoch": 2.20200992240173, + "ewc_loss": 0.04718821123242378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3594106096425094e-05, + "grad_norm": 30.109561920166016, + "learning_rate": 1e-06, + "loss": 0.4146, + 
"mean_token_accuracy": 0.8713113069534302, + "num_tokens": 660432020.0, + "step": 17310 + }, + { + "epoch": 2.2021371326803205, + "ewc_loss": 0.04712532460689545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35626630455954e-05, + "grad_norm": 30.25470733642578, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.870747447013855, + "num_tokens": 660465051.0, + "step": 17311 + }, + { + "epoch": 2.202264342958911, + "ewc_loss": 0.047254886478185654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.362744271522388e-05, + "grad_norm": 30.143943786621094, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.867443323135376, + "num_tokens": 660504967.0, + "step": 17312 + }, + { + "epoch": 2.2023915532375016, + "ewc_loss": 0.047200463712215424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3600232452736236e-05, + "grad_norm": 30.244997024536133, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8618296384811401, + "num_tokens": 660543826.0, + "step": 17313 + }, + { + "epoch": 2.202518763516092, + "ewc_loss": 0.04726506397128105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3632532247575e-05, + "grad_norm": 30.03835678100586, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8665828704833984, + "num_tokens": 660583074.0, + "step": 17314 + }, + { + "epoch": 2.2026459737946826, + "ewc_loss": 0.047207310795784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.360365579079371e-05, + "grad_norm": 30.243852615356445, + "learning_rate": 1e-06, + "loss": 0.3638, + "mean_token_accuracy": 0.8874070048332214, + "num_tokens": 660619974.0, + "step": 17315 + }, + { + "epoch": 2.202773184073273, + "ewc_loss": 0.0472966693341732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3648333808523603e-05, + "grad_norm": 30.1943359375, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.87664794921875, + "num_tokens": 660654005.0, + "step": 17316 + }, + { + "epoch": 2.2029003943518637, + 
"ewc_loss": 0.0472058542072773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3602926376042888e-05, + "grad_norm": 30.194599151611328, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8590130805969238, + "num_tokens": 660689179.0, + "step": 17317 + }, + { + "epoch": 2.203027604630454, + "ewc_loss": 0.04729190468788147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.364595275139436e-05, + "grad_norm": 30.244403839111328, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8591735363006592, + "num_tokens": 660724923.0, + "step": 17318 + }, + { + "epoch": 2.2031548149090447, + "ewc_loss": 0.04731271043419838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365635555179324e-05, + "grad_norm": 30.148317337036133, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8742576837539673, + "num_tokens": 660760763.0, + "step": 17319 + }, + { + "epoch": 2.2032820251876353, + "ewc_loss": 0.04721147567033768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.360573853366077e-05, + "grad_norm": 30.146835327148438, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8716731667518616, + "num_tokens": 660795520.0, + "step": 17320 + }, + { + "epoch": 2.203409235466226, + "ewc_loss": 0.04735898599028587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3679493097006343e-05, + "grad_norm": 30.222410202026367, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8814065456390381, + "num_tokens": 660834858.0, + "step": 17321 + }, + { + "epoch": 2.2035364457448163, + "ewc_loss": 0.0473051592707634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365257932979148e-05, + "grad_norm": 30.169395446777344, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8796308040618896, + "num_tokens": 660870319.0, + "step": 17322 + }, + { + "epoch": 2.203663656023407, + "ewc_loss": 0.04724542051553726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3622709704795852e-05, + "grad_norm": 
30.215782165527344, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8582550287246704, + "num_tokens": 660907313.0, + "step": 17323 + }, + { + "epoch": 2.2037908663019974, + "ewc_loss": 0.04720847308635712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.360423604841344e-05, + "grad_norm": 30.196630477905273, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8858249187469482, + "num_tokens": 660948087.0, + "step": 17324 + }, + { + "epoch": 2.203918076580588, + "ewc_loss": 0.04729553684592247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.36477681028191e-05, + "grad_norm": 30.358226776123047, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8765164613723755, + "num_tokens": 660984984.0, + "step": 17325 + }, + { + "epoch": 2.204045286859178, + "ewc_loss": 0.04722928628325462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3614642486791126e-05, + "grad_norm": 30.265621185302734, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8691188097000122, + "num_tokens": 661022898.0, + "step": 17326 + }, + { + "epoch": 2.2041724971377685, + "ewc_loss": 0.0471515916287899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3575796149089e-05, + "grad_norm": 30.127967834472656, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8811691999435425, + "num_tokens": 661058645.0, + "step": 17327 + }, + { + "epoch": 2.204299707416359, + "ewc_loss": 0.04722709208726883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3613545636180788e-05, + "grad_norm": 30.24567222595215, + "learning_rate": 1e-06, + "loss": 0.3607, + "mean_token_accuracy": 0.8928921222686768, + "num_tokens": 661096954.0, + "step": 17328 + }, + { + "epoch": 2.2044269176949496, + "ewc_loss": 0.0472664013504982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.36331998166861e-05, + "grad_norm": 30.216440200805664, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.8601813316345215, + "num_tokens": 
661136947.0, + "step": 17329 + }, + { + "epoch": 2.20455412797354, + "ewc_loss": 0.04709259793162346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35462994169211e-05, + "grad_norm": 30.165367126464844, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8726170063018799, + "num_tokens": 661175376.0, + "step": 17330 + }, + { + "epoch": 2.2046813382521306, + "ewc_loss": 0.04726048558950424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3630242139915936e-05, + "grad_norm": 30.199636459350586, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8647748827934265, + "num_tokens": 661213238.0, + "step": 17331 + }, + { + "epoch": 2.204808548530721, + "ewc_loss": 0.04717787355184555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3588936528540216e-05, + "grad_norm": 30.18935203552246, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8868752121925354, + "num_tokens": 661246512.0, + "step": 17332 + }, + { + "epoch": 2.2049357588093117, + "ewc_loss": 0.04717118293046951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3585591407027096e-05, + "grad_norm": 30.184812545776367, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8815470933914185, + "num_tokens": 661288272.0, + "step": 17333 + }, + { + "epoch": 2.2050629690879022, + "ewc_loss": 0.047250453382730484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.362522718613036e-05, + "grad_norm": 30.25531768798828, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8772537112236023, + "num_tokens": 661328705.0, + "step": 17334 + }, + { + "epoch": 2.2051901793664928, + "ewc_loss": 0.04723968356847763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.361984115850646e-05, + "grad_norm": 30.08774757385254, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8759143948554993, + "num_tokens": 661369877.0, + "step": 17335 + }, + { + "epoch": 2.2053173896450833, + "ewc_loss": 0.04715310409665108, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3576552848680876e-05, + "grad_norm": 30.218873977661133, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8741199970245361, + "num_tokens": 661405209.0, + "step": 17336 + }, + { + "epoch": 2.205444599923674, + "ewc_loss": 0.04732643812894821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.366321859881282e-05, + "grad_norm": 30.230762481689453, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8755934238433838, + "num_tokens": 661441083.0, + "step": 17337 + }, + { + "epoch": 2.2055718102022643, + "ewc_loss": 0.04721692204475403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3608461560797878e-05, + "grad_norm": 30.28474998474121, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8735102415084839, + "num_tokens": 661476938.0, + "step": 17338 + }, + { + "epoch": 2.205699020480855, + "ewc_loss": 0.047245968133211136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3622984372195788e-05, + "grad_norm": 30.183910369873047, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8643218278884888, + "num_tokens": 661519281.0, + "step": 17339 + }, + { + "epoch": 2.2058262307594454, + "ewc_loss": 0.04722572863101959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.361286351515446e-05, + "grad_norm": 30.173221588134766, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8865141272544861, + "num_tokens": 661555880.0, + "step": 17340 + }, + { + "epoch": 2.205953441038036, + "ewc_loss": 0.047238875180482864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3619437342858873e-05, + "grad_norm": 30.258630752563477, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.862072229385376, + "num_tokens": 661595937.0, + "step": 17341 + }, + { + "epoch": 2.2060806513166265, + "ewc_loss": 0.047242265194654465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3621132640982978e-05, + "grad_norm": 30.15985107421875, + 
"learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8578343391418457, + "num_tokens": 661636868.0, + "step": 17342 + }, + { + "epoch": 2.206207861595217, + "ewc_loss": 0.047256603837013245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3628301278222352e-05, + "grad_norm": 30.152368545532227, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8642417192459106, + "num_tokens": 661681503.0, + "step": 17343 + }, + { + "epoch": 2.2063350718738075, + "ewc_loss": 0.04725714400410652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.362857230764348e-05, + "grad_norm": 30.29283905029297, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8749326467514038, + "num_tokens": 661718438.0, + "step": 17344 + }, + { + "epoch": 2.206462282152398, + "ewc_loss": 0.047173816710710526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3586908355355263e-05, + "grad_norm": 30.07475471496582, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.85160893201828, + "num_tokens": 661759048.0, + "step": 17345 + }, + { + "epoch": 2.2065894924309886, + "ewc_loss": 0.04724433645606041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3622167645953596e-05, + "grad_norm": 30.285566329956055, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8828597068786621, + "num_tokens": 661803131.0, + "step": 17346 + }, + { + "epoch": 2.206716702709579, + "ewc_loss": 0.04723704978823662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3618524210178293e-05, + "grad_norm": 30.156904220581055, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8724970817565918, + "num_tokens": 661838139.0, + "step": 17347 + }, + { + "epoch": 2.2068439129881696, + "ewc_loss": 0.047158099710941315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3579050321131945e-05, + "grad_norm": 30.40939712524414, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8648802042007446, + "num_tokens": 661878568.0, 
+ "step": 17348 + }, + { + "epoch": 2.20697112326676, + "ewc_loss": 0.04721138998866081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3605694877915084e-05, + "grad_norm": 30.117977142333984, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8652291893959045, + "num_tokens": 661909110.0, + "step": 17349 + }, + { + "epoch": 2.2070983335453507, + "ewc_loss": 0.0470801442861557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3540073016192764e-05, + "grad_norm": 30.33643913269043, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8849657773971558, + "num_tokens": 661944472.0, + "step": 17350 + }, + { + "epoch": 2.2072255438239408, + "ewc_loss": 0.0472109280526638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3605463866260834e-05, + "grad_norm": 30.17485809326172, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8840837478637695, + "num_tokens": 661985865.0, + "step": 17351 + }, + { + "epoch": 2.2073527541025313, + "ewc_loss": 0.04709547758102417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3547738237539306e-05, + "grad_norm": 30.261470794677734, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8664971590042114, + "num_tokens": 662019460.0, + "step": 17352 + }, + { + "epoch": 2.207479964381122, + "ewc_loss": 0.04730432853102684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365216460020747e-05, + "grad_norm": 30.356576919555664, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8787556886672974, + "num_tokens": 662059109.0, + "step": 17353 + }, + { + "epoch": 2.2076071746597123, + "ewc_loss": 0.04708347097039223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3541735572507605e-05, + "grad_norm": 30.187528610229492, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8641343712806702, + "num_tokens": 662096040.0, + "step": 17354 + }, + { + "epoch": 2.207734384938303, + "ewc_loss": 0.04708733782172203, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.3543669158243574e-05, + "grad_norm": 30.22484588623047, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8760965466499329, + "num_tokens": 662139351.0, + "step": 17355 + }, + { + "epoch": 2.2078615952168934, + "ewc_loss": 0.04721558839082718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3607793991686776e-05, + "grad_norm": 30.28436279296875, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8729190826416016, + "num_tokens": 662177353.0, + "step": 17356 + }, + { + "epoch": 2.207988805495484, + "ewc_loss": 0.04714074730873108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.357037374167703e-05, + "grad_norm": 30.356428146362305, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8608596324920654, + "num_tokens": 662209519.0, + "step": 17357 + }, + { + "epoch": 2.2081160157740745, + "ewc_loss": 0.047130122780799866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3565062292618677e-05, + "grad_norm": 30.188369750976562, + "learning_rate": 1e-06, + "loss": 0.4806, + "mean_token_accuracy": 0.8519883155822754, + "num_tokens": 662251311.0, + "step": 17358 + }, + { + "epoch": 2.208243226052665, + "ewc_loss": 0.04707631841301918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3538159439340234e-05, + "grad_norm": 30.33730125427246, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8781570196151733, + "num_tokens": 662282770.0, + "step": 17359 + }, + { + "epoch": 2.2083704363312555, + "ewc_loss": 0.047172289341688156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3586144379805773e-05, + "grad_norm": 30.25403594970703, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8673267364501953, + "num_tokens": 662326770.0, + "step": 17360 + }, + { + "epoch": 2.208497646609846, + "ewc_loss": 0.047064851969480515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3532425984740257e-05, + "grad_norm": 30.254487991333008, + "learning_rate": 1e-06, + "loss": 
0.4022, + "mean_token_accuracy": 0.8705693483352661, + "num_tokens": 662366811.0, + "step": 17361 + }, + { + "epoch": 2.2086248568884366, + "ewc_loss": 0.04709775000810623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3548875105916522e-05, + "grad_norm": 30.367773056030273, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8674383759498596, + "num_tokens": 662404517.0, + "step": 17362 + }, + { + "epoch": 2.208752067167027, + "ewc_loss": 0.04707261174917221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.353630588913802e-05, + "grad_norm": 30.298139572143555, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.877508819103241, + "num_tokens": 662442091.0, + "step": 17363 + }, + { + "epoch": 2.2088792774456176, + "ewc_loss": 0.0470881387591362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3544069335912354e-05, + "grad_norm": 30.288352966308594, + "learning_rate": 1e-06, + "loss": 0.4929, + "mean_token_accuracy": 0.846551775932312, + "num_tokens": 662483343.0, + "step": 17364 + }, + { + "epoch": 2.209006487724208, + "ewc_loss": 0.04700012877583504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3500064344261773e-05, + "grad_norm": 30.287826538085938, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8654404878616333, + "num_tokens": 662521994.0, + "step": 17365 + }, + { + "epoch": 2.2091336980027987, + "ewc_loss": 0.047122590243816376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3561295165563934e-05, + "grad_norm": 30.174577713012695, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8661478757858276, + "num_tokens": 662562737.0, + "step": 17366 + }, + { + "epoch": 2.209260908281389, + "ewc_loss": 0.04701793193817139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3508966478402726e-05, + "grad_norm": 30.25469207763672, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8672595024108887, + "num_tokens": 662596927.0, + "step": 17367 + }, + { + "epoch": 
2.2093881185599797, + "ewc_loss": 0.04716721922159195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.358360870857723e-05, + "grad_norm": 30.22602653503418, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8790971636772156, + "num_tokens": 662632831.0, + "step": 17368 + }, + { + "epoch": 2.2095153288385703, + "ewc_loss": 0.04705899581313133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.352949741180055e-05, + "grad_norm": 30.18254852294922, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8677420616149902, + "num_tokens": 662675554.0, + "step": 17369 + }, + { + "epoch": 2.209642539117161, + "ewc_loss": 0.047110430896282196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3555216102977283e-05, + "grad_norm": 30.21861457824707, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8831132650375366, + "num_tokens": 662720248.0, + "step": 17370 + }, + { + "epoch": 2.2097697493957513, + "ewc_loss": 0.04712330549955368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3561653506476432e-05, + "grad_norm": 30.258560180664062, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8716463446617126, + "num_tokens": 662761388.0, + "step": 17371 + }, + { + "epoch": 2.209896959674342, + "ewc_loss": 0.047119513154029846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3559756300528534e-05, + "grad_norm": 30.305435180664062, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8704472780227661, + "num_tokens": 662798007.0, + "step": 17372 + }, + { + "epoch": 2.2100241699529324, + "ewc_loss": 0.047084156423807144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354207754251547e-05, + "grad_norm": 30.165821075439453, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8677641153335571, + "num_tokens": 662836042.0, + "step": 17373 + }, + { + "epoch": 2.210151380231523, + "ewc_loss": 0.047176070511341095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.358803612878546e-05, + "grad_norm": 30.26629066467285, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8614819645881653, + "num_tokens": 662869465.0, + "step": 17374 + }, + { + "epoch": 2.2102785905101134, + "ewc_loss": 0.047191329300403595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3595664970343933e-05, + "grad_norm": 30.32681655883789, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8636540770530701, + "num_tokens": 662909440.0, + "step": 17375 + }, + { + "epoch": 2.2104058007887035, + "ewc_loss": 0.047185830771923065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.359291465836577e-05, + "grad_norm": 30.197534561157227, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8674143552780151, + "num_tokens": 662949637.0, + "step": 17376 + }, + { + "epoch": 2.210533011067294, + "ewc_loss": 0.04705596715211868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3527984012616798e-05, + "grad_norm": 30.184886932373047, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8686279058456421, + "num_tokens": 662987679.0, + "step": 17377 + }, + { + "epoch": 2.2106602213458846, + "ewc_loss": 0.047207657247781754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3603828594787046e-05, + "grad_norm": 30.277570724487305, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8527923822402954, + "num_tokens": 663027651.0, + "step": 17378 + }, + { + "epoch": 2.210787431624475, + "ewc_loss": 0.04708896577358246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.354448224650696e-05, + "grad_norm": 30.162452697753906, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8837313652038574, + "num_tokens": 663063708.0, + "step": 17379 + }, + { + "epoch": 2.2109146419030656, + "ewc_loss": 0.047160062938928604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3580030756420456e-05, + "grad_norm": 30.21552085876465, + "learning_rate": 1e-06, + "loss": 0.4631, + 
"mean_token_accuracy": 0.8638651371002197, + "num_tokens": 663097255.0, + "step": 17380 + }, + { + "epoch": 2.211041852181656, + "ewc_loss": 0.047166768461465836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3583384972880594e-05, + "grad_norm": 30.157773971557617, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8677937388420105, + "num_tokens": 663134496.0, + "step": 17381 + }, + { + "epoch": 2.2111690624602467, + "ewc_loss": 0.04720062389969826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3600312488269992e-05, + "grad_norm": 30.249631881713867, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8800714612007141, + "num_tokens": 663170278.0, + "step": 17382 + }, + { + "epoch": 2.211296272738837, + "ewc_loss": 0.047258537262678146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.362926898058504e-05, + "grad_norm": 30.24549674987793, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8677316308021545, + "num_tokens": 663205407.0, + "step": 17383 + }, + { + "epoch": 2.2114234830174277, + "ewc_loss": 0.047189339995384216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3594669983140193e-05, + "grad_norm": 30.224590301513672, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8745527267456055, + "num_tokens": 663243166.0, + "step": 17384 + }, + { + "epoch": 2.2115506932960183, + "ewc_loss": 0.04723082482814789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3615411919308826e-05, + "grad_norm": 30.322505950927734, + "learning_rate": 1e-06, + "loss": 0.3538, + "mean_token_accuracy": 0.8926998376846313, + "num_tokens": 663282267.0, + "step": 17385 + }, + { + "epoch": 2.211677903574609, + "ewc_loss": 0.04720833897590637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.360416874580551e-05, + "grad_norm": 30.251384735107422, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8624973893165588, + "num_tokens": 663318620.0, + "step": 17386 + }, + { + "epoch": 
2.2118051138531993, + "ewc_loss": 0.04712212458252907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.356106233492028e-05, + "grad_norm": 30.197174072265625, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8883053064346313, + "num_tokens": 663353256.0, + "step": 17387 + }, + { + "epoch": 2.21193232413179, + "ewc_loss": 0.04720345884561539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3601729481015354e-05, + "grad_norm": 30.290544509887695, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8743175268173218, + "num_tokens": 663386691.0, + "step": 17388 + }, + { + "epoch": 2.2120595344103804, + "ewc_loss": 0.047234535217285156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3617267288500443e-05, + "grad_norm": 30.212804794311523, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8669968843460083, + "num_tokens": 663416550.0, + "step": 17389 + }, + { + "epoch": 2.212186744688971, + "ewc_loss": 0.04716016724705696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.358008350711316e-05, + "grad_norm": 30.21883201599121, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8731721639633179, + "num_tokens": 663464912.0, + "step": 17390 + }, + { + "epoch": 2.2123139549675614, + "ewc_loss": 0.04729233682155609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3646169211133383e-05, + "grad_norm": 30.056549072265625, + "learning_rate": 1e-06, + "loss": 0.4578, + "mean_token_accuracy": 0.8614727258682251, + "num_tokens": 663503786.0, + "step": 17391 + }, + { + "epoch": 2.212441165246152, + "ewc_loss": 0.04713139310479164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3565697119920515e-05, + "grad_norm": 30.212610244750977, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.878891110420227, + "num_tokens": 663540579.0, + "step": 17392 + }, + { + "epoch": 2.2125683755247425, + "ewc_loss": 0.0473187081515789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3659353246330284e-05, + "grad_norm": 30.089401245117188, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.871665894985199, + "num_tokens": 663577284.0, + "step": 17393 + }, + { + "epoch": 2.212695585803333, + "ewc_loss": 0.047161784023046494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3580891138408333e-05, + "grad_norm": 30.230907440185547, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8676172494888306, + "num_tokens": 663613118.0, + "step": 17394 + }, + { + "epoch": 2.2128227960819236, + "ewc_loss": 0.047354135662317276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3677068384131417e-05, + "grad_norm": 30.145750045776367, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8755744695663452, + "num_tokens": 663651548.0, + "step": 17395 + }, + { + "epoch": 2.212950006360514, + "ewc_loss": 0.04719848930835724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.359924474149011e-05, + "grad_norm": 30.205869674682617, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8532695770263672, + "num_tokens": 663694381.0, + "step": 17396 + }, + { + "epoch": 2.2130772166391046, + "ewc_loss": 0.047281768172979355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.364088322792668e-05, + "grad_norm": 30.202791213989258, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.873079240322113, + "num_tokens": 663734686.0, + "step": 17397 + }, + { + "epoch": 2.213204426917695, + "ewc_loss": 0.04727024957537651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3635124307475053e-05, + "grad_norm": 30.17642593383789, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8621999621391296, + "num_tokens": 663768764.0, + "step": 17398 + }, + { + "epoch": 2.2133316371962852, + "ewc_loss": 0.04725223779678345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3626118490938097e-05, + "grad_norm": 30.38489532470703, + "learning_rate": 1e-06, + "loss": 0.3592, + 
"mean_token_accuracy": 0.8907660841941833, + "num_tokens": 663807650.0, + "step": 17399 + }, + { + "epoch": 2.213458847474876, + "ewc_loss": 0.0472477488219738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.362387385801412e-05, + "grad_norm": 30.14760398864746, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8686243295669556, + "num_tokens": 663849308.0, + "step": 17400 + }, + { + "epoch": 2.2135860577534663, + "ewc_loss": 0.047150690108537674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3575345039716922e-05, + "grad_norm": 30.270967483520508, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8572199940681458, + "num_tokens": 663888917.0, + "step": 17401 + }, + { + "epoch": 2.213713268032057, + "ewc_loss": 0.047307077795267105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365353975619655e-05, + "grad_norm": 30.281259536743164, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8719128966331482, + "num_tokens": 663921048.0, + "step": 17402 + }, + { + "epoch": 2.2138404783106473, + "ewc_loss": 0.04717974364757538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.358987148909364e-05, + "grad_norm": 30.22347640991211, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8759211897850037, + "num_tokens": 663963433.0, + "step": 17403 + }, + { + "epoch": 2.213967688589238, + "ewc_loss": 0.04720010608434677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3600052372785285e-05, + "grad_norm": 30.130081176757812, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8540083169937134, + "num_tokens": 664003294.0, + "step": 17404 + }, + { + "epoch": 2.2140948988678284, + "ewc_loss": 0.047174371778964996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3587186660734005e-05, + "grad_norm": 30.175989151000977, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8590972423553467, + "num_tokens": 664043161.0, + "step": 17405 + }, + { + "epoch": 
2.214222109146419, + "ewc_loss": 0.047267936170101166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3633967430214398e-05, + "grad_norm": 30.220190048217773, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8824291825294495, + "num_tokens": 664082457.0, + "step": 17406 + }, + { + "epoch": 2.2143493194250095, + "ewc_loss": 0.04720023274421692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.360011603741441e-05, + "grad_norm": 30.124956130981445, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8809469938278198, + "num_tokens": 664125999.0, + "step": 17407 + }, + { + "epoch": 2.2144765297036, + "ewc_loss": 0.0472368523478508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3618425984750502e-05, + "grad_norm": 30.195255279541016, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8762540817260742, + "num_tokens": 664167256.0, + "step": 17408 + }, + { + "epoch": 2.2146037399821905, + "ewc_loss": 0.04725736379623413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3628681447007693e-05, + "grad_norm": 30.274873733520508, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8676201105117798, + "num_tokens": 664201887.0, + "step": 17409 + }, + { + "epoch": 2.214730950260781, + "ewc_loss": 0.04715169221162796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3575845261802897e-05, + "grad_norm": 30.047027587890625, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8749393820762634, + "num_tokens": 664238986.0, + "step": 17410 + }, + { + "epoch": 2.2148581605393716, + "ewc_loss": 0.04723126441240311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3615632017026655e-05, + "grad_norm": 30.322179794311523, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.866165041923523, + "num_tokens": 664283436.0, + "step": 17411 + }, + { + "epoch": 2.214985370817962, + "ewc_loss": 0.047234244644641876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.361712176934816e-05, + "grad_norm": 30.23503303527832, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8649907112121582, + "num_tokens": 664325196.0, + "step": 17412 + }, + { + "epoch": 2.2151125810965526, + "ewc_loss": 0.04716464877128601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3582324502058327e-05, + "grad_norm": 30.219785690307617, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8621042370796204, + "num_tokens": 664365722.0, + "step": 17413 + }, + { + "epoch": 2.215239791375143, + "ewc_loss": 0.04726444184780121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3632221200386994e-05, + "grad_norm": 30.157989501953125, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.862017035484314, + "num_tokens": 664411570.0, + "step": 17414 + }, + { + "epoch": 2.2153670016537337, + "ewc_loss": 0.04716455563902855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3582277208333835e-05, + "grad_norm": 30.19270133972168, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8736874461174011, + "num_tokens": 664453545.0, + "step": 17415 + }, + { + "epoch": 2.215494211932324, + "ewc_loss": 0.04722702130675316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.361351107538212e-05, + "grad_norm": 30.157825469970703, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8714311718940735, + "num_tokens": 664489882.0, + "step": 17416 + }, + { + "epoch": 2.2156214222109147, + "ewc_loss": 0.04713961109519005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3569806216983125e-05, + "grad_norm": 30.13370132446289, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8689427375793457, + "num_tokens": 664528720.0, + "step": 17417 + }, + { + "epoch": 2.2157486324895053, + "ewc_loss": 0.04725697264075279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3628486815141514e-05, + "grad_norm": 30.24856948852539, + "learning_rate": 1e-06, + "loss": 0.4502, + 
"mean_token_accuracy": 0.8611547946929932, + "num_tokens": 664569645.0, + "step": 17418 + }, + { + "epoch": 2.215875842768096, + "ewc_loss": 0.04726140573620796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3630702344235033e-05, + "grad_norm": 30.282678604125977, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.865544855594635, + "num_tokens": 664612033.0, + "step": 17419 + }, + { + "epoch": 2.2160030530466863, + "ewc_loss": 0.04717874526977539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3589373085997067e-05, + "grad_norm": 30.171438217163086, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8789833784103394, + "num_tokens": 664651408.0, + "step": 17420 + }, + { + "epoch": 2.216130263325277, + "ewc_loss": 0.04721065238118172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3605325623066165e-05, + "grad_norm": 30.258947372436523, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8558595180511475, + "num_tokens": 664695764.0, + "step": 17421 + }, + { + "epoch": 2.2162574736038674, + "ewc_loss": 0.047144148498773575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3572074496769346e-05, + "grad_norm": 30.086626052856445, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8738462924957275, + "num_tokens": 664735528.0, + "step": 17422 + }, + { + "epoch": 2.216384683882458, + "ewc_loss": 0.0472102165222168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3605109163327143e-05, + "grad_norm": 30.351303100585938, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8629336357116699, + "num_tokens": 664775612.0, + "step": 17423 + }, + { + "epoch": 2.216511894161048, + "ewc_loss": 0.04724936932325363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3624685127288103e-05, + "grad_norm": 30.16281509399414, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8669481873512268, + "num_tokens": 664812485.0, + "step": 17424 + }, + { + "epoch": 
2.2166391044396385, + "ewc_loss": 0.04716304689645767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3581524146720767e-05, + "grad_norm": 30.156627655029297, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.855675458908081, + "num_tokens": 664853166.0, + "step": 17425 + }, + { + "epoch": 2.216766314718229, + "ewc_loss": 0.047187112271785736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3593556761625223e-05, + "grad_norm": 30.38393783569336, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8595151901245117, + "num_tokens": 664888778.0, + "step": 17426 + }, + { + "epoch": 2.2168935249968196, + "ewc_loss": 0.047261081635951996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3630540454178117e-05, + "grad_norm": 30.144559860229492, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8680508732795715, + "num_tokens": 664933783.0, + "step": 17427 + }, + { + "epoch": 2.21702073527541, + "ewc_loss": 0.04718705639243126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3593527657794766e-05, + "grad_norm": 30.37671661376953, + "learning_rate": 1e-06, + "loss": 0.493, + "mean_token_accuracy": 0.8541759252548218, + "num_tokens": 664965228.0, + "step": 17428 + }, + { + "epoch": 2.2171479455540006, + "ewc_loss": 0.04731222242116928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3656111807213165e-05, + "grad_norm": 30.189546585083008, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8652908205986023, + "num_tokens": 664996691.0, + "step": 17429 + }, + { + "epoch": 2.217275155832591, + "ewc_loss": 0.04718439653515816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3592197976540774e-05, + "grad_norm": 30.367219924926758, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8807153701782227, + "num_tokens": 665032006.0, + "step": 17430 + }, + { + "epoch": 2.2174023661111817, + "ewc_loss": 0.047256238758563995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3628119379281998e-05, + "grad_norm": 30.157325744628906, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8605282306671143, + "num_tokens": 665071819.0, + "step": 17431 + }, + { + "epoch": 2.217529576389772, + "ewc_loss": 0.04716550186276436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3582750145578757e-05, + "grad_norm": 30.28903579711914, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8588137626647949, + "num_tokens": 665118883.0, + "step": 17432 + }, + { + "epoch": 2.2176567866683627, + "ewc_loss": 0.04724591597914696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3622958906344138e-05, + "grad_norm": 30.352392196655273, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8517231345176697, + "num_tokens": 665154926.0, + "step": 17433 + }, + { + "epoch": 2.2177839969469533, + "ewc_loss": 0.04721006378531456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3605032765772194e-05, + "grad_norm": 30.181690216064453, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8787447214126587, + "num_tokens": 665189910.0, + "step": 17434 + }, + { + "epoch": 2.217911207225544, + "ewc_loss": 0.04719690978527069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3598455300088972e-05, + "grad_norm": 30.340309143066406, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8771328926086426, + "num_tokens": 665235849.0, + "step": 17435 + }, + { + "epoch": 2.2180384175041343, + "ewc_loss": 0.047218192368745804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3609096388099715e-05, + "grad_norm": 30.16170310974121, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8613242506980896, + "num_tokens": 665274231.0, + "step": 17436 + }, + { + "epoch": 2.218165627782725, + "ewc_loss": 0.0472484789788723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3624239474884234e-05, + "grad_norm": 30.34126853942871, + "learning_rate": 1e-06, + "loss": 0.3646, + 
"mean_token_accuracy": 0.8879256248474121, + "num_tokens": 665309057.0, + "step": 17437 + }, + { + "epoch": 2.2182928380613154, + "ewc_loss": 0.04723086953163147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3615435566171072e-05, + "grad_norm": 30.205631256103516, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8680920004844666, + "num_tokens": 665337031.0, + "step": 17438 + }, + { + "epoch": 2.218420048339906, + "ewc_loss": 0.04722601920366287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3613009034306742e-05, + "grad_norm": 30.43117332458496, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8688567876815796, + "num_tokens": 665376557.0, + "step": 17439 + }, + { + "epoch": 2.2185472586184964, + "ewc_loss": 0.047286804765462875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.364340252825059e-05, + "grad_norm": 30.247480392456055, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8686138391494751, + "num_tokens": 665419928.0, + "step": 17440 + }, + { + "epoch": 2.218674468897087, + "ewc_loss": 0.04711444675922394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3557224267278798e-05, + "grad_norm": 30.227754592895508, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8761522769927979, + "num_tokens": 665458827.0, + "step": 17441 + }, + { + "epoch": 2.2188016791756775, + "ewc_loss": 0.04720187932252884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3600940039614215e-05, + "grad_norm": 30.22406578063965, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8728160262107849, + "num_tokens": 665498623.0, + "step": 17442 + }, + { + "epoch": 2.218928889454268, + "ewc_loss": 0.04726618155837059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.363309067732189e-05, + "grad_norm": 30.440521240234375, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.864956259727478, + "num_tokens": 665536103.0, + "step": 17443 + }, + { + "epoch": 
2.2190560997328586, + "ewc_loss": 0.0472460500895977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3623024389962666e-05, + "grad_norm": 30.21845245361328, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.867933452129364, + "num_tokens": 665572089.0, + "step": 17444 + }, + { + "epoch": 2.219183310011449, + "ewc_loss": 0.04715805500745773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35790266742697e-05, + "grad_norm": 30.280765533447266, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8624055981636047, + "num_tokens": 665608088.0, + "step": 17445 + }, + { + "epoch": 2.2193105202900396, + "ewc_loss": 0.047271303832530975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.363565181440208e-05, + "grad_norm": 30.245363235473633, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8781147003173828, + "num_tokens": 665641883.0, + "step": 17446 + }, + { + "epoch": 2.21943773056863, + "ewc_loss": 0.0471663661301136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.35831830650568e-05, + "grad_norm": 30.144899368286133, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8744303584098816, + "num_tokens": 665682211.0, + "step": 17447 + }, + { + "epoch": 2.2195649408472207, + "ewc_loss": 0.047268155962228775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3634078388568014e-05, + "grad_norm": 30.212242126464844, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8750799894332886, + "num_tokens": 665726182.0, + "step": 17448 + }, + { + "epoch": 2.2196921511258108, + "ewc_loss": 0.04727467894554138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.363733983656857e-05, + "grad_norm": 30.276458740234375, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.874078094959259, + "num_tokens": 665760404.0, + "step": 17449 + }, + { + "epoch": 2.2198193614044013, + "ewc_loss": 0.047347553074359894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3673776013311e-05, + "grad_norm": 30.246570587158203, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8695377111434937, + "num_tokens": 665798235.0, + "step": 17450 + }, + { + "epoch": 2.219946571682992, + "ewc_loss": 0.04728619381785393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3643096938030794e-05, + "grad_norm": 30.172378540039062, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8716627359390259, + "num_tokens": 665840118.0, + "step": 17451 + }, + { + "epoch": 2.2200737819615823, + "ewc_loss": 0.04729780554771423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3648903152206913e-05, + "grad_norm": 30.226829528808594, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8694459795951843, + "num_tokens": 665878498.0, + "step": 17452 + }, + { + "epoch": 2.220200992240173, + "ewc_loss": 0.04738975316286087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.369487629039213e-05, + "grad_norm": 30.24013328552246, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8658443689346313, + "num_tokens": 665924179.0, + "step": 17453 + }, + { + "epoch": 2.2203282025187634, + "ewc_loss": 0.0472964271903038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.364821375522297e-05, + "grad_norm": 30.286117553710938, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8840434551239014, + "num_tokens": 665963160.0, + "step": 17454 + }, + { + "epoch": 2.220455412797354, + "ewc_loss": 0.04730233922600746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365116961300373e-05, + "grad_norm": 30.213600158691406, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8792091012001038, + "num_tokens": 665998854.0, + "step": 17455 + }, + { + "epoch": 2.2205826230759445, + "ewc_loss": 0.04736091569066048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3680457161390223e-05, + "grad_norm": 30.283039093017578, + "learning_rate": 1e-06, + "loss": 0.3964, + 
"mean_token_accuracy": 0.8783684968948364, + "num_tokens": 666036420.0, + "step": 17456 + }, + { + "epoch": 2.220709833354535, + "ewc_loss": 0.04735687002539635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367843444517348e-05, + "grad_norm": 30.262300491333008, + "learning_rate": 1e-06, + "loss": 0.491, + "mean_token_accuracy": 0.8501341938972473, + "num_tokens": 666076692.0, + "step": 17457 + }, + { + "epoch": 2.2208370436331255, + "ewc_loss": 0.04735704883933067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3678523575654253e-05, + "grad_norm": 30.281553268432617, + "learning_rate": 1e-06, + "loss": 0.4835, + "mean_token_accuracy": 0.856475830078125, + "num_tokens": 666118009.0, + "step": 17458 + }, + { + "epoch": 2.220964253911716, + "ewc_loss": 0.04736291617155075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3681457605562173e-05, + "grad_norm": 30.19776153564453, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8606704473495483, + "num_tokens": 666158132.0, + "step": 17459 + }, + { + "epoch": 2.2210914641903066, + "ewc_loss": 0.047401756048202515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370087713643443e-05, + "grad_norm": 30.388660430908203, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8649026155471802, + "num_tokens": 666187195.0, + "step": 17460 + }, + { + "epoch": 2.221218674468897, + "ewc_loss": 0.04735158383846283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3675791453570127e-05, + "grad_norm": 30.24683952331543, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8752366900444031, + "num_tokens": 666227790.0, + "step": 17461 + }, + { + "epoch": 2.2213458847474876, + "ewc_loss": 0.04725925624370575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3629627321497537e-05, + "grad_norm": 30.212244033813477, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8639453053474426, + "num_tokens": 666268517.0, + "step": 17462 + }, + { + "epoch": 
2.221473095026078, + "ewc_loss": 0.047430675476789474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3715338102192618e-05, + "grad_norm": 30.195472717285156, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8864768147468567, + "num_tokens": 666313727.0, + "step": 17463 + }, + { + "epoch": 2.2216003053046687, + "ewc_loss": 0.0473158024251461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3657901692786254e-05, + "grad_norm": 30.309127807617188, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8725666999816895, + "num_tokens": 666352713.0, + "step": 17464 + }, + { + "epoch": 2.221727515583259, + "ewc_loss": 0.047395188361406326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3697593860561028e-05, + "grad_norm": 30.171314239501953, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8777510523796082, + "num_tokens": 666395065.0, + "step": 17465 + }, + { + "epoch": 2.2218547258618497, + "ewc_loss": 0.04729906842112541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3649534341529943e-05, + "grad_norm": 30.28689193725586, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8716444969177246, + "num_tokens": 666428019.0, + "step": 17466 + }, + { + "epoch": 2.2219819361404403, + "ewc_loss": 0.04744265601038933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3721328034298494e-05, + "grad_norm": 30.276565551757812, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8678308725357056, + "num_tokens": 666466830.0, + "step": 17467 + }, + { + "epoch": 2.222109146419031, + "ewc_loss": 0.04728017374873161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3640086510567926e-05, + "grad_norm": 30.25371551513672, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8875023126602173, + "num_tokens": 666501028.0, + "step": 17468 + }, + { + "epoch": 2.2222363566976213, + "ewc_loss": 0.04734913259744644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3674565454712138e-05, + "grad_norm": 30.267709732055664, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8824430704116821, + "num_tokens": 666538554.0, + "step": 17469 + }, + { + "epoch": 2.222363566976212, + "ewc_loss": 0.04728515446186066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.364257670706138e-05, + "grad_norm": 30.209596633911133, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8696093559265137, + "num_tokens": 666577019.0, + "step": 17470 + }, + { + "epoch": 2.2224907772548024, + "ewc_loss": 0.04740043729543686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3700218662270345e-05, + "grad_norm": 30.367050170898438, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8848791718482971, + "num_tokens": 666616597.0, + "step": 17471 + }, + { + "epoch": 2.222617987533393, + "ewc_loss": 0.04738093540072441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3690467060077935e-05, + "grad_norm": 30.209726333618164, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8745476603507996, + "num_tokens": 666651523.0, + "step": 17472 + }, + { + "epoch": 2.2227451978119834, + "ewc_loss": 0.047397367656230927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3698683435213752e-05, + "grad_norm": 30.36156463623047, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.860190749168396, + "num_tokens": 666689270.0, + "step": 17473 + }, + { + "epoch": 2.2228724080905735, + "ewc_loss": 0.047380607575178146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3690303351031616e-05, + "grad_norm": 30.2392520904541, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8667083382606506, + "num_tokens": 666726836.0, + "step": 17474 + }, + { + "epoch": 2.222999618369164, + "ewc_loss": 0.04727799445390701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3638996935915202e-05, + "grad_norm": 30.258657455444336, + "learning_rate": 1e-06, + "loss": 0.429, + 
"mean_token_accuracy": 0.8677539825439453, + "num_tokens": 666761207.0, + "step": 17475 + }, + { + "epoch": 2.2231268286477546, + "ewc_loss": 0.04738737270236015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.369368667132221e-05, + "grad_norm": 30.242034912109375, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8788043260574341, + "num_tokens": 666800293.0, + "step": 17476 + }, + { + "epoch": 2.223254038926345, + "ewc_loss": 0.04738220199942589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3691101887379773e-05, + "grad_norm": 30.36107635498047, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8797169923782349, + "num_tokens": 666843046.0, + "step": 17477 + }, + { + "epoch": 2.2233812492049356, + "ewc_loss": 0.047390419989824295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.369520916545298e-05, + "grad_norm": 30.403057098388672, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8834701776504517, + "num_tokens": 666885840.0, + "step": 17478 + }, + { + "epoch": 2.223508459483526, + "ewc_loss": 0.047308020293712616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365401087445207e-05, + "grad_norm": 30.34269142150879, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8706531524658203, + "num_tokens": 666925654.0, + "step": 17479 + }, + { + "epoch": 2.2236356697621167, + "ewc_loss": 0.047251466661691666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3625732865184546e-05, + "grad_norm": 30.419422149658203, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8718904256820679, + "num_tokens": 666968098.0, + "step": 17480 + }, + { + "epoch": 2.223762880040707, + "ewc_loss": 0.04722967743873596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.361483893764671e-05, + "grad_norm": 30.299272537231445, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8815723657608032, + "num_tokens": 667010349.0, + "step": 17481 + }, + { + "epoch": 
2.2238900903192977, + "ewc_loss": 0.04724728316068649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3623641027370468e-05, + "grad_norm": 30.37537956237793, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8620827198028564, + "num_tokens": 667047086.0, + "step": 17482 + }, + { + "epoch": 2.2240173005978883, + "ewc_loss": 0.04729519039392471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3647595298825763e-05, + "grad_norm": 30.404542922973633, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.854088544845581, + "num_tokens": 667083955.0, + "step": 17483 + }, + { + "epoch": 2.224144510876479, + "ewc_loss": 0.04718360677361488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3591803255840205e-05, + "grad_norm": 30.17215347290039, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8769969344139099, + "num_tokens": 667119751.0, + "step": 17484 + }, + { + "epoch": 2.2242717211550693, + "ewc_loss": 0.04719274863600731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3596374376211315e-05, + "grad_norm": 30.2752742767334, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8665730953216553, + "num_tokens": 667150980.0, + "step": 17485 + }, + { + "epoch": 2.22439893143366, + "ewc_loss": 0.04723993316292763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3619966668775305e-05, + "grad_norm": 30.2027645111084, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8678473234176636, + "num_tokens": 667185198.0, + "step": 17486 + }, + { + "epoch": 2.2245261417122504, + "ewc_loss": 0.04730168357491493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3650842194911093e-05, + "grad_norm": 30.244205474853516, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8733499646186829, + "num_tokens": 667225286.0, + "step": 17487 + }, + { + "epoch": 2.224653351990841, + "ewc_loss": 0.047265976667404175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.363298881391529e-05, + "grad_norm": 30.26324462890625, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8654100894927979, + "num_tokens": 667266294.0, + "step": 17488 + }, + { + "epoch": 2.2247805622694314, + "ewc_loss": 0.047374580055475235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.368728928558994e-05, + "grad_norm": 30.335920333862305, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8682803511619568, + "num_tokens": 667302430.0, + "step": 17489 + }, + { + "epoch": 2.224907772548022, + "ewc_loss": 0.04730621725320816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365310865570791e-05, + "grad_norm": 30.23874282836914, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8653477430343628, + "num_tokens": 667342881.0, + "step": 17490 + }, + { + "epoch": 2.2250349828266125, + "ewc_loss": 0.047309260815382004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365463114983868e-05, + "grad_norm": 30.26386260986328, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8724164962768555, + "num_tokens": 667376122.0, + "step": 17491 + }, + { + "epoch": 2.225162193105203, + "ewc_loss": 0.04740089178085327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3700446035945788e-05, + "grad_norm": 30.249235153198242, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8637963533401489, + "num_tokens": 667415120.0, + "step": 17492 + }, + { + "epoch": 2.2252894033837936, + "ewc_loss": 0.04735979065299034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3679895093664527e-05, + "grad_norm": 30.32815170288086, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.869208574295044, + "num_tokens": 667451014.0, + "step": 17493 + }, + { + "epoch": 2.225416613662384, + "ewc_loss": 0.047428395599126816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3714197595836595e-05, + "grad_norm": 30.26582908630371, + "learning_rate": 1e-06, + "loss": 0.4386, + 
"mean_token_accuracy": 0.8615895509719849, + "num_tokens": 667485600.0, + "step": 17494 + }, + { + "epoch": 2.2255438239409746, + "ewc_loss": 0.04731890931725502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3659455109736882e-05, + "grad_norm": 30.326147079467773, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8659894466400146, + "num_tokens": 667527561.0, + "step": 17495 + }, + { + "epoch": 2.225671034219565, + "ewc_loss": 0.0473717637360096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3685881387791596e-05, + "grad_norm": 30.434368133544922, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8777278661727905, + "num_tokens": 667569677.0, + "step": 17496 + }, + { + "epoch": 2.225798244498155, + "ewc_loss": 0.04742317646741867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3711587346042506e-05, + "grad_norm": 30.43149757385254, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8700244426727295, + "num_tokens": 667605954.0, + "step": 17497 + }, + { + "epoch": 2.225925454776746, + "ewc_loss": 0.047234907746315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.361745464440901e-05, + "grad_norm": 30.2247314453125, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8738269209861755, + "num_tokens": 667645082.0, + "step": 17498 + }, + { + "epoch": 2.2260526650553363, + "ewc_loss": 0.04733802750706673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3669013899052516e-05, + "grad_norm": 30.432880401611328, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.866637110710144, + "num_tokens": 667681390.0, + "step": 17499 + }, + { + "epoch": 2.226179875333927, + "ewc_loss": 0.04732188209891319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.366094122407958e-05, + "grad_norm": 30.256092071533203, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8605380058288574, + "num_tokens": 667722772.0, + "step": 17500 + }, + { + "epoch": 
2.2263070856125173, + "ewc_loss": 0.04734671115875244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367335582675878e-05, + "grad_norm": 30.418027877807617, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8455295562744141, + "num_tokens": 667758647.0, + "step": 17501 + }, + { + "epoch": 2.226434295891108, + "ewc_loss": 0.04734572023153305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367285924265161e-05, + "grad_norm": 30.341678619384766, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8706538677215576, + "num_tokens": 667790625.0, + "step": 17502 + }, + { + "epoch": 2.2265615061696984, + "ewc_loss": 0.04735520854592323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3677604986005463e-05, + "grad_norm": 30.440595626831055, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.859532356262207, + "num_tokens": 667823724.0, + "step": 17503 + }, + { + "epoch": 2.226688716448289, + "ewc_loss": 0.04732399433851242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3661996237933636e-05, + "grad_norm": 30.330631256103516, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8837723731994629, + "num_tokens": 667862548.0, + "step": 17504 + }, + { + "epoch": 2.2268159267268794, + "ewc_loss": 0.04732159152626991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3660795704927295e-05, + "grad_norm": 30.256235122680664, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8633217811584473, + "num_tokens": 667905994.0, + "step": 17505 + }, + { + "epoch": 2.22694313700547, + "ewc_loss": 0.047244563698768616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.362228224228602e-05, + "grad_norm": 30.300235748291016, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8641281127929688, + "num_tokens": 667945087.0, + "step": 17506 + }, + { + "epoch": 2.2270703472840605, + "ewc_loss": 0.04734126850962639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3670634618611075e-05, + "grad_norm": 30.29551124572754, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8808415532112122, + "num_tokens": 667984736.0, + "step": 17507 + }, + { + "epoch": 2.227197557562651, + "ewc_loss": 0.04724819213151932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3624095774721354e-05, + "grad_norm": 30.27651023864746, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8648847341537476, + "num_tokens": 668024020.0, + "step": 17508 + }, + { + "epoch": 2.2273247678412416, + "ewc_loss": 0.04729580134153366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.364790088904556e-05, + "grad_norm": 30.205629348754883, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8656506538391113, + "num_tokens": 668059567.0, + "step": 17509 + }, + { + "epoch": 2.227451978119832, + "ewc_loss": 0.0473908931016922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3695447453064844e-05, + "grad_norm": 30.367557525634766, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8757004737854004, + "num_tokens": 668099901.0, + "step": 17510 + }, + { + "epoch": 2.2275791883984226, + "ewc_loss": 0.04733627289533615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3668137146160007e-05, + "grad_norm": 30.255935668945312, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8845208883285522, + "num_tokens": 668135974.0, + "step": 17511 + }, + { + "epoch": 2.227706398677013, + "ewc_loss": 0.047294359654188156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3647180569241755e-05, + "grad_norm": 30.23552131652832, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8730555772781372, + "num_tokens": 668179200.0, + "step": 17512 + }, + { + "epoch": 2.2278336089556037, + "ewc_loss": 0.04732145369052887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.366072658332996e-05, + "grad_norm": 30.289566040039062, + "learning_rate": 1e-06, + "loss": 0.4848, + 
"mean_token_accuracy": 0.8482074737548828, + "num_tokens": 668219683.0, + "step": 17513 + }, + { + "epoch": 2.227960819234194, + "ewc_loss": 0.04727374017238617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3636870537302457e-05, + "grad_norm": 30.268346786499023, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8722796440124512, + "num_tokens": 668253899.0, + "step": 17514 + }, + { + "epoch": 2.2280880295127847, + "ewc_loss": 0.04733440652489662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3667204004595987e-05, + "grad_norm": 30.24860382080078, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.862748920917511, + "num_tokens": 668294140.0, + "step": 17515 + }, + { + "epoch": 2.2282152397913753, + "ewc_loss": 0.04731369391083717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3656846678932197e-05, + "grad_norm": 30.287799835205078, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8769493103027344, + "num_tokens": 668328442.0, + "step": 17516 + }, + { + "epoch": 2.228342450069966, + "ewc_loss": 0.04734250530600548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367125307500828e-05, + "grad_norm": 30.341394424438477, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8733901977539062, + "num_tokens": 668367903.0, + "step": 17517 + }, + { + "epoch": 2.2284696603485563, + "ewc_loss": 0.04726479947566986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.363239946134854e-05, + "grad_norm": 30.23402976989746, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8823951482772827, + "num_tokens": 668409889.0, + "step": 17518 + }, + { + "epoch": 2.228596870627147, + "ewc_loss": 0.047230325639247894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.361516271776054e-05, + "grad_norm": 30.26009750366211, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8742971420288086, + "num_tokens": 668451061.0, + "step": 17519 + }, + { + "epoch": 
2.2287240809057374, + "ewc_loss": 0.0473005510866642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3650274670217186e-05, + "grad_norm": 30.249479293823242, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8479717969894409, + "num_tokens": 668493038.0, + "step": 17520 + }, + { + "epoch": 2.228851291184328, + "ewc_loss": 0.047389838844537735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3694919946137816e-05, + "grad_norm": 30.221412658691406, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8763447999954224, + "num_tokens": 668528087.0, + "step": 17521 + }, + { + "epoch": 2.228978501462918, + "ewc_loss": 0.047374479472637177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3687240172876045e-05, + "grad_norm": 30.288318634033203, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8712836503982544, + "num_tokens": 668562325.0, + "step": 17522 + }, + { + "epoch": 2.2291057117415085, + "ewc_loss": 0.047452326864004135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.372616290813312e-05, + "grad_norm": 30.288318634033203, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8722919225692749, + "num_tokens": 668601336.0, + "step": 17523 + }, + { + "epoch": 2.229232922020099, + "ewc_loss": 0.04735991731286049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367995875829365e-05, + "grad_norm": 30.159440994262695, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.889927864074707, + "num_tokens": 668637113.0, + "step": 17524 + }, + { + "epoch": 2.2293601322986896, + "ewc_loss": 0.047412168234586716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370608490309678e-05, + "grad_norm": 30.372451782226562, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8654646873474121, + "num_tokens": 668675645.0, + "step": 17525 + }, + { + "epoch": 2.22948734257728, + "ewc_loss": 0.047417111694812775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3708555090706795e-05, + "grad_norm": 30.246965408325195, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8716552257537842, + "num_tokens": 668706480.0, + "step": 17526 + }, + { + "epoch": 2.2296145528558706, + "ewc_loss": 0.047321151942014694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3660575607209466e-05, + "grad_norm": 30.348161697387695, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8646488785743713, + "num_tokens": 668743956.0, + "step": 17527 + }, + { + "epoch": 2.229741763134461, + "ewc_loss": 0.04745355620980263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3726777726551518e-05, + "grad_norm": 30.381729125976562, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8794615864753723, + "num_tokens": 668779134.0, + "step": 17528 + }, + { + "epoch": 2.2298689734130517, + "ewc_loss": 0.047423675656318665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3711838366580196e-05, + "grad_norm": 30.335107803344727, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8715530633926392, + "num_tokens": 668814978.0, + "step": 17529 + }, + { + "epoch": 2.229996183691642, + "ewc_loss": 0.04728720709681511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3643604436074384e-05, + "grad_norm": 30.25946807861328, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8546499013900757, + "num_tokens": 668854194.0, + "step": 17530 + }, + { + "epoch": 2.2301233939702327, + "ewc_loss": 0.04740072041749954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370036054344382e-05, + "grad_norm": 30.2976016998291, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8618441820144653, + "num_tokens": 668893095.0, + "step": 17531 + }, + { + "epoch": 2.2302506042488233, + "ewc_loss": 0.04743897542357445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3719487217022106e-05, + "grad_norm": 30.225688934326172, + "learning_rate": 1e-06, + "loss": 0.4075, + 
"mean_token_accuracy": 0.877579927444458, + "num_tokens": 668930626.0, + "step": 17532 + }, + { + "epoch": 2.230377814527414, + "ewc_loss": 0.04743127524852753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3715638235444203e-05, + "grad_norm": 30.256183624267578, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8777172565460205, + "num_tokens": 668960941.0, + "step": 17533 + }, + { + "epoch": 2.2305050248060043, + "ewc_loss": 0.04741939529776573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3709697416052222e-05, + "grad_norm": 30.256742477416992, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8725370764732361, + "num_tokens": 668997771.0, + "step": 17534 + }, + { + "epoch": 2.230632235084595, + "ewc_loss": 0.04746268317103386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3731341570965014e-05, + "grad_norm": 30.29323959350586, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8848268389701843, + "num_tokens": 669038714.0, + "step": 17535 + }, + { + "epoch": 2.2307594453631854, + "ewc_loss": 0.04748201370239258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3741007680655457e-05, + "grad_norm": 30.239089965820312, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8782849907875061, + "num_tokens": 669072336.0, + "step": 17536 + }, + { + "epoch": 2.230886655641776, + "ewc_loss": 0.04744542017579079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.372271046624519e-05, + "grad_norm": 30.35672378540039, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8803141117095947, + "num_tokens": 669110303.0, + "step": 17537 + }, + { + "epoch": 2.2310138659203664, + "ewc_loss": 0.047485850751399994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3742924895486794e-05, + "grad_norm": 30.225332260131836, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.860446572303772, + "num_tokens": 669146874.0, + "step": 17538 + }, + { + "epoch": 
2.231141076198957, + "ewc_loss": 0.047507964074611664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.375398253207095e-05, + "grad_norm": 30.329614639282227, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8824024200439453, + "num_tokens": 669181682.0, + "step": 17539 + }, + { + "epoch": 2.2312682864775475, + "ewc_loss": 0.04747534543275833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3737673473078758e-05, + "grad_norm": 30.26705551147461, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.870398998260498, + "num_tokens": 669227201.0, + "step": 17540 + }, + { + "epoch": 2.231395496756138, + "ewc_loss": 0.04740191996097565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370096080994699e-05, + "grad_norm": 30.271678924560547, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8714962601661682, + "num_tokens": 669265915.0, + "step": 17541 + }, + { + "epoch": 2.2315227070347285, + "ewc_loss": 0.04749837890267372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3749189494992606e-05, + "grad_norm": 30.392019271850586, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8745322823524475, + "num_tokens": 669304130.0, + "step": 17542 + }, + { + "epoch": 2.231649917313319, + "ewc_loss": 0.04736006632447243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3680033336859196e-05, + "grad_norm": 30.212608337402344, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8680838942527771, + "num_tokens": 669344225.0, + "step": 17543 + }, + { + "epoch": 2.2317771275919096, + "ewc_loss": 0.047391749918460846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3695874915574677e-05, + "grad_norm": 30.256208419799805, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.85294508934021, + "num_tokens": 669385126.0, + "step": 17544 + }, + { + "epoch": 2.2319043378705, + "ewc_loss": 0.04738212749361992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.36910636886023e-05, + "grad_norm": 30.241270065307617, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8594934940338135, + "num_tokens": 669425774.0, + "step": 17545 + }, + { + "epoch": 2.2320315481490907, + "ewc_loss": 0.0473504364490509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367521847190801e-05, + "grad_norm": 30.343055725097656, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8615360856056213, + "num_tokens": 669459789.0, + "step": 17546 + }, + { + "epoch": 2.2321587584276807, + "ewc_loss": 0.047371987253427505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3685994165134616e-05, + "grad_norm": 30.242177963256836, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8862786293029785, + "num_tokens": 669498709.0, + "step": 17547 + }, + { + "epoch": 2.2322859687062713, + "ewc_loss": 0.04733399674296379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3666998458793387e-05, + "grad_norm": 30.307819366455078, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.875057578086853, + "num_tokens": 669536021.0, + "step": 17548 + }, + { + "epoch": 2.232413178984862, + "ewc_loss": 0.04743800684809685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3719003365840763e-05, + "grad_norm": 30.398488998413086, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8592385053634644, + "num_tokens": 669574316.0, + "step": 17549 + }, + { + "epoch": 2.2325403892634523, + "ewc_loss": 0.04740244522690773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3701222744421102e-05, + "grad_norm": 30.39488410949707, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8609018325805664, + "num_tokens": 669611826.0, + "step": 17550 + }, + { + "epoch": 2.232667599542043, + "ewc_loss": 0.04738645255565643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3693226467003115e-05, + "grad_norm": 30.273561477661133, + "learning_rate": 1e-06, + "loss": 0.4247, + 
"mean_token_accuracy": 0.8671880960464478, + "num_tokens": 669655277.0, + "step": 17551 + }, + { + "epoch": 2.2327948098206334, + "ewc_loss": 0.04731011390686035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3655056793359108e-05, + "grad_norm": 30.320810317993164, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.8651245832443237, + "num_tokens": 669693481.0, + "step": 17552 + }, + { + "epoch": 2.232922020099224, + "ewc_loss": 0.047343235462903976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367161687288899e-05, + "grad_norm": 30.280534744262695, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8704321980476379, + "num_tokens": 669724104.0, + "step": 17553 + }, + { + "epoch": 2.2330492303778144, + "ewc_loss": 0.04737386479973793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3686932763666846e-05, + "grad_norm": 30.271530151367188, + "learning_rate": 1e-06, + "loss": 0.4732, + "mean_token_accuracy": 0.8533533811569214, + "num_tokens": 669768725.0, + "step": 17554 + }, + { + "epoch": 2.233176440656405, + "ewc_loss": 0.04731175675988197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365587897656951e-05, + "grad_norm": 30.210996627807617, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8789598941802979, + "num_tokens": 669809508.0, + "step": 17555 + }, + { + "epoch": 2.2333036509349955, + "ewc_loss": 0.047343909740448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367195520491805e-05, + "grad_norm": 30.34507942199707, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8716110587120056, + "num_tokens": 669848047.0, + "step": 17556 + }, + { + "epoch": 2.233430861213586, + "ewc_loss": 0.047405701130628586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3702850739937276e-05, + "grad_norm": 30.330442428588867, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8861269950866699, + "num_tokens": 669881274.0, + "step": 17557 + }, + { + "epoch": 
2.2335580714921766, + "ewc_loss": 0.047340281307697296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367013985349331e-05, + "grad_norm": 30.203411102294922, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8638415932655334, + "num_tokens": 669918517.0, + "step": 17558 + }, + { + "epoch": 2.233685281770767, + "ewc_loss": 0.04734119400382042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.36705964198336e-05, + "grad_norm": 30.33936309814453, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8627115488052368, + "num_tokens": 669960990.0, + "step": 17559 + }, + { + "epoch": 2.2338124920493576, + "ewc_loss": 0.04745174199342728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3725871869828552e-05, + "grad_norm": 30.29999542236328, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8748452067375183, + "num_tokens": 669995542.0, + "step": 17560 + }, + { + "epoch": 2.233939702327948, + "ewc_loss": 0.04735351726412773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3676759155932814e-05, + "grad_norm": 30.298873901367188, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8715686798095703, + "num_tokens": 670036798.0, + "step": 17561 + }, + { + "epoch": 2.2340669126065387, + "ewc_loss": 0.0474189817905426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3709490051260218e-05, + "grad_norm": 30.393611907958984, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8687036633491516, + "num_tokens": 670071763.0, + "step": 17562 + }, + { + "epoch": 2.234194122885129, + "ewc_loss": 0.04736623167991638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3683116523898207e-05, + "grad_norm": 30.332551956176758, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.869583785533905, + "num_tokens": 670100864.0, + "step": 17563 + }, + { + "epoch": 2.2343213331637197, + "ewc_loss": 0.04731559008359909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3657794372411445e-05, + "grad_norm": 30.398916244506836, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8587832450866699, + "num_tokens": 670144114.0, + "step": 17564 + }, + { + "epoch": 2.2344485434423103, + "ewc_loss": 0.04739899933338165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3699500161455944e-05, + "grad_norm": 30.33854103088379, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8747323155403137, + "num_tokens": 670182719.0, + "step": 17565 + }, + { + "epoch": 2.234575753720901, + "ewc_loss": 0.04728325083851814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3641625375603326e-05, + "grad_norm": 30.257003784179688, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8747605085372925, + "num_tokens": 670218059.0, + "step": 17566 + }, + { + "epoch": 2.2347029639994913, + "ewc_loss": 0.04733535274863243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.366767694184091e-05, + "grad_norm": 30.292850494384766, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8655383586883545, + "num_tokens": 670259589.0, + "step": 17567 + }, + { + "epoch": 2.234830174278082, + "ewc_loss": 0.047335971146821976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3667986170039512e-05, + "grad_norm": 30.283649444580078, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8607516288757324, + "num_tokens": 670301755.0, + "step": 17568 + }, + { + "epoch": 2.2349573845566724, + "ewc_loss": 0.04734176769852638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367088382015936e-05, + "grad_norm": 30.267383575439453, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8738654851913452, + "num_tokens": 670339205.0, + "step": 17569 + }, + { + "epoch": 2.235084594835263, + "ewc_loss": 0.04738796129822731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3693981347605586e-05, + "grad_norm": 30.255901336669922, + "learning_rate": 1e-06, + "loss": 0.4141, + 
"mean_token_accuracy": 0.8726326823234558, + "num_tokens": 670382961.0, + "step": 17570 + }, + { + "epoch": 2.2352118051138534, + "ewc_loss": 0.047376278787851334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3688138753641397e-05, + "grad_norm": 30.350645065307617, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8606318235397339, + "num_tokens": 670420747.0, + "step": 17571 + }, + { + "epoch": 2.2353390153924435, + "ewc_loss": 0.04731717333197594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.365858745179139e-05, + "grad_norm": 30.176734924316406, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8871543407440186, + "num_tokens": 670457906.0, + "step": 17572 + }, + { + "epoch": 2.235466225671034, + "ewc_loss": 0.04726683348417282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3633416276425123e-05, + "grad_norm": 30.275915145874023, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8591448068618774, + "num_tokens": 670493187.0, + "step": 17573 + }, + { + "epoch": 2.2355934359496246, + "ewc_loss": 0.04747110977768898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3735554350423627e-05, + "grad_norm": 30.308385848999023, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8832963705062866, + "num_tokens": 670529750.0, + "step": 17574 + }, + { + "epoch": 2.235720646228215, + "ewc_loss": 0.047329314053058624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3664657419431023e-05, + "grad_norm": 30.189815521240234, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8685399293899536, + "num_tokens": 670570147.0, + "step": 17575 + }, + { + "epoch": 2.2358478565068056, + "ewc_loss": 0.047407735139131546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3703867555013858e-05, + "grad_norm": 30.452011108398438, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8719406127929688, + "num_tokens": 670604113.0, + "step": 17576 + }, + { + "epoch": 
2.235975066785396, + "ewc_loss": 0.04744380712509155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3721902834950015e-05, + "grad_norm": 30.28184700012207, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.870112419128418, + "num_tokens": 670645451.0, + "step": 17577 + }, + { + "epoch": 2.2361022770639867, + "ewc_loss": 0.04734855890274048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367427987337578e-05, + "grad_norm": 30.353900909423828, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8749361038208008, + "num_tokens": 670684055.0, + "step": 17578 + }, + { + "epoch": 2.236229487342577, + "ewc_loss": 0.047496095299720764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374804716964718e-05, + "grad_norm": 30.386751174926758, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8624254465103149, + "num_tokens": 670724397.0, + "step": 17579 + }, + { + "epoch": 2.2363566976211677, + "ewc_loss": 0.047296810895204544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.364840474911034e-05, + "grad_norm": 30.20631980895996, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8805661797523499, + "num_tokens": 670761282.0, + "step": 17580 + }, + { + "epoch": 2.2364839078997583, + "ewc_loss": 0.04737628996372223, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3688144210609607e-05, + "grad_norm": 30.371150970458984, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8747189044952393, + "num_tokens": 670800007.0, + "step": 17581 + }, + { + "epoch": 2.236611118178349, + "ewc_loss": 0.04740442708134651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3702214093646035e-05, + "grad_norm": 30.312116622924805, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8672386407852173, + "num_tokens": 670840040.0, + "step": 17582 + }, + { + "epoch": 2.2367383284569393, + "ewc_loss": 0.04735247418284416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3676237105973996e-05, + "grad_norm": 30.293031692504883, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8638610243797302, + "num_tokens": 670881193.0, + "step": 17583 + }, + { + "epoch": 2.23686553873553, + "ewc_loss": 0.047322262078523636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3661130398977548e-05, + "grad_norm": 30.303699493408203, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8735013008117676, + "num_tokens": 670927275.0, + "step": 17584 + }, + { + "epoch": 2.2369927490141204, + "ewc_loss": 0.04740481078624725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3702405087533407e-05, + "grad_norm": 30.26064682006836, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8746303915977478, + "num_tokens": 670967327.0, + "step": 17585 + }, + { + "epoch": 2.237119959292711, + "ewc_loss": 0.04747840017080307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3739199605188332e-05, + "grad_norm": 30.436418533325195, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8740720152854919, + "num_tokens": 671009250.0, + "step": 17586 + }, + { + "epoch": 2.2372471695713014, + "ewc_loss": 0.04740547016263008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370273432461545e-05, + "grad_norm": 30.264562606811523, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8735599517822266, + "num_tokens": 671045068.0, + "step": 17587 + }, + { + "epoch": 2.237374379849892, + "ewc_loss": 0.04740360751748085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3701803002040833e-05, + "grad_norm": 30.394851684570312, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.857212245464325, + "num_tokens": 671084786.0, + "step": 17588 + }, + { + "epoch": 2.2375015901284825, + "ewc_loss": 0.0474407859146595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3720393073745072e-05, + "grad_norm": 30.30518341064453, + "learning_rate": 1e-06, + "loss": 0.4515, + 
"mean_token_accuracy": 0.8632012605667114, + "num_tokens": 671123235.0, + "step": 17589 + }, + { + "epoch": 2.237628800407073, + "ewc_loss": 0.04734978824853897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367489469179418e-05, + "grad_norm": 30.314096450805664, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.885759711265564, + "num_tokens": 671161702.0, + "step": 17590 + }, + { + "epoch": 2.2377560106856635, + "ewc_loss": 0.04740583896636963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370291986153461e-05, + "grad_norm": 30.373844146728516, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.881832480430603, + "num_tokens": 671195487.0, + "step": 17591 + }, + { + "epoch": 2.237883220964254, + "ewc_loss": 0.04736584052443504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3682920073042624e-05, + "grad_norm": 30.31861114501953, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8687140345573425, + "num_tokens": 671232288.0, + "step": 17592 + }, + { + "epoch": 2.2380104312428446, + "ewc_loss": 0.04745505750179291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3727528969175182e-05, + "grad_norm": 30.423728942871094, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8535621166229248, + "num_tokens": 671275154.0, + "step": 17593 + }, + { + "epoch": 2.238137641521435, + "ewc_loss": 0.047348231077194214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3674116164329462e-05, + "grad_norm": 30.236114501953125, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8603696227073669, + "num_tokens": 671312266.0, + "step": 17594 + }, + { + "epoch": 2.238264851800025, + "ewc_loss": 0.04735657945275307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3678288926021196e-05, + "grad_norm": 30.325218200683594, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8696008920669556, + "num_tokens": 671349690.0, + "step": 17595 + }, + { + "epoch": 
2.238392062078616, + "ewc_loss": 0.047421663999557495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3710832465440035e-05, + "grad_norm": 30.37261199951172, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8736622929573059, + "num_tokens": 671387898.0, + "step": 17596 + }, + { + "epoch": 2.2385192723572063, + "ewc_loss": 0.04737667739391327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3688338842475787e-05, + "grad_norm": 30.29930877685547, + "learning_rate": 1e-06, + "loss": 0.4891, + "mean_token_accuracy": 0.8497438430786133, + "num_tokens": 671428054.0, + "step": 17597 + }, + { + "epoch": 2.238646482635797, + "ewc_loss": 0.04742612689733505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3713064365438186e-05, + "grad_norm": 30.374942779541016, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8767423629760742, + "num_tokens": 671467096.0, + "step": 17598 + }, + { + "epoch": 2.2387736929143873, + "ewc_loss": 0.047451410442590714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3725704522803426e-05, + "grad_norm": 30.389362335205078, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.869755208492279, + "num_tokens": 671506752.0, + "step": 17599 + }, + { + "epoch": 2.238900903192978, + "ewc_loss": 0.047448232769966125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.372411654505413e-05, + "grad_norm": 30.4168758392334, + "learning_rate": 1e-06, + "loss": 0.4568, + "mean_token_accuracy": 0.8621890544891357, + "num_tokens": 671542403.0, + "step": 17600 + }, + { + "epoch": 2.2390281134715684, + "ewc_loss": 0.047380898147821426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.36904488701839e-05, + "grad_norm": 30.34438705444336, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8811007142066956, + "num_tokens": 671578115.0, + "step": 17601 + }, + { + "epoch": 2.239155323750159, + "ewc_loss": 0.047313276678323746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.365663749515079e-05, + "grad_norm": 30.178977966308594, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8809497952461243, + "num_tokens": 671619854.0, + "step": 17602 + }, + { + "epoch": 2.2392825340287494, + "ewc_loss": 0.04744746536016464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3723732738289982e-05, + "grad_norm": 30.42760467529297, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8842179179191589, + "num_tokens": 671661617.0, + "step": 17603 + }, + { + "epoch": 2.23940974430734, + "ewc_loss": 0.047416117042303085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3708058506599627e-05, + "grad_norm": 30.270233154296875, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8595892786979675, + "num_tokens": 671702231.0, + "step": 17604 + }, + { + "epoch": 2.2395369545859305, + "ewc_loss": 0.047403331845998764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3701666577835567e-05, + "grad_norm": 30.417837142944336, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8700957298278809, + "num_tokens": 671741483.0, + "step": 17605 + }, + { + "epoch": 2.239664164864521, + "ewc_loss": 0.04745681211352348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.372840572206769e-05, + "grad_norm": 30.312454223632812, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8653112649917603, + "num_tokens": 671780168.0, + "step": 17606 + }, + { + "epoch": 2.2397913751431116, + "ewc_loss": 0.0473148375749588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3657419660594314e-05, + "grad_norm": 30.22812843322754, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.8476163148880005, + "num_tokens": 671818980.0, + "step": 17607 + }, + { + "epoch": 2.239918585421702, + "ewc_loss": 0.047430090606212616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3715045244898647e-05, + "grad_norm": 30.38991355895996, + "learning_rate": 1e-06, + "loss": 0.4285, + 
"mean_token_accuracy": 0.8714891672134399, + "num_tokens": 671850010.0, + "step": 17608 + }, + { + "epoch": 2.2400457957002926, + "ewc_loss": 0.04748564586043358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3742823032080196e-05, + "grad_norm": 30.319536209106445, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8630782961845398, + "num_tokens": 671888506.0, + "step": 17609 + }, + { + "epoch": 2.240173005978883, + "ewc_loss": 0.04740828648209572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3704144041403197e-05, + "grad_norm": 30.418668746948242, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8790030479431152, + "num_tokens": 671930965.0, + "step": 17610 + }, + { + "epoch": 2.2403002162574737, + "ewc_loss": 0.04740869998931885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3704349587205797e-05, + "grad_norm": 30.261045455932617, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8795558214187622, + "num_tokens": 671967472.0, + "step": 17611 + }, + { + "epoch": 2.240427426536064, + "ewc_loss": 0.04736575111746788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.368287641729694e-05, + "grad_norm": 30.45698356628418, + "learning_rate": 1e-06, + "loss": 0.348, + "mean_token_accuracy": 0.8912902474403381, + "num_tokens": 672005986.0, + "step": 17612 + }, + { + "epoch": 2.2405546368146547, + "ewc_loss": 0.047408681362867355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370434049225878e-05, + "grad_norm": 30.312829971313477, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8696470856666565, + "num_tokens": 672044729.0, + "step": 17613 + }, + { + "epoch": 2.2406818470932452, + "ewc_loss": 0.04732535779476166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3662678358959965e-05, + "grad_norm": 30.369976043701172, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8706508874893188, + "num_tokens": 672080141.0, + "step": 17614 + }, + { + "epoch": 
2.2408090573718358, + "ewc_loss": 0.04741058126091957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370529000472743e-05, + "grad_norm": 30.4221248626709, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8779895305633545, + "num_tokens": 672119277.0, + "step": 17615 + }, + { + "epoch": 2.2409362676504263, + "ewc_loss": 0.04726261645555496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3631308067706414e-05, + "grad_norm": 30.148120880126953, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8688260316848755, + "num_tokens": 672154205.0, + "step": 17616 + }, + { + "epoch": 2.241063477929017, + "ewc_loss": 0.047375015914440155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3687507564318366e-05, + "grad_norm": 30.40927505493164, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8704518675804138, + "num_tokens": 672193667.0, + "step": 17617 + }, + { + "epoch": 2.2411906882076074, + "ewc_loss": 0.0474654883146286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3732744011795148e-05, + "grad_norm": 30.37496566772461, + "learning_rate": 1e-06, + "loss": 0.5209, + "mean_token_accuracy": 0.8402625322341919, + "num_tokens": 672232360.0, + "step": 17618 + }, + { + "epoch": 2.241317898486198, + "ewc_loss": 0.04738687723875046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.369343928876333e-05, + "grad_norm": 30.268207550048828, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8569488525390625, + "num_tokens": 672264127.0, + "step": 17619 + }, + { + "epoch": 2.241445108764788, + "ewc_loss": 0.04744140803813934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3720704120933078e-05, + "grad_norm": 30.314022064208984, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8775918483734131, + "num_tokens": 672300719.0, + "step": 17620 + }, + { + "epoch": 2.2415723190433785, + "ewc_loss": 0.0473649688065052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3682483515585773e-05, + "grad_norm": 30.268909454345703, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8805025815963745, + "num_tokens": 672342796.0, + "step": 17621 + }, + { + "epoch": 2.241699529321969, + "ewc_loss": 0.047489091753959656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3744545615045354e-05, + "grad_norm": 30.310354232788086, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8721445798873901, + "num_tokens": 672382597.0, + "step": 17622 + }, + { + "epoch": 2.2418267396005596, + "ewc_loss": 0.04743591323494911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3717957446933724e-05, + "grad_norm": 30.282487869262695, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8586822748184204, + "num_tokens": 672427350.0, + "step": 17623 + }, + { + "epoch": 2.24195394987915, + "ewc_loss": 0.04751700535416603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.375850272073876e-05, + "grad_norm": 30.385499954223633, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8635680675506592, + "num_tokens": 672468997.0, + "step": 17624 + }, + { + "epoch": 2.2420811601577406, + "ewc_loss": 0.04745502024888992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3727510779281147e-05, + "grad_norm": 30.17963981628418, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8768714666366577, + "num_tokens": 672505061.0, + "step": 17625 + }, + { + "epoch": 2.242208370436331, + "ewc_loss": 0.047419849783182144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3709924789727665e-05, + "grad_norm": 30.269189834594727, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8695337772369385, + "num_tokens": 672544241.0, + "step": 17626 + }, + { + "epoch": 2.2423355807149217, + "ewc_loss": 0.047516901046037674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3758449970046058e-05, + "grad_norm": 30.12482261657715, + "learning_rate": 1e-06, + "loss": 0.4201, + 
"mean_token_accuracy": 0.8694120645523071, + "num_tokens": 672586334.0, + "step": 17627 + }, + { + "epoch": 2.242462790993512, + "ewc_loss": 0.047451335936784744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3725668143015355e-05, + "grad_norm": 30.299623489379883, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8659319877624512, + "num_tokens": 672628627.0, + "step": 17628 + }, + { + "epoch": 2.2425900012721027, + "ewc_loss": 0.047569990158081055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3784994482412003e-05, + "grad_norm": 30.189645767211914, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8612632155418396, + "num_tokens": 672663046.0, + "step": 17629 + }, + { + "epoch": 2.2427172115506933, + "ewc_loss": 0.04753128066658974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3765640435158275e-05, + "grad_norm": 30.341121673583984, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8762478828430176, + "num_tokens": 672703394.0, + "step": 17630 + }, + { + "epoch": 2.242844421829284, + "ewc_loss": 0.04758941009640694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379470424784813e-05, + "grad_norm": 30.15488052368164, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8611348867416382, + "num_tokens": 672743262.0, + "step": 17631 + }, + { + "epoch": 2.2429716321078743, + "ewc_loss": 0.04750034958124161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3750175387249328e-05, + "grad_norm": 30.357412338256836, + "learning_rate": 1e-06, + "loss": 0.5214, + "mean_token_accuracy": 0.8364566564559937, + "num_tokens": 672782662.0, + "step": 17632 + }, + { + "epoch": 2.243098842386465, + "ewc_loss": 0.04768740385770798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3843702365411445e-05, + "grad_norm": 30.21523666381836, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.867936372756958, + "num_tokens": 672818400.0, + "step": 17633 + }, + { + "epoch": 
2.2432260526650554, + "ewc_loss": 0.047449398785829544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3724698621663265e-05, + "grad_norm": 30.280736923217773, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8820918202400208, + "num_tokens": 672858843.0, + "step": 17634 + }, + { + "epoch": 2.243353262943646, + "ewc_loss": 0.04758153483271599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3790767954778858e-05, + "grad_norm": 30.264253616333008, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8720694780349731, + "num_tokens": 672900397.0, + "step": 17635 + }, + { + "epoch": 2.2434804732222364, + "ewc_loss": 0.04742884263396263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.371442133153323e-05, + "grad_norm": 30.22084617614746, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8678283095359802, + "num_tokens": 672936025.0, + "step": 17636 + }, + { + "epoch": 2.243607683500827, + "ewc_loss": 0.04754728078842163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377364035055507e-05, + "grad_norm": 30.299911499023438, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8521418571472168, + "num_tokens": 672979180.0, + "step": 17637 + }, + { + "epoch": 2.2437348937794175, + "ewc_loss": 0.047435540705919266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.371777009102516e-05, + "grad_norm": 30.130563735961914, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8806967735290527, + "num_tokens": 673018053.0, + "step": 17638 + }, + { + "epoch": 2.243862104058008, + "ewc_loss": 0.047560110688209534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3780055926181376e-05, + "grad_norm": 30.360458374023438, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8500618934631348, + "num_tokens": 673057596.0, + "step": 17639 + }, + { + "epoch": 2.2439893143365985, + "ewc_loss": 0.047608550637960434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.380427577008959e-05, + "grad_norm": 30.177635192871094, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8816896677017212, + "num_tokens": 673097740.0, + "step": 17640 + }, + { + "epoch": 2.244116524615189, + "ewc_loss": 0.04747316986322403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3736585717415437e-05, + "grad_norm": 30.244338989257812, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8757994174957275, + "num_tokens": 673141821.0, + "step": 17641 + }, + { + "epoch": 2.2442437348937796, + "ewc_loss": 0.04757213965058327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37860695051495e-05, + "grad_norm": 30.26688575744629, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8779339790344238, + "num_tokens": 673184279.0, + "step": 17642 + }, + { + "epoch": 2.24437094517237, + "ewc_loss": 0.04747467488050461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37373369600391e-05, + "grad_norm": 30.302642822265625, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8799951076507568, + "num_tokens": 673224328.0, + "step": 17643 + }, + { + "epoch": 2.2444981554509607, + "ewc_loss": 0.047474756836891174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3737378796795383e-05, + "grad_norm": 30.188671112060547, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8570719957351685, + "num_tokens": 673263328.0, + "step": 17644 + }, + { + "epoch": 2.2446253657295507, + "ewc_loss": 0.04751953110098839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3759765099384822e-05, + "grad_norm": 30.41860580444336, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8746932744979858, + "num_tokens": 673301663.0, + "step": 17645 + }, + { + "epoch": 2.2447525760081413, + "ewc_loss": 0.047494519501924515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3747259547235444e-05, + "grad_norm": 30.153484344482422, + "learning_rate": 1e-06, + "loss": 0.4338, + 
"mean_token_accuracy": 0.8674920797348022, + "num_tokens": 673340606.0, + "step": 17646 + }, + { + "epoch": 2.244879786286732, + "ewc_loss": 0.047443192452192307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3721595425740816e-05, + "grad_norm": 30.291059494018555, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8665974736213684, + "num_tokens": 673382201.0, + "step": 17647 + }, + { + "epoch": 2.2450069965653223, + "ewc_loss": 0.047537337988615036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376866905251518e-05, + "grad_norm": 30.264251708984375, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8793418407440186, + "num_tokens": 673423330.0, + "step": 17648 + }, + { + "epoch": 2.245134206843913, + "ewc_loss": 0.04745262861251831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3726313884253614e-05, + "grad_norm": 30.26622200012207, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8867238759994507, + "num_tokens": 673459690.0, + "step": 17649 + }, + { + "epoch": 2.2452614171225034, + "ewc_loss": 0.047507837414741516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3753918867441826e-05, + "grad_norm": 30.267786026000977, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8654674291610718, + "num_tokens": 673494827.0, + "step": 17650 + }, + { + "epoch": 2.245388627401094, + "ewc_loss": 0.04748784750699997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374392352066934e-05, + "grad_norm": 30.331371307373047, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8588160276412964, + "num_tokens": 673540568.0, + "step": 17651 + }, + { + "epoch": 2.2455158376796844, + "ewc_loss": 0.04741315916180611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3706579668214545e-05, + "grad_norm": 30.254188537597656, + "learning_rate": 1e-06, + "loss": 0.375, + "mean_token_accuracy": 0.8861016631126404, + "num_tokens": 673574022.0, + "step": 17652 + }, + { + "epoch": 
2.245643047958275, + "ewc_loss": 0.04753103479743004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3765516743878834e-05, + "grad_norm": 30.367765426635742, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8669196963310242, + "num_tokens": 673612493.0, + "step": 17653 + }, + { + "epoch": 2.2457702582368655, + "ewc_loss": 0.04744907468557358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.372453673160635e-05, + "grad_norm": 30.18158721923828, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8663527369499207, + "num_tokens": 673653729.0, + "step": 17654 + }, + { + "epoch": 2.245897468515456, + "ewc_loss": 0.047482069581747055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374103496549651e-05, + "grad_norm": 30.399250030517578, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8831503391265869, + "num_tokens": 673695626.0, + "step": 17655 + }, + { + "epoch": 2.2460246787940465, + "ewc_loss": 0.04756416752934456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.378208409936633e-05, + "grad_norm": 30.279010772705078, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.876874566078186, + "num_tokens": 673732188.0, + "step": 17656 + }, + { + "epoch": 2.246151889072637, + "ewc_loss": 0.047450561076402664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37252806982724e-05, + "grad_norm": 30.486473083496094, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8614828586578369, + "num_tokens": 673770259.0, + "step": 17657 + }, + { + "epoch": 2.2462790993512276, + "ewc_loss": 0.04752695560455322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376347765675746e-05, + "grad_norm": 30.182926177978516, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.866891622543335, + "num_tokens": 673805018.0, + "step": 17658 + }, + { + "epoch": 2.246406309629818, + "ewc_loss": 0.04746004566550255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3730022803647444e-05, + "grad_norm": 30.490076065063477, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8629742860794067, + "num_tokens": 673846058.0, + "step": 17659 + }, + { + "epoch": 2.2465335199084087, + "ewc_loss": 0.04754508286714554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3772541680955328e-05, + "grad_norm": 30.200124740600586, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8570337295532227, + "num_tokens": 673886437.0, + "step": 17660 + }, + { + "epoch": 2.246660730186999, + "ewc_loss": 0.047454848885536194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.372742528677918e-05, + "grad_norm": 30.374977111816406, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.874411404132843, + "num_tokens": 673925701.0, + "step": 17661 + }, + { + "epoch": 2.2467879404655897, + "ewc_loss": 0.04758371412754059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379185752943158e-05, + "grad_norm": 30.293107986450195, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8723130226135254, + "num_tokens": 673969854.0, + "step": 17662 + }, + { + "epoch": 2.2469151507441802, + "ewc_loss": 0.04737025499343872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.368512832617853e-05, + "grad_norm": 30.301382064819336, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.871860921382904, + "num_tokens": 674005048.0, + "step": 17663 + }, + { + "epoch": 2.2470423610227708, + "ewc_loss": 0.04749618098139763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3748090825392865e-05, + "grad_norm": 30.32039451599121, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8657294511795044, + "num_tokens": 674046548.0, + "step": 17664 + }, + { + "epoch": 2.2471695713013613, + "ewc_loss": 0.04750632494688034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.375316216784995e-05, + "grad_norm": 30.317363739013672, + "learning_rate": 1e-06, + "loss": 0.4331, + 
"mean_token_accuracy": 0.8647506237030029, + "num_tokens": 674086905.0, + "step": 17665 + }, + { + "epoch": 2.247296781579952, + "ewc_loss": 0.04746381565928459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3731907276669517e-05, + "grad_norm": 30.37122344970703, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8803519010543823, + "num_tokens": 674124589.0, + "step": 17666 + }, + { + "epoch": 2.2474239918585424, + "ewc_loss": 0.04754103347659111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377051714574918e-05, + "grad_norm": 30.386388778686523, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8822238445281982, + "num_tokens": 674165135.0, + "step": 17667 + }, + { + "epoch": 2.247551202137133, + "ewc_loss": 0.04748149588704109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374074756517075e-05, + "grad_norm": 30.31452178955078, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8653325438499451, + "num_tokens": 674207269.0, + "step": 17668 + }, + { + "epoch": 2.2476784124157234, + "ewc_loss": 0.047486159950494766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3743079509586096e-05, + "grad_norm": 30.37933921813965, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8873471021652222, + "num_tokens": 674240806.0, + "step": 17669 + }, + { + "epoch": 2.2478056226943135, + "ewc_loss": 0.047582391649484634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379119541728869e-05, + "grad_norm": 30.405475616455078, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8613423109054565, + "num_tokens": 674274697.0, + "step": 17670 + }, + { + "epoch": 2.247932832972904, + "ewc_loss": 0.047468964010477066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3734481146675535e-05, + "grad_norm": 30.492448806762695, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8620907664299011, + "num_tokens": 674311362.0, + "step": 17671 + }, + { + "epoch": 
2.2480600432514946, + "ewc_loss": 0.04737904667854309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3689523004577495e-05, + "grad_norm": 30.272661209106445, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8775624632835388, + "num_tokens": 674352505.0, + "step": 17672 + }, + { + "epoch": 2.248187253530085, + "ewc_loss": 0.04735134541988373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3675673219258897e-05, + "grad_norm": 30.402307510375977, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8756138682365417, + "num_tokens": 674383938.0, + "step": 17673 + }, + { + "epoch": 2.2483144638086756, + "ewc_loss": 0.04747379943728447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.373690040258225e-05, + "grad_norm": 30.271934509277344, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8668956756591797, + "num_tokens": 674421665.0, + "step": 17674 + }, + { + "epoch": 2.248441674087266, + "ewc_loss": 0.04745892062783241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3729460735921748e-05, + "grad_norm": 30.328208923339844, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8732534050941467, + "num_tokens": 674466065.0, + "step": 17675 + }, + { + "epoch": 2.2485688843658567, + "ewc_loss": 0.047441497445106506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3720749595668167e-05, + "grad_norm": 30.367359161376953, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8722426891326904, + "num_tokens": 674506868.0, + "step": 17676 + }, + { + "epoch": 2.248696094644447, + "ewc_loss": 0.047433920204639435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.371696064074058e-05, + "grad_norm": 30.365158081054688, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8659805059432983, + "num_tokens": 674545756.0, + "step": 17677 + }, + { + "epoch": 2.2488233049230377, + "ewc_loss": 0.04745417833328247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3727088773739524e-05, + "grad_norm": 30.407432556152344, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8654206991195679, + "num_tokens": 674590680.0, + "step": 17678 + }, + { + "epoch": 2.2489505152016283, + "ewc_loss": 0.04734201356768608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3671007511438802e-05, + "grad_norm": 30.370431900024414, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8758707046508789, + "num_tokens": 674631624.0, + "step": 17679 + }, + { + "epoch": 2.249077725480219, + "ewc_loss": 0.04735478758811951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.367739398323465e-05, + "grad_norm": 30.294048309326172, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8654583692550659, + "num_tokens": 674667365.0, + "step": 17680 + }, + { + "epoch": 2.2492049357588093, + "ewc_loss": 0.047513656318187714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3756827431498095e-05, + "grad_norm": 30.3096981048584, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8693164587020874, + "num_tokens": 674706587.0, + "step": 17681 + }, + { + "epoch": 2.2493321460374, + "ewc_loss": 0.047429028898477554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.371451409999281e-05, + "grad_norm": 30.380441665649414, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8685587644577026, + "num_tokens": 674744480.0, + "step": 17682 + }, + { + "epoch": 2.2494593563159904, + "ewc_loss": 0.04743662104010582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3718310330878012e-05, + "grad_norm": 30.248319625854492, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.870367705821991, + "num_tokens": 674783104.0, + "step": 17683 + }, + { + "epoch": 2.249586566594581, + "ewc_loss": 0.04753194376826286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376597149122972e-05, + "grad_norm": 30.443824768066406, + "learning_rate": 1e-06, + "loss": 0.4448, + 
"mean_token_accuracy": 0.8615859746932983, + "num_tokens": 674823488.0, + "step": 17684 + }, + { + "epoch": 2.2497137768731714, + "ewc_loss": 0.047414250671863556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.370712536503561e-05, + "grad_norm": 30.26296615600586, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8756710290908813, + "num_tokens": 674857322.0, + "step": 17685 + }, + { + "epoch": 2.249840987151762, + "ewc_loss": 0.04745618999004364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3728094674879685e-05, + "grad_norm": 30.457992553710938, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8607919216156006, + "num_tokens": 674901566.0, + "step": 17686 + }, + { + "epoch": 2.2499681974303525, + "ewc_loss": 0.04751696437597275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3758482711855322e-05, + "grad_norm": 30.254913330078125, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8761469721794128, + "num_tokens": 674942803.0, + "step": 17687 + }, + { + "epoch": 2.250095407708943, + "ewc_loss": 0.04752235859632492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3761178454151377e-05, + "grad_norm": 30.4543399810791, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8610575199127197, + "num_tokens": 674983160.0, + "step": 17688 + }, + { + "epoch": 2.2502226179875335, + "ewc_loss": 0.047545481473207474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3772739950800315e-05, + "grad_norm": 30.272911071777344, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.871285080909729, + "num_tokens": 675021243.0, + "step": 17689 + }, + { + "epoch": 2.250349828266124, + "ewc_loss": 0.04739553853869438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.369776848354377e-05, + "grad_norm": 30.38077735900879, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.865767240524292, + "num_tokens": 675059435.0, + "step": 17690 + }, + { + "epoch": 
2.2504770385447146, + "ewc_loss": 0.04758644849061966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3793223590473644e-05, + "grad_norm": 30.385814666748047, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8739374876022339, + "num_tokens": 675091689.0, + "step": 17691 + }, + { + "epoch": 2.250604248823305, + "ewc_loss": 0.047397956252098083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3698978111497127e-05, + "grad_norm": 30.22248649597168, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8643678426742554, + "num_tokens": 675130368.0, + "step": 17692 + }, + { + "epoch": 2.250731459101895, + "ewc_loss": 0.04753037914633751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3765189325786196e-05, + "grad_norm": 30.332658767700195, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8745262622833252, + "num_tokens": 675171745.0, + "step": 17693 + }, + { + "epoch": 2.250858669380486, + "ewc_loss": 0.047412578016519547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3706288629909977e-05, + "grad_norm": 30.272232055664062, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8690406084060669, + "num_tokens": 675206052.0, + "step": 17694 + }, + { + "epoch": 2.2509858796590763, + "ewc_loss": 0.04754647985100746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377324017288629e-05, + "grad_norm": 30.21153450012207, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8648675680160522, + "num_tokens": 675243231.0, + "step": 17695 + }, + { + "epoch": 2.251113089937667, + "ewc_loss": 0.0475471056997776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37735530390637e-05, + "grad_norm": 30.380645751953125, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8636392951011658, + "num_tokens": 675273582.0, + "step": 17696 + }, + { + "epoch": 2.2512403002162573, + "ewc_loss": 0.04755742475390434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3778711693012156e-05, + "grad_norm": 30.326828002929688, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8670968413352966, + "num_tokens": 675314247.0, + "step": 17697 + }, + { + "epoch": 2.251367510494848, + "ewc_loss": 0.04745763540267944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3728818632662296e-05, + "grad_norm": 30.259082794189453, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8717089891433716, + "num_tokens": 675355675.0, + "step": 17698 + }, + { + "epoch": 2.2514947207734384, + "ewc_loss": 0.047574810683727264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37874046433717e-05, + "grad_norm": 30.325220108032227, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8678230047225952, + "num_tokens": 675394290.0, + "step": 17699 + }, + { + "epoch": 2.251621931052029, + "ewc_loss": 0.04754796251654625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3773980501573533e-05, + "grad_norm": 30.20535659790039, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8734744787216187, + "num_tokens": 675434251.0, + "step": 17700 + }, + { + "epoch": 2.2517491413306194, + "ewc_loss": 0.04758276045322418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3791380954207852e-05, + "grad_norm": 30.420074462890625, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8762578964233398, + "num_tokens": 675474731.0, + "step": 17701 + }, + { + "epoch": 2.25187635160921, + "ewc_loss": 0.04765152931213379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382576531090308e-05, + "grad_norm": 30.316486358642578, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8646219968795776, + "num_tokens": 675515180.0, + "step": 17702 + }, + { + "epoch": 2.2520035618878005, + "ewc_loss": 0.04757555201649666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3787775717210025e-05, + "grad_norm": 30.41855812072754, + "learning_rate": 1e-06, + "loss": 0.4553, + 
"mean_token_accuracy": 0.8596445322036743, + "num_tokens": 675553429.0, + "step": 17703 + }, + { + "epoch": 2.252130772166391, + "ewc_loss": 0.04758312553167343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3791562853148207e-05, + "grad_norm": 30.271196365356445, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8727242350578308, + "num_tokens": 675592919.0, + "step": 17704 + }, + { + "epoch": 2.2522579824449815, + "ewc_loss": 0.047564547508955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37822732742643e-05, + "grad_norm": 30.37554931640625, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8788189888000488, + "num_tokens": 675627528.0, + "step": 17705 + }, + { + "epoch": 2.252385192723572, + "ewc_loss": 0.047580696642398834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379034776822664e-05, + "grad_norm": 30.291370391845703, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8784380555152893, + "num_tokens": 675663721.0, + "step": 17706 + }, + { + "epoch": 2.2525124030021626, + "ewc_loss": 0.047497138381004333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3748569219605997e-05, + "grad_norm": 30.35634422302246, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.863698422908783, + "num_tokens": 675705187.0, + "step": 17707 + }, + { + "epoch": 2.252639613280753, + "ewc_loss": 0.04756990447640419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.378495264565572e-05, + "grad_norm": 30.263866424560547, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8798046708106995, + "num_tokens": 675752713.0, + "step": 17708 + }, + { + "epoch": 2.2527668235593437, + "ewc_loss": 0.04754582792520523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3772914573783055e-05, + "grad_norm": 30.28006935119629, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8710861802101135, + "num_tokens": 675791009.0, + "step": 17709 + }, + { + "epoch": 
2.252894033837934, + "ewc_loss": 0.04753721505403519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376860720687546e-05, + "grad_norm": 30.367595672607422, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8730050325393677, + "num_tokens": 675828890.0, + "step": 17710 + }, + { + "epoch": 2.2530212441165247, + "ewc_loss": 0.04753022640943527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3765112928231247e-05, + "grad_norm": 30.210594177246094, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8704518675804138, + "num_tokens": 675862855.0, + "step": 17711 + }, + { + "epoch": 2.2531484543951152, + "ewc_loss": 0.04751541092991829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3757706003380008e-05, + "grad_norm": 30.389917373657227, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8685123920440674, + "num_tokens": 675903032.0, + "step": 17712 + }, + { + "epoch": 2.2532756646737058, + "ewc_loss": 0.04747621342539787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37381063925568e-05, + "grad_norm": 30.285675048828125, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8657213449478149, + "num_tokens": 675943035.0, + "step": 17713 + }, + { + "epoch": 2.2534028749522963, + "ewc_loss": 0.047541484236717224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377074270043522e-05, + "grad_norm": 30.308195114135742, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8732098937034607, + "num_tokens": 675983073.0, + "step": 17714 + }, + { + "epoch": 2.253530085230887, + "ewc_loss": 0.0474950447678566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374752330069896e-05, + "grad_norm": 30.307586669921875, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8750817775726318, + "num_tokens": 676022341.0, + "step": 17715 + }, + { + "epoch": 2.2536572955094774, + "ewc_loss": 0.04752571880817413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3762859200360253e-05, + "grad_norm": 30.275943756103516, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.866832971572876, + "num_tokens": 676060907.0, + "step": 17716 + }, + { + "epoch": 2.253784505788068, + "ewc_loss": 0.04749950394034386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3749751562718302e-05, + "grad_norm": 30.348670959472656, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8683491349220276, + "num_tokens": 676100693.0, + "step": 17717 + }, + { + "epoch": 2.253911716066658, + "ewc_loss": 0.047541406005620956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3770702682668343e-05, + "grad_norm": 30.224836349487305, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8850082159042358, + "num_tokens": 676137526.0, + "step": 17718 + }, + { + "epoch": 2.254038926345249, + "ewc_loss": 0.04746503010392189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37325148191303e-05, + "grad_norm": 30.26662254333496, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8732175230979919, + "num_tokens": 676177245.0, + "step": 17719 + }, + { + "epoch": 2.254166136623839, + "ewc_loss": 0.04755917191505432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377958662691526e-05, + "grad_norm": 30.362083435058594, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8772915601730347, + "num_tokens": 676214568.0, + "step": 17720 + }, + { + "epoch": 2.2542933469024296, + "ewc_loss": 0.0475790798664093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3789540136931464e-05, + "grad_norm": 30.219696044921875, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8711135387420654, + "num_tokens": 676252552.0, + "step": 17721 + }, + { + "epoch": 2.25442055718102, + "ewc_loss": 0.0475018247961998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3750912077957764e-05, + "grad_norm": 30.25238800048828, + "learning_rate": 1e-06, + "loss": 0.4791, + 
"mean_token_accuracy": 0.8531644344329834, + "num_tokens": 676290480.0, + "step": 17722 + }, + { + "epoch": 2.2545477674596106, + "ewc_loss": 0.047541648149490356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377082455495838e-05, + "grad_norm": 30.19148063659668, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8833600282669067, + "num_tokens": 676324950.0, + "step": 17723 + }, + { + "epoch": 2.254674977738201, + "ewc_loss": 0.04756153002381325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.378076533204876e-05, + "grad_norm": 30.30640411376953, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8621598482131958, + "num_tokens": 676363851.0, + "step": 17724 + }, + { + "epoch": 2.2548021880167917, + "ewc_loss": 0.04754170402884483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3770851839799434e-05, + "grad_norm": 30.2194766998291, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8791254758834839, + "num_tokens": 676399920.0, + "step": 17725 + }, + { + "epoch": 2.254929398295382, + "ewc_loss": 0.04749465733766556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374732866883278e-05, + "grad_norm": 30.169748306274414, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8679530024528503, + "num_tokens": 676443243.0, + "step": 17726 + }, + { + "epoch": 2.2550566085739727, + "ewc_loss": 0.04764365404844284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3821827198844403e-05, + "grad_norm": 30.34032440185547, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.857495903968811, + "num_tokens": 676480272.0, + "step": 17727 + }, + { + "epoch": 2.2551838188525632, + "ewc_loss": 0.04761311039328575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3806554963812232e-05, + "grad_norm": 30.233360290527344, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8665074110031128, + "num_tokens": 676516630.0, + "step": 17728 + }, + { + "epoch": 
2.2553110291311538, + "ewc_loss": 0.047660231590270996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.383011633355636e-05, + "grad_norm": 30.3398380279541, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8749966621398926, + "num_tokens": 676557842.0, + "step": 17729 + }, + { + "epoch": 2.2554382394097443, + "ewc_loss": 0.04765038192272186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382519051025156e-05, + "grad_norm": 30.33847999572754, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8801324367523193, + "num_tokens": 676599413.0, + "step": 17730 + }, + { + "epoch": 2.255565449688335, + "ewc_loss": 0.047629792243242264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3814896849216893e-05, + "grad_norm": 30.35692024230957, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8622909784317017, + "num_tokens": 676637791.0, + "step": 17731 + }, + { + "epoch": 2.2556926599669254, + "ewc_loss": 0.04750993102788925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3754964786348864e-05, + "grad_norm": 30.264263153076172, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8656006455421448, + "num_tokens": 676677108.0, + "step": 17732 + }, + { + "epoch": 2.255819870245516, + "ewc_loss": 0.04760923981666565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.380461955908686e-05, + "grad_norm": 30.4213924407959, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8730945587158203, + "num_tokens": 676720698.0, + "step": 17733 + }, + { + "epoch": 2.2559470805241064, + "ewc_loss": 0.04761432483792305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3807162506273016e-05, + "grad_norm": 30.303239822387695, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8586158752441406, + "num_tokens": 676763719.0, + "step": 17734 + }, + { + "epoch": 2.256074290802697, + "ewc_loss": 0.04742077365517616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3710386813036166e-05, + "grad_norm": 30.301555633544922, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.871088445186615, + "num_tokens": 676801083.0, + "step": 17735 + }, + { + "epoch": 2.2562015010812875, + "ewc_loss": 0.04760931059718132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.380465593887493e-05, + "grad_norm": 30.421213150024414, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8628553152084351, + "num_tokens": 676840861.0, + "step": 17736 + }, + { + "epoch": 2.256328711359878, + "ewc_loss": 0.04753201827406883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3766009690007195e-05, + "grad_norm": 30.314359664916992, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8829120993614197, + "num_tokens": 676876864.0, + "step": 17737 + }, + { + "epoch": 2.2564559216384685, + "ewc_loss": 0.047476474195718765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3738237359793857e-05, + "grad_norm": 30.34149169921875, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8895101547241211, + "num_tokens": 676920983.0, + "step": 17738 + }, + { + "epoch": 2.256583131917059, + "ewc_loss": 0.047535743564367294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3767872335156426e-05, + "grad_norm": 30.43474769592285, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8799206614494324, + "num_tokens": 676961294.0, + "step": 17739 + }, + { + "epoch": 2.2567103421956496, + "ewc_loss": 0.04747532308101654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3737660740152933e-05, + "grad_norm": 30.30180549621582, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8639751672744751, + "num_tokens": 676996318.0, + "step": 17740 + }, + { + "epoch": 2.2568375524742397, + "ewc_loss": 0.047512322664260864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3756161681376398e-05, + "grad_norm": 30.429311752319336, + "learning_rate": 1e-06, + "loss": 0.4123, + 
"mean_token_accuracy": 0.8724836111068726, + "num_tokens": 677034083.0, + "step": 17741 + }, + { + "epoch": 2.2569647627528306, + "ewc_loss": 0.0474948026239872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.374740142840892e-05, + "grad_norm": 30.30694007873535, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8712297081947327, + "num_tokens": 677071206.0, + "step": 17742 + }, + { + "epoch": 2.2570919730314207, + "ewc_loss": 0.047488078474998474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3744039935991168e-05, + "grad_norm": 30.43799591064453, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8751574754714966, + "num_tokens": 677103573.0, + "step": 17743 + }, + { + "epoch": 2.2572191833100113, + "ewc_loss": 0.04750368744134903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.375184340053238e-05, + "grad_norm": 30.23321533203125, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8797372579574585, + "num_tokens": 677141406.0, + "step": 17744 + }, + { + "epoch": 2.257346393588602, + "ewc_loss": 0.04740827530622482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3704138584434986e-05, + "grad_norm": 30.29993438720703, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8598564267158508, + "num_tokens": 677185968.0, + "step": 17745 + }, + { + "epoch": 2.2574736038671923, + "ewc_loss": 0.0475403256714344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377016244281549e-05, + "grad_norm": 30.398391723632812, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8631635904312134, + "num_tokens": 677229004.0, + "step": 17746 + }, + { + "epoch": 2.257600814145783, + "ewc_loss": 0.04752518981695175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376259544689674e-05, + "grad_norm": 30.338346481323242, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8815059065818787, + "num_tokens": 677266317.0, + "step": 17747 + }, + { + "epoch": 
2.2577280244243734, + "ewc_loss": 0.047440141439437866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3720071112620644e-05, + "grad_norm": 30.41695785522461, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8809434771537781, + "num_tokens": 677302344.0, + "step": 17748 + }, + { + "epoch": 2.257855234702964, + "ewc_loss": 0.04750321805477142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3751608750899322e-05, + "grad_norm": 30.333003997802734, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8614150285720825, + "num_tokens": 677342103.0, + "step": 17749 + }, + { + "epoch": 2.2579824449815544, + "ewc_loss": 0.04754389449954033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377194687142037e-05, + "grad_norm": 30.414575576782227, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8572547435760498, + "num_tokens": 677378602.0, + "step": 17750 + }, + { + "epoch": 2.258109655260145, + "ewc_loss": 0.047519128769636154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3759565010550432e-05, + "grad_norm": 30.358400344848633, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8693221807479858, + "num_tokens": 677415005.0, + "step": 17751 + }, + { + "epoch": 2.2582368655387355, + "ewc_loss": 0.04746651649475098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3733258785796352e-05, + "grad_norm": 30.306915283203125, + "learning_rate": 1e-06, + "loss": 0.4879, + "mean_token_accuracy": 0.852090060710907, + "num_tokens": 677457908.0, + "step": 17752 + }, + { + "epoch": 2.258364075817326, + "ewc_loss": 0.04759662598371506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3798313122824766e-05, + "grad_norm": 30.347776412963867, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8604815006256104, + "num_tokens": 677492759.0, + "step": 17753 + }, + { + "epoch": 2.2584912860959165, + "ewc_loss": 0.04748121649026871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3740607502986677e-05, + "grad_norm": 30.253971099853516, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8677963614463806, + "num_tokens": 677535771.0, + "step": 17754 + }, + { + "epoch": 2.258618496374507, + "ewc_loss": 0.04761793464422226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3808966943761334e-05, + "grad_norm": 30.297176361083984, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.883236289024353, + "num_tokens": 677575325.0, + "step": 17755 + }, + { + "epoch": 2.2587457066530976, + "ewc_loss": 0.04764983430504799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3824917661841027e-05, + "grad_norm": 30.437700271606445, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8704321384429932, + "num_tokens": 677614132.0, + "step": 17756 + }, + { + "epoch": 2.258872916931688, + "ewc_loss": 0.04754096642136574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3770482584950514e-05, + "grad_norm": 30.352928161621094, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8645144701004028, + "num_tokens": 677645657.0, + "step": 17757 + }, + { + "epoch": 2.2590001272102787, + "ewc_loss": 0.04757476598024368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.378738281549886e-05, + "grad_norm": 30.27659034729004, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8782137632369995, + "num_tokens": 677686238.0, + "step": 17758 + }, + { + "epoch": 2.259127337488869, + "ewc_loss": 0.04753214493393898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376607335463632e-05, + "grad_norm": 30.33214569091797, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8738953471183777, + "num_tokens": 677724080.0, + "step": 17759 + }, + { + "epoch": 2.2592545477674597, + "ewc_loss": 0.04759042337536812, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379521174589172e-05, + "grad_norm": 30.284086227416992, + "learning_rate": 1e-06, + "loss": 0.4246, + 
"mean_token_accuracy": 0.869763195514679, + "num_tokens": 677765768.0, + "step": 17760 + }, + { + "epoch": 2.2593817580460502, + "ewc_loss": 0.04755476862192154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3777383830747567e-05, + "grad_norm": 30.3957462310791, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8710410594940186, + "num_tokens": 677803937.0, + "step": 17761 + }, + { + "epoch": 2.2595089683246408, + "ewc_loss": 0.04764604941010475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3823024093871936e-05, + "grad_norm": 30.310428619384766, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8814407587051392, + "num_tokens": 677839685.0, + "step": 17762 + }, + { + "epoch": 2.2596361786032313, + "ewc_loss": 0.047585662454366684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379283068876248e-05, + "grad_norm": 30.349685668945312, + "learning_rate": 1e-06, + "loss": 0.4819, + "mean_token_accuracy": 0.8566573858261108, + "num_tokens": 677880411.0, + "step": 17763 + }, + { + "epoch": 2.259763388881822, + "ewc_loss": 0.04769822955131531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3849113858886994e-05, + "grad_norm": 30.408458709716797, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8787853717803955, + "num_tokens": 677921177.0, + "step": 17764 + }, + { + "epoch": 2.2598905991604123, + "ewc_loss": 0.04762190580368042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3810953280190006e-05, + "grad_norm": 30.460031509399414, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8802844882011414, + "num_tokens": 677956393.0, + "step": 17765 + }, + { + "epoch": 2.2600178094390024, + "ewc_loss": 0.04753251373767853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3766257072566077e-05, + "grad_norm": 30.306175231933594, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8684648275375366, + "num_tokens": 677988364.0, + "step": 17766 + }, + { + "epoch": 
2.2601450197175934, + "ewc_loss": 0.04760829359292984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3804146621841937e-05, + "grad_norm": 30.38787269592285, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8778549432754517, + "num_tokens": 678028093.0, + "step": 17767 + }, + { + "epoch": 2.2602722299961835, + "ewc_loss": 0.04768037796020508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.38401898968732e-05, + "grad_norm": 30.432090759277344, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8717745542526245, + "num_tokens": 678068174.0, + "step": 17768 + }, + { + "epoch": 2.260399440274774, + "ewc_loss": 0.04757776856422424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3788883481756784e-05, + "grad_norm": 30.45270538330078, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8724384307861328, + "num_tokens": 678105784.0, + "step": 17769 + }, + { + "epoch": 2.2605266505533645, + "ewc_loss": 0.04759350046515465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379675061092712e-05, + "grad_norm": 30.49506950378418, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8810796737670898, + "num_tokens": 678142873.0, + "step": 17770 + }, + { + "epoch": 2.260653860831955, + "ewc_loss": 0.04755338281393051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3776690795784816e-05, + "grad_norm": 30.354650497436523, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8668142557144165, + "num_tokens": 678181527.0, + "step": 17771 + }, + { + "epoch": 2.2607810711105456, + "ewc_loss": 0.047528691589832306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376434531470295e-05, + "grad_norm": 30.4172420501709, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8731710910797119, + "num_tokens": 678221225.0, + "step": 17772 + }, + { + "epoch": 2.260908281389136, + "ewc_loss": 0.047536373138427734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.376818702032324e-05, + "grad_norm": 30.288496017456055, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8784059286117554, + "num_tokens": 678254767.0, + "step": 17773 + }, + { + "epoch": 2.2610354916677267, + "ewc_loss": 0.047544293105602264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377214696025476e-05, + "grad_norm": 30.366432189941406, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8735198974609375, + "num_tokens": 678290830.0, + "step": 17774 + }, + { + "epoch": 2.261162701946317, + "ewc_loss": 0.04752286151051521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.376143129367847e-05, + "grad_norm": 30.33926773071289, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.882250189781189, + "num_tokens": 678327102.0, + "step": 17775 + }, + { + "epoch": 2.2612899122249077, + "ewc_loss": 0.04768061265349388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3840306312195025e-05, + "grad_norm": 30.41682243347168, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.884729266166687, + "num_tokens": 678364872.0, + "step": 17776 + }, + { + "epoch": 2.2614171225034982, + "ewc_loss": 0.04752788692712784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3763943318044767e-05, + "grad_norm": 30.295879364013672, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.8634405732154846, + "num_tokens": 678399233.0, + "step": 17777 + }, + { + "epoch": 2.2615443327820888, + "ewc_loss": 0.047587521374225616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379376019234769e-05, + "grad_norm": 30.353836059570312, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8792986869812012, + "num_tokens": 678434577.0, + "step": 17778 + }, + { + "epoch": 2.2616715430606793, + "ewc_loss": 0.0476585328578949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3829266865504906e-05, + "grad_norm": 30.359832763671875, + "learning_rate": 1e-06, + "loss": 0.3949, + 
"mean_token_accuracy": 0.87590491771698, + "num_tokens": 678466835.0, + "step": 17779 + }, + { + "epoch": 2.26179875333927, + "ewc_loss": 0.04758754372596741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3793771106284112e-05, + "grad_norm": 30.26317024230957, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8865599632263184, + "num_tokens": 678506817.0, + "step": 17780 + }, + { + "epoch": 2.2619259636178604, + "ewc_loss": 0.04758157953619957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37907897826517e-05, + "grad_norm": 30.355485916137695, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8725932836532593, + "num_tokens": 678543734.0, + "step": 17781 + }, + { + "epoch": 2.262053173896451, + "ewc_loss": 0.047712817788124084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3856408006395213e-05, + "grad_norm": 30.306861877441406, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.866965651512146, + "num_tokens": 678585066.0, + "step": 17782 + }, + { + "epoch": 2.2621803841750414, + "ewc_loss": 0.04761046543717384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3805232558515854e-05, + "grad_norm": 30.464176177978516, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8851922750473022, + "num_tokens": 678614951.0, + "step": 17783 + }, + { + "epoch": 2.262307594453632, + "ewc_loss": 0.04775860905647278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3879303626017645e-05, + "grad_norm": 30.546031951904297, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8737671375274658, + "num_tokens": 678649671.0, + "step": 17784 + }, + { + "epoch": 2.2624348047322225, + "ewc_loss": 0.047572534531354904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3786267774994485e-05, + "grad_norm": 30.28375244140625, + "learning_rate": 1e-06, + "loss": 0.3665, + "mean_token_accuracy": 0.8882321119308472, + "num_tokens": 678693418.0, + "step": 17785 + }, + { + "epoch": 
2.262562015010813, + "ewc_loss": 0.04756069928407669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3780348783475347e-05, + "grad_norm": 30.352092742919922, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8811072111129761, + "num_tokens": 678730891.0, + "step": 17786 + }, + { + "epoch": 2.2626892252894035, + "ewc_loss": 0.04764555022120476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382277489232365e-05, + "grad_norm": 30.346214294433594, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8796547651290894, + "num_tokens": 678766369.0, + "step": 17787 + }, + { + "epoch": 2.262816435567994, + "ewc_loss": 0.04759908467531204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3799542759661563e-05, + "grad_norm": 30.344383239746094, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8611443042755127, + "num_tokens": 678809118.0, + "step": 17788 + }, + { + "epoch": 2.2629436458465846, + "ewc_loss": 0.04771212488412857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.385606239840854e-05, + "grad_norm": 30.385896682739258, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8697230815887451, + "num_tokens": 678850057.0, + "step": 17789 + }, + { + "epoch": 2.263070856125175, + "ewc_loss": 0.04767103120684624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3835515094106086e-05, + "grad_norm": 30.467226028442383, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8706362247467041, + "num_tokens": 678890253.0, + "step": 17790 + }, + { + "epoch": 2.263198066403765, + "ewc_loss": 0.047644976526498795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382248749199789e-05, + "grad_norm": 30.382768630981445, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8854025602340698, + "num_tokens": 678922535.0, + "step": 17791 + }, + { + "epoch": 2.263325276682356, + "ewc_loss": 0.04751943051815033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3759715986670926e-05, + "grad_norm": 30.24723243713379, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8692867755889893, + "num_tokens": 678956136.0, + "step": 17792 + }, + { + "epoch": 2.2634524869609463, + "ewc_loss": 0.047668442130088806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3834221792640164e-05, + "grad_norm": 30.427108764648438, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8751091957092285, + "num_tokens": 678990864.0, + "step": 17793 + }, + { + "epoch": 2.263579697239537, + "ewc_loss": 0.047580648213624954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3790324121364392e-05, + "grad_norm": 30.37216567993164, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8841780424118042, + "num_tokens": 679028345.0, + "step": 17794 + }, + { + "epoch": 2.2637069075181273, + "ewc_loss": 0.04768132045865059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3840661015128717e-05, + "grad_norm": 30.410198211669922, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8585841655731201, + "num_tokens": 679073679.0, + "step": 17795 + }, + { + "epoch": 2.263834117796718, + "ewc_loss": 0.04766834154725075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3834170860936865e-05, + "grad_norm": 30.385009765625, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8700095415115356, + "num_tokens": 679109274.0, + "step": 17796 + }, + { + "epoch": 2.2639613280753084, + "ewc_loss": 0.04755180701613426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.377590317337308e-05, + "grad_norm": 30.33795738220215, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8731231093406677, + "num_tokens": 679141644.0, + "step": 17797 + }, + { + "epoch": 2.264088538353899, + "ewc_loss": 0.0476963073015213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3848153432481922e-05, + "grad_norm": 30.487518310546875, + "learning_rate": 1e-06, + "loss": 0.4733, + 
"mean_token_accuracy": 0.8556041717529297, + "num_tokens": 679184351.0, + "step": 17798 + }, + { + "epoch": 2.2642157486324894, + "ewc_loss": 0.04761570319533348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.380785190325696e-05, + "grad_norm": 30.243074417114258, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8682206869125366, + "num_tokens": 679222509.0, + "step": 17799 + }, + { + "epoch": 2.26434295891108, + "ewc_loss": 0.047620635479688644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.381031845288817e-05, + "grad_norm": 30.42090606689453, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8560562133789062, + "num_tokens": 679263362.0, + "step": 17800 + }, + { + "epoch": 2.2644701691896705, + "ewc_loss": 0.047627586871385574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.381379272264894e-05, + "grad_norm": 30.18710708618164, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8597170114517212, + "num_tokens": 679307859.0, + "step": 17801 + }, + { + "epoch": 2.264597379468261, + "ewc_loss": 0.047617532312870026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3808766854926944e-05, + "grad_norm": 30.42316436767578, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8809231519699097, + "num_tokens": 679344801.0, + "step": 17802 + }, + { + "epoch": 2.2647245897468515, + "ewc_loss": 0.047763604670763016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3881802917458117e-05, + "grad_norm": 30.41303062438965, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8642593026161194, + "num_tokens": 679382795.0, + "step": 17803 + }, + { + "epoch": 2.264851800025442, + "ewc_loss": 0.04758220538496971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379110264882911e-05, + "grad_norm": 30.445159912109375, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8691437244415283, + "num_tokens": 679425282.0, + "step": 17804 + }, + { + "epoch": 
2.2649790103040326, + "ewc_loss": 0.04774421080946922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3872105884947814e-05, + "grad_norm": 30.355756759643555, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8581036329269409, + "num_tokens": 679466881.0, + "step": 17805 + }, + { + "epoch": 2.265106220582623, + "ewc_loss": 0.04760601371526718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3803006115485914e-05, + "grad_norm": 30.384279251098633, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8840240836143494, + "num_tokens": 679509268.0, + "step": 17806 + }, + { + "epoch": 2.2652334308612136, + "ewc_loss": 0.04766845703125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.383422906859778e-05, + "grad_norm": 30.39638328552246, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8694725036621094, + "num_tokens": 679545082.0, + "step": 17807 + }, + { + "epoch": 2.265360641139804, + "ewc_loss": 0.04759063571691513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3795317247277126e-05, + "grad_norm": 30.34528350830078, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8734070062637329, + "num_tokens": 679581795.0, + "step": 17808 + }, + { + "epoch": 2.2654878514183947, + "ewc_loss": 0.0476178303360939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3808916012058035e-05, + "grad_norm": 30.30063819885254, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8697366118431091, + "num_tokens": 679618978.0, + "step": 17809 + }, + { + "epoch": 2.2656150616969852, + "ewc_loss": 0.047613486647605896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.38067441387102e-05, + "grad_norm": 30.34771728515625, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8752289414405823, + "num_tokens": 679655002.0, + "step": 17810 + }, + { + "epoch": 2.2657422719755758, + "ewc_loss": 0.047665487974882126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3832744773244485e-05, + "grad_norm": 30.31070899963379, + "learning_rate": 1e-06, + "loss": 0.3637, + "mean_token_accuracy": 0.8859254121780396, + "num_tokens": 679691143.0, + "step": 17811 + }, + { + "epoch": 2.2658694822541663, + "ewc_loss": 0.047658685594797134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3829343263059855e-05, + "grad_norm": 30.37323760986328, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8734010457992554, + "num_tokens": 679729168.0, + "step": 17812 + }, + { + "epoch": 2.265996692532757, + "ewc_loss": 0.04763341322541237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.381670674367342e-05, + "grad_norm": 30.323320388793945, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.870120644569397, + "num_tokens": 679773385.0, + "step": 17813 + }, + { + "epoch": 2.2661239028113473, + "ewc_loss": 0.04755926877260208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3779633920639753e-05, + "grad_norm": 30.295637130737305, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8827677369117737, + "num_tokens": 679813884.0, + "step": 17814 + }, + { + "epoch": 2.266251113089938, + "ewc_loss": 0.047662876546382904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.383143873885274e-05, + "grad_norm": 30.397937774658203, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8721956014633179, + "num_tokens": 679850834.0, + "step": 17815 + }, + { + "epoch": 2.266378323368528, + "ewc_loss": 0.04762279614806175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3811398932593875e-05, + "grad_norm": 30.35675811767578, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8774638175964355, + "num_tokens": 679890210.0, + "step": 17816 + }, + { + "epoch": 2.266505533647119, + "ewc_loss": 0.04761425405740738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.380712794547435e-05, + "grad_norm": 30.363866806030273, + "learning_rate": 1e-06, + "loss": 0.3936, + 
"mean_token_accuracy": 0.880084753036499, + "num_tokens": 679926252.0, + "step": 17817 + }, + { + "epoch": 2.266632743925709, + "ewc_loss": 0.047718193382024765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3859096472733654e-05, + "grad_norm": 30.482641220092773, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8682692050933838, + "num_tokens": 679959626.0, + "step": 17818 + }, + { + "epoch": 2.2667599542042995, + "ewc_loss": 0.047634854912757874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3817427063477226e-05, + "grad_norm": 30.3272762298584, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8665761351585388, + "num_tokens": 679999973.0, + "step": 17819 + }, + { + "epoch": 2.26688716448289, + "ewc_loss": 0.04760611802339554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3803058866178617e-05, + "grad_norm": 30.330841064453125, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8639780282974243, + "num_tokens": 680041081.0, + "step": 17820 + }, + { + "epoch": 2.2670143747614806, + "ewc_loss": 0.04764796048402786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.38239808822982e-05, + "grad_norm": 30.226455688476562, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.876140832901001, + "num_tokens": 680078953.0, + "step": 17821 + }, + { + "epoch": 2.267141585040071, + "ewc_loss": 0.04758153110742569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3790766135789454e-05, + "grad_norm": 30.306968688964844, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8796550631523132, + "num_tokens": 680112309.0, + "step": 17822 + }, + { + "epoch": 2.2672687953186617, + "ewc_loss": 0.04761523753404617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3807619072613306e-05, + "grad_norm": 30.251720428466797, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8664742708206177, + "num_tokens": 680148607.0, + "step": 17823 + }, + { + "epoch": 
2.267396005597252, + "ewc_loss": 0.04770747944712639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.385373954894021e-05, + "grad_norm": 30.357744216918945, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8880627155303955, + "num_tokens": 680189617.0, + "step": 17824 + }, + { + "epoch": 2.2675232158758427, + "ewc_loss": 0.04761162027716637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3805810997146182e-05, + "grad_norm": 30.35862922668457, + "learning_rate": 1e-06, + "loss": 0.3656, + "mean_token_accuracy": 0.887466311454773, + "num_tokens": 680227940.0, + "step": 17825 + }, + { + "epoch": 2.2676504261544332, + "ewc_loss": 0.04770824685692787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.385412335570436e-05, + "grad_norm": 30.417724609375, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8843163251876831, + "num_tokens": 680263473.0, + "step": 17826 + }, + { + "epoch": 2.2677776364330238, + "ewc_loss": 0.04759904742240906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3799524569767527e-05, + "grad_norm": 30.323827743530273, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8634305000305176, + "num_tokens": 680297895.0, + "step": 17827 + }, + { + "epoch": 2.2679048467116143, + "ewc_loss": 0.047652896493673325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382644743192941e-05, + "grad_norm": 30.436513900756836, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8731501698493958, + "num_tokens": 680337658.0, + "step": 17828 + }, + { + "epoch": 2.268032056990205, + "ewc_loss": 0.047698725014925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.384936306043528e-05, + "grad_norm": 30.417879104614258, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8671881556510925, + "num_tokens": 680375912.0, + "step": 17829 + }, + { + "epoch": 2.2681592672687954, + "ewc_loss": 0.047593407332897186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3796703317202628e-05, + "grad_norm": 30.25238609313965, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8840270638465881, + "num_tokens": 680415231.0, + "step": 17830 + }, + { + "epoch": 2.268286477547386, + "ewc_loss": 0.047664765268564224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.383238279435318e-05, + "grad_norm": 30.35392189025879, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8679420948028564, + "num_tokens": 680460037.0, + "step": 17831 + }, + { + "epoch": 2.2684136878259764, + "ewc_loss": 0.04760802164673805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.380401019763667e-05, + "grad_norm": 30.27878761291504, + "learning_rate": 1e-06, + "loss": 0.4837, + "mean_token_accuracy": 0.8496766686439514, + "num_tokens": 680505521.0, + "step": 17832 + }, + { + "epoch": 2.268540898104567, + "ewc_loss": 0.04764785245060921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3823926312616095e-05, + "grad_norm": 30.385358810424805, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.88515305519104, + "num_tokens": 680547682.0, + "step": 17833 + }, + { + "epoch": 2.2686681083831575, + "ewc_loss": 0.04769914969801903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.384957406320609e-05, + "grad_norm": 30.51471710205078, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8779431581497192, + "num_tokens": 680587510.0, + "step": 17834 + }, + { + "epoch": 2.268795318661748, + "ewc_loss": 0.04757276177406311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3786380552337505e-05, + "grad_norm": 30.215757369995117, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8750243782997131, + "num_tokens": 680624233.0, + "step": 17835 + }, + { + "epoch": 2.2689225289403385, + "ewc_loss": 0.04764896258711815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3824481104384176e-05, + "grad_norm": 30.580204010009766, + "learning_rate": 1e-06, + "loss": 0.4249, + 
"mean_token_accuracy": 0.8671036958694458, + "num_tokens": 680666091.0, + "step": 17836 + }, + { + "epoch": 2.269049739218929, + "ewc_loss": 0.0476701520383358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.383507671765983e-05, + "grad_norm": 30.304594039916992, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8789898157119751, + "num_tokens": 680703062.0, + "step": 17837 + }, + { + "epoch": 2.2691769494975196, + "ewc_loss": 0.047424476593732834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3712238544248976e-05, + "grad_norm": 30.456253051757812, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8848087787628174, + "num_tokens": 680735966.0, + "step": 17838 + }, + { + "epoch": 2.2693041597761097, + "ewc_loss": 0.04775254428386688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3876271370681934e-05, + "grad_norm": 30.3257999420166, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8723849058151245, + "num_tokens": 680780010.0, + "step": 17839 + }, + { + "epoch": 2.2694313700547006, + "ewc_loss": 0.04756221920251846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.378110912104603e-05, + "grad_norm": 30.471601486206055, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8720874786376953, + "num_tokens": 680819516.0, + "step": 17840 + }, + { + "epoch": 2.2695585803332907, + "ewc_loss": 0.047563571482896805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3781785785104148e-05, + "grad_norm": 30.234636306762695, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8712321519851685, + "num_tokens": 680861374.0, + "step": 17841 + }, + { + "epoch": 2.2696857906118812, + "ewc_loss": 0.04765307158231735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3826536562410183e-05, + "grad_norm": 30.499732971191406, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8635132908821106, + "num_tokens": 680902447.0, + "step": 17842 + }, + { + "epoch": 
2.2698130008904718, + "ewc_loss": 0.047636087983846664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3818043700885028e-05, + "grad_norm": 30.362581253051758, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8886081576347351, + "num_tokens": 680942376.0, + "step": 17843 + }, + { + "epoch": 2.2699402111690623, + "ewc_loss": 0.04751324653625488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3756623704684898e-05, + "grad_norm": 30.3994197845459, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8728616237640381, + "num_tokens": 680977060.0, + "step": 17844 + }, + { + "epoch": 2.270067421447653, + "ewc_loss": 0.04771580174565315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3857901396695524e-05, + "grad_norm": 30.406312942504883, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8564542531967163, + "num_tokens": 681014504.0, + "step": 17845 + }, + { + "epoch": 2.2701946317262434, + "ewc_loss": 0.04758862033486366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379430952714756e-05, + "grad_norm": 30.4638729095459, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8563717603683472, + "num_tokens": 681053919.0, + "step": 17846 + }, + { + "epoch": 2.270321842004834, + "ewc_loss": 0.04761470481753349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3807351681170985e-05, + "grad_norm": 30.277565002441406, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8716672658920288, + "num_tokens": 681090883.0, + "step": 17847 + }, + { + "epoch": 2.2704490522834244, + "ewc_loss": 0.047596968710422516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.37984841078287e-05, + "grad_norm": 30.39375114440918, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8867707252502441, + "num_tokens": 681128819.0, + "step": 17848 + }, + { + "epoch": 2.270576262562015, + "ewc_loss": 0.047652095556259155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.382604725426063e-05, + "grad_norm": 30.43285369873047, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.862839937210083, + "num_tokens": 681171434.0, + "step": 17849 + }, + { + "epoch": 2.2707034728406055, + "ewc_loss": 0.04766251891851425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3831258658901788e-05, + "grad_norm": 30.350481033325195, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8817564249038696, + "num_tokens": 681206297.0, + "step": 17850 + }, + { + "epoch": 2.270830683119196, + "ewc_loss": 0.047729384154081345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.386469168413896e-05, + "grad_norm": 30.448352813720703, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8611575365066528, + "num_tokens": 681242521.0, + "step": 17851 + }, + { + "epoch": 2.2709578933977865, + "ewc_loss": 0.04767458513379097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.383729224675335e-05, + "grad_norm": 30.36895751953125, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8808992505073547, + "num_tokens": 681282474.0, + "step": 17852 + }, + { + "epoch": 2.271085103676377, + "ewc_loss": 0.04764413461089134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382206730544567e-05, + "grad_norm": 30.37408447265625, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8687081336975098, + "num_tokens": 681317280.0, + "step": 17853 + }, + { + "epoch": 2.2712123139549676, + "ewc_loss": 0.04771554097533226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.385777042945847e-05, + "grad_norm": 30.44108772277832, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8698412179946899, + "num_tokens": 681349155.0, + "step": 17854 + }, + { + "epoch": 2.271339524233558, + "ewc_loss": 0.047652967274188995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382648381171748e-05, + "grad_norm": 30.37534523010254, + "learning_rate": 1e-06, + "loss": 0.4045, + 
"mean_token_accuracy": 0.8720371723175049, + "num_tokens": 681380863.0, + "step": 17855 + }, + { + "epoch": 2.2714667345121486, + "ewc_loss": 0.04770456254482269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3852280719438568e-05, + "grad_norm": 30.361543655395508, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8769365549087524, + "num_tokens": 681421970.0, + "step": 17856 + }, + { + "epoch": 2.271593944790739, + "ewc_loss": 0.047693293541669846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3846647309255786e-05, + "grad_norm": 30.332443237304688, + "learning_rate": 1e-06, + "loss": 0.3642, + "mean_token_accuracy": 0.8860065937042236, + "num_tokens": 681453511.0, + "step": 17857 + }, + { + "epoch": 2.2717211550693297, + "ewc_loss": 0.04772346839308739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3861734007368796e-05, + "grad_norm": 30.453012466430664, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8601905107498169, + "num_tokens": 681496293.0, + "step": 17858 + }, + { + "epoch": 2.2718483653479202, + "ewc_loss": 0.04763802886009216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.381901504122652e-05, + "grad_norm": 30.312644958496094, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8644803166389465, + "num_tokens": 681537074.0, + "step": 17859 + }, + { + "epoch": 2.2719755756265108, + "ewc_loss": 0.047644540667533875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3822271032258868e-05, + "grad_norm": 30.474363327026367, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8651608824729919, + "num_tokens": 681578911.0, + "step": 17860 + }, + { + "epoch": 2.2721027859051013, + "ewc_loss": 0.04771370068192482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3856850020820275e-05, + "grad_norm": 30.356096267700195, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8680629730224609, + "num_tokens": 681622134.0, + "step": 17861 + }, + { + "epoch": 
2.272229996183692, + "ewc_loss": 0.04756587743759155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3782939024385996e-05, + "grad_norm": 30.376201629638672, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8733107447624207, + "num_tokens": 681655919.0, + "step": 17862 + }, + { + "epoch": 2.2723572064622823, + "ewc_loss": 0.047684576362371445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.384228901064489e-05, + "grad_norm": 30.39630699157715, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8635904788970947, + "num_tokens": 681696539.0, + "step": 17863 + }, + { + "epoch": 2.2724844167408724, + "ewc_loss": 0.04764298349618912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382149250479415e-05, + "grad_norm": 30.464998245239258, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8693888187408447, + "num_tokens": 681734630.0, + "step": 17864 + }, + { + "epoch": 2.2726116270194634, + "ewc_loss": 0.04767794534564018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3838972992962226e-05, + "grad_norm": 30.22744369506836, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8606635332107544, + "num_tokens": 681779669.0, + "step": 17865 + }, + { + "epoch": 2.2727388372980535, + "ewc_loss": 0.04762794077396393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.381397098361049e-05, + "grad_norm": 30.479949951171875, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8692857027053833, + "num_tokens": 681821920.0, + "step": 17866 + }, + { + "epoch": 2.272866047576644, + "ewc_loss": 0.04777809605002403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.388904795225244e-05, + "grad_norm": 30.397052764892578, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.869908332824707, + "num_tokens": 681859239.0, + "step": 17867 + }, + { + "epoch": 2.2729932578552345, + "ewc_loss": 0.047564342617988586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.37821714108577e-05, + "grad_norm": 30.40424919128418, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8592804670333862, + "num_tokens": 681896569.0, + "step": 17868 + }, + { + "epoch": 2.273120468133825, + "ewc_loss": 0.047629594802856445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3814796804799698e-05, + "grad_norm": 30.36505699157715, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.886268675327301, + "num_tokens": 681932549.0, + "step": 17869 + }, + { + "epoch": 2.2732476784124156, + "ewc_loss": 0.04771893844008446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.385946936556138e-05, + "grad_norm": 30.414691925048828, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8659772872924805, + "num_tokens": 681973981.0, + "step": 17870 + }, + { + "epoch": 2.273374888691006, + "ewc_loss": 0.04770380258560181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3851900550653227e-05, + "grad_norm": 30.437219619750977, + "learning_rate": 1e-06, + "loss": 0.5074, + "mean_token_accuracy": 0.8483972549438477, + "num_tokens": 682009512.0, + "step": 17871 + }, + { + "epoch": 2.2735020989695967, + "ewc_loss": 0.04764999821782112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3824999516364187e-05, + "grad_norm": 30.405778884887695, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8691200613975525, + "num_tokens": 682048058.0, + "step": 17872 + }, + { + "epoch": 2.273629309248187, + "ewc_loss": 0.04771299287676811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3856497136875987e-05, + "grad_norm": 30.459564208984375, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8646768927574158, + "num_tokens": 682084181.0, + "step": 17873 + }, + { + "epoch": 2.2737565195267777, + "ewc_loss": 0.0475984625518322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3799231712473556e-05, + "grad_norm": 30.454547882080078, + "learning_rate": 1e-06, + "loss": 0.3612, + 
"mean_token_accuracy": 0.8916825652122498, + "num_tokens": 682119613.0, + "step": 17874 + }, + { + "epoch": 2.2738837298053682, + "ewc_loss": 0.04766201227903366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3831005819374695e-05, + "grad_norm": 30.321800231933594, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8751513361930847, + "num_tokens": 682159501.0, + "step": 17875 + }, + { + "epoch": 2.2740109400839588, + "ewc_loss": 0.04763636738061905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.38181837630691e-05, + "grad_norm": 30.442649841308594, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8804911971092224, + "num_tokens": 682197323.0, + "step": 17876 + }, + { + "epoch": 2.2741381503625493, + "ewc_loss": 0.04774437099695206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.387218592048157e-05, + "grad_norm": 30.444561004638672, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8730212450027466, + "num_tokens": 682231150.0, + "step": 17877 + }, + { + "epoch": 2.27426536064114, + "ewc_loss": 0.04765801131725311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3829004931030795e-05, + "grad_norm": 30.431562423706055, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8728800415992737, + "num_tokens": 682269351.0, + "step": 17878 + }, + { + "epoch": 2.2743925709197303, + "ewc_loss": 0.047712888568639755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3856444386183284e-05, + "grad_norm": 30.429847717285156, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.882806122303009, + "num_tokens": 682303633.0, + "step": 17879 + }, + { + "epoch": 2.274519781198321, + "ewc_loss": 0.04756828770041466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3784143195371144e-05, + "grad_norm": 30.449737548828125, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8700030446052551, + "num_tokens": 682340890.0, + "step": 17880 + }, + { + "epoch": 
2.2746469914769114, + "ewc_loss": 0.047672200947999954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3836100808694027e-05, + "grad_norm": 30.27199935913086, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8868722319602966, + "num_tokens": 682374253.0, + "step": 17881 + }, + { + "epoch": 2.274774201755502, + "ewc_loss": 0.04772346094250679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.386173036938999e-05, + "grad_norm": 30.437576293945312, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8717197775840759, + "num_tokens": 682412675.0, + "step": 17882 + }, + { + "epoch": 2.2749014120340925, + "ewc_loss": 0.04779849201440811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3899245206848718e-05, + "grad_norm": 30.418298721313477, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8822641372680664, + "num_tokens": 682447864.0, + "step": 17883 + }, + { + "epoch": 2.275028622312683, + "ewc_loss": 0.04764905199408531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3824526579119265e-05, + "grad_norm": 30.41284942626953, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.872201681137085, + "num_tokens": 682486445.0, + "step": 17884 + }, + { + "epoch": 2.2751558325912735, + "ewc_loss": 0.04775504395365715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3877522835391574e-05, + "grad_norm": 30.414737701416016, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8702996373176575, + "num_tokens": 682518173.0, + "step": 17885 + }, + { + "epoch": 2.275283042869864, + "ewc_loss": 0.04767075553536415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.383537866990082e-05, + "grad_norm": 30.245256423950195, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8708375096321106, + "num_tokens": 682558627.0, + "step": 17886 + }, + { + "epoch": 2.2754102531484546, + "ewc_loss": 0.04774310439825058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3871552912169136e-05, + "grad_norm": 30.468143463134766, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8765657544136047, + "num_tokens": 682600821.0, + "step": 17887 + }, + { + "epoch": 2.275537463427045, + "ewc_loss": 0.04768472537398338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3842363589210436e-05, + "grad_norm": 30.279949188232422, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.859641432762146, + "num_tokens": 682643947.0, + "step": 17888 + }, + { + "epoch": 2.275664673705635, + "ewc_loss": 0.047754235565662384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3877117200754583e-05, + "grad_norm": 30.433137893676758, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8660004734992981, + "num_tokens": 682683045.0, + "step": 17889 + }, + { + "epoch": 2.275791883984226, + "ewc_loss": 0.04783990606665611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3919952582218684e-05, + "grad_norm": 30.514524459838867, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8724742531776428, + "num_tokens": 682721617.0, + "step": 17890 + }, + { + "epoch": 2.2759190942628162, + "ewc_loss": 0.04773290455341339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3866452465881594e-05, + "grad_norm": 30.302736282348633, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8641451597213745, + "num_tokens": 682758339.0, + "step": 17891 + }, + { + "epoch": 2.2760463045414068, + "ewc_loss": 0.04769487306475639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3847436750656925e-05, + "grad_norm": 30.429080963134766, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8766863346099854, + "num_tokens": 682800501.0, + "step": 17892 + }, + { + "epoch": 2.2761735148199973, + "ewc_loss": 0.04775986075401306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.387993117736187e-05, + "grad_norm": 30.45561981201172, + "learning_rate": 1e-06, + "loss": 0.4089, + 
"mean_token_accuracy": 0.8730827569961548, + "num_tokens": 682833231.0, + "step": 17893 + }, + { + "epoch": 2.276300725098588, + "ewc_loss": 0.0477057620882988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3852880985941738e-05, + "grad_norm": 30.406160354614258, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8765014410018921, + "num_tokens": 682873945.0, + "step": 17894 + }, + { + "epoch": 2.2764279353771784, + "ewc_loss": 0.047575198113918304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.378759927523788e-05, + "grad_norm": 30.339750289916992, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8708648681640625, + "num_tokens": 682918329.0, + "step": 17895 + }, + { + "epoch": 2.276555145655769, + "ewc_loss": 0.0478026457130909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3901322492747568e-05, + "grad_norm": 30.429597854614258, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.877740204334259, + "num_tokens": 682956277.0, + "step": 17896 + }, + { + "epoch": 2.2766823559343594, + "ewc_loss": 0.04763786122202873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3818931367713958e-05, + "grad_norm": 30.33586311340332, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8744601607322693, + "num_tokens": 682990648.0, + "step": 17897 + }, + { + "epoch": 2.27680956621295, + "ewc_loss": 0.04775717854499817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.387858876318205e-05, + "grad_norm": 30.338891983032227, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8857603073120117, + "num_tokens": 683029046.0, + "step": 17898 + }, + { + "epoch": 2.2769367764915405, + "ewc_loss": 0.04770950600504875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3854752726037987e-05, + "grad_norm": 30.50922203063965, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8721938133239746, + "num_tokens": 683064541.0, + "step": 17899 + }, + { + "epoch": 
2.277063986770131, + "ewc_loss": 0.04771263152360916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3856315237935632e-05, + "grad_norm": 30.228229522705078, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8713910579681396, + "num_tokens": 683105015.0, + "step": 17900 + }, + { + "epoch": 2.2771911970487215, + "ewc_loss": 0.04762467369437218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3812337531126104e-05, + "grad_norm": 30.446449279785156, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8657469749450684, + "num_tokens": 683142938.0, + "step": 17901 + }, + { + "epoch": 2.277318407327312, + "ewc_loss": 0.04776483029127121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3882415916887112e-05, + "grad_norm": 30.321884155273438, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8853186368942261, + "num_tokens": 683181593.0, + "step": 17902 + }, + { + "epoch": 2.2774456176059026, + "ewc_loss": 0.0476205013692379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3810251150280237e-05, + "grad_norm": 30.403038024902344, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.861441969871521, + "num_tokens": 683218641.0, + "step": 17903 + }, + { + "epoch": 2.277572827884493, + "ewc_loss": 0.04784717038273811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3923585104057565e-05, + "grad_norm": 30.490251541137695, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8554561734199524, + "num_tokens": 683261479.0, + "step": 17904 + }, + { + "epoch": 2.2777000381630836, + "ewc_loss": 0.0476730652153492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3836531909182668e-05, + "grad_norm": 30.480220794677734, + "learning_rate": 1e-06, + "loss": 0.4911, + "mean_token_accuracy": 0.846339762210846, + "num_tokens": 683300602.0, + "step": 17905 + }, + { + "epoch": 2.277827248441674, + "ewc_loss": 0.0476640984416008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.383204991929233e-05, + "grad_norm": 30.49205207824707, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8708797693252563, + "num_tokens": 683337012.0, + "step": 17906 + }, + { + "epoch": 2.2779544587202647, + "ewc_loss": 0.04765089228749275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.382544698775746e-05, + "grad_norm": 30.36248779296875, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8789511919021606, + "num_tokens": 683368949.0, + "step": 17907 + }, + { + "epoch": 2.2780816689988552, + "ewc_loss": 0.04768252745270729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.384126310062129e-05, + "grad_norm": 30.551477432250977, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8691716194152832, + "num_tokens": 683408741.0, + "step": 17908 + }, + { + "epoch": 2.2782088792774458, + "ewc_loss": 0.04775764048099518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3878819774836302e-05, + "grad_norm": 30.378450393676758, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8614425659179688, + "num_tokens": 683447843.0, + "step": 17909 + }, + { + "epoch": 2.2783360895560363, + "ewc_loss": 0.047656986862421036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.38284937950084e-05, + "grad_norm": 30.45182991027832, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8687598705291748, + "num_tokens": 683489695.0, + "step": 17910 + }, + { + "epoch": 2.278463299834627, + "ewc_loss": 0.047743573784828186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3871787561802194e-05, + "grad_norm": 30.458009719848633, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8694708943367004, + "num_tokens": 683528381.0, + "step": 17911 + }, + { + "epoch": 2.2785905101132173, + "ewc_loss": 0.047580718994140625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.379035868216306e-05, + "grad_norm": 30.272371292114258, + "learning_rate": 1e-06, + "loss": 0.4093, + 
"mean_token_accuracy": 0.8737415671348572, + "num_tokens": 683567261.0, + "step": 17912 + }, + { + "epoch": 2.278717720391808, + "ewc_loss": 0.04764832183718681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3824160962249152e-05, + "grad_norm": 30.441911697387695, + "learning_rate": 1e-06, + "loss": 0.4696, + "mean_token_accuracy": 0.8575679063796997, + "num_tokens": 683601083.0, + "step": 17913 + }, + { + "epoch": 2.278844930670398, + "ewc_loss": 0.04770774021744728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3853870516177267e-05, + "grad_norm": 30.40669822692871, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8652104735374451, + "num_tokens": 683643494.0, + "step": 17914 + }, + { + "epoch": 2.278972140948989, + "ewc_loss": 0.04766775667667389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3833878003642894e-05, + "grad_norm": 30.407718658447266, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8676384687423706, + "num_tokens": 683684267.0, + "step": 17915 + }, + { + "epoch": 2.279099351227579, + "ewc_loss": 0.04764675348997116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3823376977816224e-05, + "grad_norm": 30.33709716796875, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8598684072494507, + "num_tokens": 683723292.0, + "step": 17916 + }, + { + "epoch": 2.2792265615061695, + "ewc_loss": 0.04767174646258354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3835873435018584e-05, + "grad_norm": 30.423418045043945, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.882353663444519, + "num_tokens": 683761378.0, + "step": 17917 + }, + { + "epoch": 2.27935377178476, + "ewc_loss": 0.0477229468524456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.386147389188409e-05, + "grad_norm": 30.391185760498047, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8639888763427734, + "num_tokens": 683794838.0, + "step": 17918 + }, + { + "epoch": 
2.2794809820633506, + "ewc_loss": 0.04770422726869583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.385211337241344e-05, + "grad_norm": 30.237279891967773, + "learning_rate": 1e-06, + "loss": 0.4801, + "mean_token_accuracy": 0.8527984023094177, + "num_tokens": 683832307.0, + "step": 17919 + }, + { + "epoch": 2.279608192341941, + "ewc_loss": 0.047736383974552155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3868191419751383e-05, + "grad_norm": 30.383886337280273, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.865877628326416, + "num_tokens": 683871848.0, + "step": 17920 + }, + { + "epoch": 2.2797354026205316, + "ewc_loss": 0.04784422740340233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3922113541630097e-05, + "grad_norm": 30.23971939086914, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8743129968643188, + "num_tokens": 683915086.0, + "step": 17921 + }, + { + "epoch": 2.279862612899122, + "ewc_loss": 0.04779816418886185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.38990814978024e-05, + "grad_norm": 30.32940673828125, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8649863004684448, + "num_tokens": 683945411.0, + "step": 17922 + }, + { + "epoch": 2.2799898231777127, + "ewc_loss": 0.047803882509469986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3901940949144773e-05, + "grad_norm": 30.32989501953125, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8717255592346191, + "num_tokens": 683979664.0, + "step": 17923 + }, + { + "epoch": 2.2801170334563032, + "ewc_loss": 0.04789063706994057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394531838945113e-05, + "grad_norm": 30.413347244262695, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8612481355667114, + "num_tokens": 684025526.0, + "step": 17924 + }, + { + "epoch": 2.2802442437348938, + "ewc_loss": 0.04789053276181221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.394526563875843e-05, + "grad_norm": 30.342880249023438, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8648273348808289, + "num_tokens": 684056992.0, + "step": 17925 + }, + { + "epoch": 2.2803714540134843, + "ewc_loss": 0.047820791602134705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3910395611892454e-05, + "grad_norm": 30.371503829956055, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8753960728645325, + "num_tokens": 684094515.0, + "step": 17926 + }, + { + "epoch": 2.280498664292075, + "ewc_loss": 0.04786975309252739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3934877390274778e-05, + "grad_norm": 30.344932556152344, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8789910674095154, + "num_tokens": 684132294.0, + "step": 17927 + }, + { + "epoch": 2.2806258745706653, + "ewc_loss": 0.047787655144929886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.389382825640496e-05, + "grad_norm": 30.278104782104492, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8751418590545654, + "num_tokens": 684164010.0, + "step": 17928 + }, + { + "epoch": 2.280753084849256, + "ewc_loss": 0.04784189164638519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392094575043302e-05, + "grad_norm": 30.3741455078125, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8783653974533081, + "num_tokens": 684202077.0, + "step": 17929 + }, + { + "epoch": 2.2808802951278464, + "ewc_loss": 0.0477793850004673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3889691874501295e-05, + "grad_norm": 30.158052444458008, + "learning_rate": 1e-06, + "loss": 0.4866, + "mean_token_accuracy": 0.8446376323699951, + "num_tokens": 684240799.0, + "step": 17930 + }, + { + "epoch": 2.281007505406437, + "ewc_loss": 0.047922294586896896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3961147235240787e-05, + "grad_norm": 30.388080596923828, + "learning_rate": 1e-06, + "loss": 0.4318, + 
"mean_token_accuracy": 0.8686674237251282, + "num_tokens": 684284080.0, + "step": 17931 + }, + { + "epoch": 2.2811347156850275, + "ewc_loss": 0.04789977893233299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3949889509822242e-05, + "grad_norm": 30.331424713134766, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8542072772979736, + "num_tokens": 684317915.0, + "step": 17932 + }, + { + "epoch": 2.281261925963618, + "ewc_loss": 0.047904159873723984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395207957306411e-05, + "grad_norm": 30.4622745513916, + "learning_rate": 1e-06, + "loss": 0.3035, + "mean_token_accuracy": 0.9078607559204102, + "num_tokens": 684354665.0, + "step": 17933 + }, + { + "epoch": 2.2813891362422085, + "ewc_loss": 0.04795319586992264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3976597731234506e-05, + "grad_norm": 30.431598663330078, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8697148561477661, + "num_tokens": 684392866.0, + "step": 17934 + }, + { + "epoch": 2.281516346520799, + "ewc_loss": 0.04783393815159798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391696943959687e-05, + "grad_norm": 30.4121036529541, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8739708065986633, + "num_tokens": 684428422.0, + "step": 17935 + }, + { + "epoch": 2.2816435567993896, + "ewc_loss": 0.04788845032453537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39442251768196e-05, + "grad_norm": 30.38208770751953, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8581725358963013, + "num_tokens": 684467129.0, + "step": 17936 + }, + { + "epoch": 2.2817707670779797, + "ewc_loss": 0.04780923202633858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.390461668255739e-05, + "grad_norm": 30.35981559753418, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8643765449523926, + "num_tokens": 684500093.0, + "step": 17937 + }, + { + "epoch": 
2.2818979773565706, + "ewc_loss": 0.047819215804338455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.390960798948072e-05, + "grad_norm": 30.336750030517578, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8622084856033325, + "num_tokens": 684541422.0, + "step": 17938 + }, + { + "epoch": 2.2820251876351607, + "ewc_loss": 0.04783587530255318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3917937141959555e-05, + "grad_norm": 30.4278507232666, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8672364950180054, + "num_tokens": 684579164.0, + "step": 17939 + }, + { + "epoch": 2.2821523979137512, + "ewc_loss": 0.04784351587295532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3921758838696405e-05, + "grad_norm": 30.30548667907715, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8548765182495117, + "num_tokens": 684618617.0, + "step": 17940 + }, + { + "epoch": 2.2822796081923418, + "ewc_loss": 0.047863561660051346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3931781470309943e-05, + "grad_norm": 30.423255920410156, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8569775819778442, + "num_tokens": 684655525.0, + "step": 17941 + }, + { + "epoch": 2.2824068184709323, + "ewc_loss": 0.04788780212402344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394390139670577e-05, + "grad_norm": 30.260217666625977, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.870392918586731, + "num_tokens": 684694525.0, + "step": 17942 + }, + { + "epoch": 2.282534028749523, + "ewc_loss": 0.04785246029496193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392622991465032e-05, + "grad_norm": 30.487680435180664, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8712856769561768, + "num_tokens": 684733247.0, + "step": 17943 + }, + { + "epoch": 2.2826612390281134, + "ewc_loss": 0.047963399440050125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3981699996511452e-05, + "grad_norm": 30.332332611083984, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8835059404373169, + "num_tokens": 684763642.0, + "step": 17944 + }, + { + "epoch": 2.282788449306704, + "ewc_loss": 0.047836851328611374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391842645010911e-05, + "grad_norm": 30.516252517700195, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8770644664764404, + "num_tokens": 684806164.0, + "step": 17945 + }, + { + "epoch": 2.2829156595852944, + "ewc_loss": 0.0479888953268528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39944474742515e-05, + "grad_norm": 30.40930938720703, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8696960806846619, + "num_tokens": 684844193.0, + "step": 17946 + }, + { + "epoch": 2.283042869863885, + "ewc_loss": 0.047771211713552475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.388560642430093e-05, + "grad_norm": 30.467487335205078, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8754358291625977, + "num_tokens": 684882103.0, + "step": 17947 + }, + { + "epoch": 2.2831700801424755, + "ewc_loss": 0.04790591076016426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3952954506967217e-05, + "grad_norm": 30.32073211669922, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8848011493682861, + "num_tokens": 684917667.0, + "step": 17948 + }, + { + "epoch": 2.283297290421066, + "ewc_loss": 0.04772711545228958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3863558453740552e-05, + "grad_norm": 30.53099250793457, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8753679990768433, + "num_tokens": 684954007.0, + "step": 17949 + }, + { + "epoch": 2.2834245006996565, + "ewc_loss": 0.04797361046075821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3986805899767205e-05, + "grad_norm": 30.414228439331055, + "learning_rate": 1e-06, + "loss": 0.4251, + 
"mean_token_accuracy": 0.8662221431732178, + "num_tokens": 684994693.0, + "step": 17950 + }, + { + "epoch": 2.283551710978247, + "ewc_loss": 0.0477297268807888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3864862669142894e-05, + "grad_norm": 30.48891830444336, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8578968048095703, + "num_tokens": 685032713.0, + "step": 17951 + }, + { + "epoch": 2.2836789212568376, + "ewc_loss": 0.04778613895177841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.389306973782368e-05, + "grad_norm": 30.411914825439453, + "learning_rate": 1e-06, + "loss": 0.4433, + "mean_token_accuracy": 0.8647606372833252, + "num_tokens": 685066597.0, + "step": 17952 + }, + { + "epoch": 2.283806131535428, + "ewc_loss": 0.0477408766746521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3870437871664762e-05, + "grad_norm": 30.29887580871582, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8654829263687134, + "num_tokens": 685106243.0, + "step": 17953 + }, + { + "epoch": 2.2839333418140186, + "ewc_loss": 0.047754596918821335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3877299099694937e-05, + "grad_norm": 30.434955596923828, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8615870475769043, + "num_tokens": 685145177.0, + "step": 17954 + }, + { + "epoch": 2.284060552092609, + "ewc_loss": 0.04788259044289589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3941294784890488e-05, + "grad_norm": 30.409168243408203, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8766405582427979, + "num_tokens": 685189130.0, + "step": 17955 + }, + { + "epoch": 2.2841877623711997, + "ewc_loss": 0.0477076917886734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3853845050325617e-05, + "grad_norm": 30.37175178527832, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8757323622703552, + "num_tokens": 685227536.0, + "step": 17956 + }, + { + "epoch": 
2.28431497264979, + "ewc_loss": 0.047856565564870834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3928283553686924e-05, + "grad_norm": 30.453433990478516, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8753069639205933, + "num_tokens": 685267213.0, + "step": 17957 + }, + { + "epoch": 2.2844421829283807, + "ewc_loss": 0.047817278653383255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.390863846812863e-05, + "grad_norm": 30.35051727294922, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8683358430862427, + "num_tokens": 685301115.0, + "step": 17958 + }, + { + "epoch": 2.2845693932069713, + "ewc_loss": 0.04786191135644913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3930955649120733e-05, + "grad_norm": 30.54325294494629, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8765808939933777, + "num_tokens": 685338497.0, + "step": 17959 + }, + { + "epoch": 2.284696603485562, + "ewc_loss": 0.047908712178468704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395435694779735e-05, + "grad_norm": 30.452056884765625, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8594077825546265, + "num_tokens": 685374983.0, + "step": 17960 + }, + { + "epoch": 2.2848238137641523, + "ewc_loss": 0.04781453683972359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.390726876910776e-05, + "grad_norm": 30.504562377929688, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.876907229423523, + "num_tokens": 685408859.0, + "step": 17961 + }, + { + "epoch": 2.2849510240427424, + "ewc_loss": 0.04782114177942276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3910570234875195e-05, + "grad_norm": 30.54364776611328, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8672167062759399, + "num_tokens": 685445103.0, + "step": 17962 + }, + { + "epoch": 2.2850782343213334, + "ewc_loss": 0.04773101955652237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3865510229370557e-05, + "grad_norm": 30.269018173217773, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8660260438919067, + "num_tokens": 685484657.0, + "step": 17963 + }, + { + "epoch": 2.2852054445999235, + "ewc_loss": 0.04779953882098198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.389976907579694e-05, + "grad_norm": 30.498157501220703, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8788743615150452, + "num_tokens": 685524034.0, + "step": 17964 + }, + { + "epoch": 2.285332654878514, + "ewc_loss": 0.04783467948436737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3917340513435192e-05, + "grad_norm": 30.371816635131836, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8764734268188477, + "num_tokens": 685560368.0, + "step": 17965 + }, + { + "epoch": 2.2854598651571045, + "ewc_loss": 0.04776734113693237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.388367101957556e-05, + "grad_norm": 30.40984535217285, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8666050434112549, + "num_tokens": 685598192.0, + "step": 17966 + }, + { + "epoch": 2.285587075435695, + "ewc_loss": 0.04776964336633682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3884822439868003e-05, + "grad_norm": 30.32067108154297, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8730455040931702, + "num_tokens": 685636300.0, + "step": 17967 + }, + { + "epoch": 2.2857142857142856, + "ewc_loss": 0.047784481197595596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3892240278655663e-05, + "grad_norm": 30.52069664001465, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8685305118560791, + "num_tokens": 685671457.0, + "step": 17968 + }, + { + "epoch": 2.285841495992876, + "ewc_loss": 0.04780006781220436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3900034648249857e-05, + "grad_norm": 30.493053436279297, + "learning_rate": 1e-06, + "loss": 0.4511, + 
"mean_token_accuracy": 0.8597046732902527, + "num_tokens": 685709687.0, + "step": 17969 + }, + { + "epoch": 2.2859687062714666, + "ewc_loss": 0.04777322709560394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3886614144430496e-05, + "grad_norm": 30.44434928894043, + "learning_rate": 1e-06, + "loss": 0.4924, + "mean_token_accuracy": 0.8487098217010498, + "num_tokens": 685747854.0, + "step": 17970 + }, + { + "epoch": 2.286095916550057, + "ewc_loss": 0.04767590016126633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3837950720917434e-05, + "grad_norm": 30.260066986083984, + "learning_rate": 1e-06, + "loss": 0.363, + "mean_token_accuracy": 0.8898834586143494, + "num_tokens": 685785215.0, + "step": 17971 + }, + { + "epoch": 2.2862231268286477, + "ewc_loss": 0.04774762690067291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3873813915997744e-05, + "grad_norm": 30.369829177856445, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8751338720321655, + "num_tokens": 685818871.0, + "step": 17972 + }, + { + "epoch": 2.2863503371072382, + "ewc_loss": 0.047800637781620026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.390031841059681e-05, + "grad_norm": 30.45827865600586, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8635568022727966, + "num_tokens": 685859089.0, + "step": 17973 + }, + { + "epoch": 2.2864775473858288, + "ewc_loss": 0.04769305884838104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3846529074944556e-05, + "grad_norm": 30.291128158569336, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8625026345252991, + "num_tokens": 685900346.0, + "step": 17974 + }, + { + "epoch": 2.2866047576644193, + "ewc_loss": 0.047793637961149216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3896818674984388e-05, + "grad_norm": 30.378677368164062, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8697702884674072, + "num_tokens": 685938186.0, + "step": 17975 + }, + { + "epoch": 
2.28673196794301, + "ewc_loss": 0.047862645238637924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.393132308498025e-05, + "grad_norm": 30.376686096191406, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8771905899047852, + "num_tokens": 685974911.0, + "step": 17976 + }, + { + "epoch": 2.2868591782216003, + "ewc_loss": 0.047760266810655594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3880133085185662e-05, + "grad_norm": 30.436817169189453, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8731536865234375, + "num_tokens": 686010430.0, + "step": 17977 + }, + { + "epoch": 2.286986388500191, + "ewc_loss": 0.0478796549141407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3939826860441826e-05, + "grad_norm": 30.423171997070312, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8836138844490051, + "num_tokens": 686042727.0, + "step": 17978 + }, + { + "epoch": 2.2871135987787814, + "ewc_loss": 0.04783165082335472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391582529526204e-05, + "grad_norm": 30.462940216064453, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8729703426361084, + "num_tokens": 686078819.0, + "step": 17979 + }, + { + "epoch": 2.287240809057372, + "ewc_loss": 0.04787573963403702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.393786962784361e-05, + "grad_norm": 30.42128562927246, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8727560043334961, + "num_tokens": 686117760.0, + "step": 17980 + }, + { + "epoch": 2.2873680193359625, + "ewc_loss": 0.04783586785197258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391793350398075e-05, + "grad_norm": 30.408205032348633, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.861538290977478, + "num_tokens": 686152992.0, + "step": 17981 + }, + { + "epoch": 2.287495229614553, + "ewc_loss": 0.047782883048057556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3891441742307507e-05, + "grad_norm": 30.43891716003418, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8652170300483704, + "num_tokens": 686195353.0, + "step": 17982 + }, + { + "epoch": 2.2876224398931435, + "ewc_loss": 0.047848425805568695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392421265540179e-05, + "grad_norm": 30.490758895874023, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.865941047668457, + "num_tokens": 686232990.0, + "step": 17983 + }, + { + "epoch": 2.287749650171734, + "ewc_loss": 0.0477733388543129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3886668714112602e-05, + "grad_norm": 30.303403854370117, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8755567669868469, + "num_tokens": 686265933.0, + "step": 17984 + }, + { + "epoch": 2.2878768604503246, + "ewc_loss": 0.047773707658052444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3886854251031764e-05, + "grad_norm": 30.49889373779297, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8770571947097778, + "num_tokens": 686307111.0, + "step": 17985 + }, + { + "epoch": 2.288004070728915, + "ewc_loss": 0.04783964157104492, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391982161498163e-05, + "grad_norm": 30.321691513061523, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8727680444717407, + "num_tokens": 686343363.0, + "step": 17986 + }, + { + "epoch": 2.288131281007505, + "ewc_loss": 0.04774622619152069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.387311360507738e-05, + "grad_norm": 30.301544189453125, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8694309592247009, + "num_tokens": 686380888.0, + "step": 17987 + }, + { + "epoch": 2.288258491286096, + "ewc_loss": 0.047885503619909286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3942751795402728e-05, + "grad_norm": 30.403060913085938, + "learning_rate": 1e-06, + "loss": 0.4503, + 
"mean_token_accuracy": 0.8615292310714722, + "num_tokens": 686417350.0, + "step": 17988 + }, + { + "epoch": 2.2883857015646862, + "ewc_loss": 0.04789358004927635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39467899518786e-05, + "grad_norm": 30.314367294311523, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8785864114761353, + "num_tokens": 686458658.0, + "step": 17989 + }, + { + "epoch": 2.2885129118432768, + "ewc_loss": 0.04794355481863022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397177740931511e-05, + "grad_norm": 30.505407333374023, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8741835355758667, + "num_tokens": 686493945.0, + "step": 17990 + }, + { + "epoch": 2.2886401221218673, + "ewc_loss": 0.047907423228025436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3953711206559092e-05, + "grad_norm": 30.31229019165039, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8702242374420166, + "num_tokens": 686527608.0, + "step": 17991 + }, + { + "epoch": 2.288767332400458, + "ewc_loss": 0.047842830419540405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3921415049699135e-05, + "grad_norm": 30.370040893554688, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8805993795394897, + "num_tokens": 686562272.0, + "step": 17992 + }, + { + "epoch": 2.2888945426790483, + "ewc_loss": 0.04796342924237251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398171454842668e-05, + "grad_norm": 30.419130325317383, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8818492293357849, + "num_tokens": 686599937.0, + "step": 17993 + }, + { + "epoch": 2.289021752957639, + "ewc_loss": 0.047894179821014404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3947090085130185e-05, + "grad_norm": 30.469772338867188, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8720446825027466, + "num_tokens": 686632103.0, + "step": 17994 + }, + { + "epoch": 
2.2891489632362294, + "ewc_loss": 0.047906406223773956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3953203708515503e-05, + "grad_norm": 30.323965072631836, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8766388893127441, + "num_tokens": 686662802.0, + "step": 17995 + }, + { + "epoch": 2.28927617351482, + "ewc_loss": 0.047873251140117645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3936625439091586e-05, + "grad_norm": 30.40021324157715, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8918047547340393, + "num_tokens": 686699386.0, + "step": 17996 + }, + { + "epoch": 2.2894033837934105, + "ewc_loss": 0.047962624579668045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3981312551768497e-05, + "grad_norm": 30.38112449645996, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8739992380142212, + "num_tokens": 686738369.0, + "step": 17997 + }, + { + "epoch": 2.289530594072001, + "ewc_loss": 0.04792872816324234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3964363208506256e-05, + "grad_norm": 30.43976593017578, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.863622784614563, + "num_tokens": 686774543.0, + "step": 17998 + }, + { + "epoch": 2.2896578043505915, + "ewc_loss": 0.0478491336107254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392456735833548e-05, + "grad_norm": 30.31096839904785, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8637951612472534, + "num_tokens": 686807048.0, + "step": 17999 + }, + { + "epoch": 2.289785014629182, + "ewc_loss": 0.0478239580988884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3911979951662943e-05, + "grad_norm": 30.329261779785156, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8720052242279053, + "num_tokens": 686844678.0, + "step": 18000 + }, + { + "epoch": 2.2899122249077726, + "ewc_loss": 0.0478358268737793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.391791349509731e-05, + "grad_norm": 30.266340255737305, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8716698884963989, + "num_tokens": 686877721.0, + "step": 18001 + }, + { + "epoch": 2.290039435186363, + "ewc_loss": 0.047992415726184845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3996208255994134e-05, + "grad_norm": 30.426544189453125, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8632831573486328, + "num_tokens": 686918329.0, + "step": 18002 + }, + { + "epoch": 2.2901666454649536, + "ewc_loss": 0.04796943441033363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3984717699931934e-05, + "grad_norm": 30.337017059326172, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.866522490978241, + "num_tokens": 686960050.0, + "step": 18003 + }, + { + "epoch": 2.290293855743544, + "ewc_loss": 0.047906290739774704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395314550085459e-05, + "grad_norm": 30.33602523803711, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8811377286911011, + "num_tokens": 686997849.0, + "step": 18004 + }, + { + "epoch": 2.2904210660221347, + "ewc_loss": 0.04798946529626846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3994733055587858e-05, + "grad_norm": 30.42943572998047, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8742209672927856, + "num_tokens": 687032363.0, + "step": 18005 + }, + { + "epoch": 2.290548276300725, + "ewc_loss": 0.04798531532287598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3992657588678412e-05, + "grad_norm": 30.47015380859375, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8551163673400879, + "num_tokens": 687069606.0, + "step": 18006 + }, + { + "epoch": 2.2906754865793157, + "ewc_loss": 0.04800126329064369, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4000632038223557e-05, + "grad_norm": 30.513151168823242, + "learning_rate": 1e-06, + "loss": 0.4528, + 
"mean_token_accuracy": 0.8615277409553528, + "num_tokens": 687112857.0, + "step": 18007 + }, + { + "epoch": 2.2908026968579063, + "ewc_loss": 0.0479266457259655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3963322746567428e-05, + "grad_norm": 30.444028854370117, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8691325187683105, + "num_tokens": 687152245.0, + "step": 18008 + }, + { + "epoch": 2.290929907136497, + "ewc_loss": 0.047915372997522354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395768569840584e-05, + "grad_norm": 30.441726684570312, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8701575994491577, + "num_tokens": 687193988.0, + "step": 18009 + }, + { + "epoch": 2.2910571174150873, + "ewc_loss": 0.04788343608379364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3941718609421514e-05, + "grad_norm": 30.45929527282715, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8658266663551331, + "num_tokens": 687232375.0, + "step": 18010 + }, + { + "epoch": 2.291184327693678, + "ewc_loss": 0.047964923083782196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3982462153071538e-05, + "grad_norm": 30.431692123413086, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8749232888221741, + "num_tokens": 687274566.0, + "step": 18011 + }, + { + "epoch": 2.291311537972268, + "ewc_loss": 0.04786025360226631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3930126189952716e-05, + "grad_norm": 30.345172882080078, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8701010346412659, + "num_tokens": 687313231.0, + "step": 18012 + }, + { + "epoch": 2.291438748250859, + "ewc_loss": 0.04792194440960884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3960972612258047e-05, + "grad_norm": 30.446836471557617, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8758610486984253, + "num_tokens": 687354502.0, + "step": 18013 + }, + { + "epoch": 
2.291565958529449, + "ewc_loss": 0.04792296141386032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3961480110301636e-05, + "grad_norm": 30.35817527770996, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8674996495246887, + "num_tokens": 687399160.0, + "step": 18014 + }, + { + "epoch": 2.2916931688080395, + "ewc_loss": 0.04787628725171089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3938144295243546e-05, + "grad_norm": 30.340593338012695, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8614991903305054, + "num_tokens": 687438775.0, + "step": 18015 + }, + { + "epoch": 2.29182037908663, + "ewc_loss": 0.0478440523147583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3922026230138727e-05, + "grad_norm": 30.367950439453125, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.875100314617157, + "num_tokens": 687471435.0, + "step": 18016 + }, + { + "epoch": 2.2919475893652206, + "ewc_loss": 0.047887761145830154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394388138782233e-05, + "grad_norm": 30.26400375366211, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8713588714599609, + "num_tokens": 687508657.0, + "step": 18017 + }, + { + "epoch": 2.292074799643811, + "ewc_loss": 0.047912757843732834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3956379664014094e-05, + "grad_norm": 30.461339950561523, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.880079448223114, + "num_tokens": 687545024.0, + "step": 18018 + }, + { + "epoch": 2.2922020099224016, + "ewc_loss": 0.04794875904917717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3974380383151583e-05, + "grad_norm": 30.371091842651367, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8745615482330322, + "num_tokens": 687579432.0, + "step": 18019 + }, + { + "epoch": 2.292329220200992, + "ewc_loss": 0.04783721640706062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3918608349049464e-05, + "grad_norm": 30.321102142333984, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.86033695936203, + "num_tokens": 687618378.0, + "step": 18020 + }, + { + "epoch": 2.2924564304795827, + "ewc_loss": 0.047924257814884186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3962129489518702e-05, + "grad_norm": 30.264848709106445, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8688563108444214, + "num_tokens": 687657119.0, + "step": 18021 + }, + { + "epoch": 2.2925836407581732, + "ewc_loss": 0.047864142805337906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3932070689625107e-05, + "grad_norm": 30.363418579101562, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.860956609249115, + "num_tokens": 687691924.0, + "step": 18022 + }, + { + "epoch": 2.2927108510367638, + "ewc_loss": 0.04793974384665489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3969871108420193e-05, + "grad_norm": 30.518375396728516, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8471044898033142, + "num_tokens": 687724221.0, + "step": 18023 + }, + { + "epoch": 2.2928380613153543, + "ewc_loss": 0.047898925840854645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394946204731241e-05, + "grad_norm": 30.387617111206055, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8726248741149902, + "num_tokens": 687765049.0, + "step": 18024 + }, + { + "epoch": 2.292965271593945, + "ewc_loss": 0.0478583462536335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3929173039505258e-05, + "grad_norm": 30.48233413696289, + "learning_rate": 1e-06, + "loss": 0.4592, + "mean_token_accuracy": 0.8606804609298706, + "num_tokens": 687810222.0, + "step": 18025 + }, + { + "epoch": 2.2930924818725353, + "ewc_loss": 0.04792188107967377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3960939870448783e-05, + "grad_norm": 30.414907455444336, + "learning_rate": 1e-06, + "loss": 0.4295, + 
"mean_token_accuracy": 0.8659349679946899, + "num_tokens": 687851961.0, + "step": 18026 + }, + { + "epoch": 2.293219692151126, + "ewc_loss": 0.0478433221578598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3921660613268614e-05, + "grad_norm": 30.50760269165039, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8587520122528076, + "num_tokens": 687890448.0, + "step": 18027 + }, + { + "epoch": 2.2933469024297164, + "ewc_loss": 0.04785668104887009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3928339942358434e-05, + "grad_norm": 30.509925842285156, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8788692951202393, + "num_tokens": 687929509.0, + "step": 18028 + }, + { + "epoch": 2.293474112708307, + "ewc_loss": 0.04785574972629547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3927874281071126e-05, + "grad_norm": 30.3944149017334, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8608982563018799, + "num_tokens": 687966814.0, + "step": 18029 + }, + { + "epoch": 2.2936013229868975, + "ewc_loss": 0.04777417704463005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.388708890066482e-05, + "grad_norm": 30.46964454650879, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8781532049179077, + "num_tokens": 688006398.0, + "step": 18030 + }, + { + "epoch": 2.293728533265488, + "ewc_loss": 0.047905728220939636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395286355749704e-05, + "grad_norm": 30.454792022705078, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8680517673492432, + "num_tokens": 688044604.0, + "step": 18031 + }, + { + "epoch": 2.2938557435440785, + "ewc_loss": 0.04784984141588211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3924920242279768e-05, + "grad_norm": 30.36087989807129, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8663833737373352, + "num_tokens": 688082969.0, + "step": 18032 + }, + { + "epoch": 
2.293982953822669, + "ewc_loss": 0.04788915067911148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394457442278508e-05, + "grad_norm": 30.441898345947266, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8769363164901733, + "num_tokens": 688121442.0, + "step": 18033 + }, + { + "epoch": 2.2941101641012596, + "ewc_loss": 0.04785516858100891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3927585061755963e-05, + "grad_norm": 30.3847599029541, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8712268471717834, + "num_tokens": 688161096.0, + "step": 18034 + }, + { + "epoch": 2.2942373743798496, + "ewc_loss": 0.04785637930035591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392818896623794e-05, + "grad_norm": 30.383182525634766, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8707586526870728, + "num_tokens": 688200782.0, + "step": 18035 + }, + { + "epoch": 2.2943645846584406, + "ewc_loss": 0.0478057935833931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3902895918581635e-05, + "grad_norm": 30.47635269165039, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8777889013290405, + "num_tokens": 688241333.0, + "step": 18036 + }, + { + "epoch": 2.2944917949370307, + "ewc_loss": 0.04783244803547859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3916223653941415e-05, + "grad_norm": 30.388399124145508, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8684878945350647, + "num_tokens": 688278297.0, + "step": 18037 + }, + { + "epoch": 2.2946190052156212, + "ewc_loss": 0.04781297594308853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3906488422653638e-05, + "grad_norm": 30.42017936706543, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8470829725265503, + "num_tokens": 688318835.0, + "step": 18038 + }, + { + "epoch": 2.2947462154942118, + "ewc_loss": 0.04787323996424675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3936619982123375e-05, + "grad_norm": 30.266347885131836, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8732640147209167, + "num_tokens": 688354814.0, + "step": 18039 + }, + { + "epoch": 2.2948734257728023, + "ewc_loss": 0.04785480722784996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3927403162815608e-05, + "grad_norm": 30.48596954345703, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8725970983505249, + "num_tokens": 688390008.0, + "step": 18040 + }, + { + "epoch": 2.295000636051393, + "ewc_loss": 0.047875769436359406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.393788417975884e-05, + "grad_norm": 30.31206512451172, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8723194599151611, + "num_tokens": 688430938.0, + "step": 18041 + }, + { + "epoch": 2.2951278463299833, + "ewc_loss": 0.04778394103050232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.389197106822394e-05, + "grad_norm": 30.403207778930664, + "learning_rate": 1e-06, + "loss": 0.539, + "mean_token_accuracy": 0.8400084972381592, + "num_tokens": 688466730.0, + "step": 18042 + }, + { + "epoch": 2.295255056608574, + "ewc_loss": 0.04798705130815506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3993525246623904e-05, + "grad_norm": 30.519638061523438, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8693324327468872, + "num_tokens": 688506666.0, + "step": 18043 + }, + { + "epoch": 2.2953822668871644, + "ewc_loss": 0.047905657440423965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3952828996698372e-05, + "grad_norm": 30.38410186767578, + "learning_rate": 1e-06, + "loss": 0.478, + "mean_token_accuracy": 0.8557683229446411, + "num_tokens": 688547726.0, + "step": 18044 + }, + { + "epoch": 2.295509477165755, + "ewc_loss": 0.04785510152578354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3927550500957295e-05, + "grad_norm": 30.57143211364746, + "learning_rate": 1e-06, + "loss": 0.4344, + 
"mean_token_accuracy": 0.866528332233429, + "num_tokens": 688587487.0, + "step": 18045 + }, + { + "epoch": 2.2956366874443455, + "ewc_loss": 0.048023320734500885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4011660570977256e-05, + "grad_norm": 30.52759552001953, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8642003536224365, + "num_tokens": 688632454.0, + "step": 18046 + }, + { + "epoch": 2.295763897722936, + "ewc_loss": 0.047797515988349915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3898757717688568e-05, + "grad_norm": 30.445878982543945, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8773053884506226, + "num_tokens": 688668489.0, + "step": 18047 + }, + { + "epoch": 2.2958911080015265, + "ewc_loss": 0.047907132655382156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3953565687406808e-05, + "grad_norm": 30.462265014648438, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8735314607620239, + "num_tokens": 688707899.0, + "step": 18048 + }, + { + "epoch": 2.296018318280117, + "ewc_loss": 0.04781500622630119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3907503418740816e-05, + "grad_norm": 30.460102081298828, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8754258751869202, + "num_tokens": 688747171.0, + "step": 18049 + }, + { + "epoch": 2.2961455285587076, + "ewc_loss": 0.04789179190993309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394589682808146e-05, + "grad_norm": 30.466663360595703, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8677868247032166, + "num_tokens": 688789553.0, + "step": 18050 + }, + { + "epoch": 2.296272738837298, + "ewc_loss": 0.04783697798848152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391848829574883e-05, + "grad_norm": 30.372312545776367, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8747684955596924, + "num_tokens": 688827632.0, + "step": 18051 + }, + { + "epoch": 
2.2963999491158886, + "ewc_loss": 0.04790172353386879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395086266915314e-05, + "grad_norm": 30.449609756469727, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8783708214759827, + "num_tokens": 688861505.0, + "step": 18052 + }, + { + "epoch": 2.296527159394479, + "ewc_loss": 0.0478719025850296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.393595059402287e-05, + "grad_norm": 30.46185874938965, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8643380999565125, + "num_tokens": 688898270.0, + "step": 18053 + }, + { + "epoch": 2.2966543696730697, + "ewc_loss": 0.047848280519247055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3924139895825647e-05, + "grad_norm": 30.466209411621094, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8713411092758179, + "num_tokens": 688937420.0, + "step": 18054 + }, + { + "epoch": 2.29678157995166, + "ewc_loss": 0.047780029475688934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3890013835625723e-05, + "grad_norm": 30.41274642944336, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8764752745628357, + "num_tokens": 688972426.0, + "step": 18055 + }, + { + "epoch": 2.2969087902302507, + "ewc_loss": 0.04785384237766266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392692113062367e-05, + "grad_norm": 30.529460906982422, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8698717951774597, + "num_tokens": 689005801.0, + "step": 18056 + }, + { + "epoch": 2.2970360005088413, + "ewc_loss": 0.047855574637651443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3927786969579756e-05, + "grad_norm": 30.347347259521484, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8657740950584412, + "num_tokens": 689039999.0, + "step": 18057 + }, + { + "epoch": 2.297163210787432, + "ewc_loss": 0.04775865748524666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3879329091869295e-05, + "grad_norm": 30.427989959716797, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8552597165107727, + "num_tokens": 689079404.0, + "step": 18058 + }, + { + "epoch": 2.2972904210660223, + "ewc_loss": 0.04793437942862511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3967189918039367e-05, + "grad_norm": 30.40244483947754, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8484731316566467, + "num_tokens": 689116197.0, + "step": 18059 + }, + { + "epoch": 2.2974176313446124, + "ewc_loss": 0.047821734100580215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3910866730147973e-05, + "grad_norm": 30.408424377441406, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8709323406219482, + "num_tokens": 689153446.0, + "step": 18060 + }, + { + "epoch": 2.2975448416232034, + "ewc_loss": 0.047917064279317856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395853152847849e-05, + "grad_norm": 30.560237884521484, + "learning_rate": 1e-06, + "loss": 0.4724, + "mean_token_accuracy": 0.8538314700126648, + "num_tokens": 689185310.0, + "step": 18061 + }, + { + "epoch": 2.2976720519017935, + "ewc_loss": 0.0478467121720314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392335591139272e-05, + "grad_norm": 30.42777442932129, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.881140947341919, + "num_tokens": 689224150.0, + "step": 18062 + }, + { + "epoch": 2.297799262180384, + "ewc_loss": 0.04788438603281975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394219336565584e-05, + "grad_norm": 30.456924438476562, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8598219156265259, + "num_tokens": 689261206.0, + "step": 18063 + }, + { + "epoch": 2.2979264724589745, + "ewc_loss": 0.047978926450014114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3989463443285786e-05, + "grad_norm": 30.693634033203125, + "learning_rate": 1e-06, + "loss": 0.4048, + 
"mean_token_accuracy": 0.8760265111923218, + "num_tokens": 689299129.0, + "step": 18064 + }, + { + "epoch": 2.298053682737565, + "ewc_loss": 0.0478304922580719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391524685663171e-05, + "grad_norm": 30.379182815551758, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8853916525840759, + "num_tokens": 689330554.0, + "step": 18065 + }, + { + "epoch": 2.2981808930161556, + "ewc_loss": 0.04786337539553642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.393168688286096e-05, + "grad_norm": 30.566781997680664, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8766483068466187, + "num_tokens": 689364084.0, + "step": 18066 + }, + { + "epoch": 2.298308103294746, + "ewc_loss": 0.04787364974617958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3936825527925976e-05, + "grad_norm": 30.288026809692383, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8589454889297485, + "num_tokens": 689405254.0, + "step": 18067 + }, + { + "epoch": 2.2984353135733366, + "ewc_loss": 0.047886837273836136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394341936451383e-05, + "grad_norm": 30.556161880493164, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8615157604217529, + "num_tokens": 689441109.0, + "step": 18068 + }, + { + "epoch": 2.298562523851927, + "ewc_loss": 0.048024147748947144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.401207348157186e-05, + "grad_norm": 30.462221145629883, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.870940089225769, + "num_tokens": 689474937.0, + "step": 18069 + }, + { + "epoch": 2.2986897341305177, + "ewc_loss": 0.04788187891244888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3940940081956796e-05, + "grad_norm": 30.417871475219727, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8657733798027039, + "num_tokens": 689515971.0, + "step": 18070 + }, + { + "epoch": 
2.298816944409108, + "ewc_loss": 0.047943443059921265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39717210206436e-05, + "grad_norm": 30.366580963134766, + "learning_rate": 1e-06, + "loss": 0.4892, + "mean_token_accuracy": 0.8478777408599854, + "num_tokens": 689553620.0, + "step": 18071 + }, + { + "epoch": 2.2989441546876987, + "ewc_loss": 0.04799625277519226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3998127289814875e-05, + "grad_norm": 30.45122528076172, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8735339641571045, + "num_tokens": 689592157.0, + "step": 18072 + }, + { + "epoch": 2.2990713649662893, + "ewc_loss": 0.048041585832834244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.402079371677246e-05, + "grad_norm": 30.4773006439209, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8830105066299438, + "num_tokens": 689626889.0, + "step": 18073 + }, + { + "epoch": 2.29919857524488, + "ewc_loss": 0.047966305166482925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3983153369044885e-05, + "grad_norm": 30.50043296813965, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8818491697311401, + "num_tokens": 689666161.0, + "step": 18074 + }, + { + "epoch": 2.2993257855234703, + "ewc_loss": 0.04799005016684532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399502591288183e-05, + "grad_norm": 30.387042999267578, + "learning_rate": 1e-06, + "loss": 0.4487, + "mean_token_accuracy": 0.8622828722000122, + "num_tokens": 689713713.0, + "step": 18075 + }, + { + "epoch": 2.299452995802061, + "ewc_loss": 0.04787374660372734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3936872821650468e-05, + "grad_norm": 30.545244216918945, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8624331951141357, + "num_tokens": 689755146.0, + "step": 18076 + }, + { + "epoch": 2.2995802060806514, + "ewc_loss": 0.048042844980955124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4021423087106086e-05, + "grad_norm": 30.434097290039062, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8791080117225647, + "num_tokens": 689795925.0, + "step": 18077 + }, + { + "epoch": 2.299707416359242, + "ewc_loss": 0.0479867085814476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399335426161997e-05, + "grad_norm": 30.517536163330078, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8814419507980347, + "num_tokens": 689830673.0, + "step": 18078 + }, + { + "epoch": 2.2998346266378324, + "ewc_loss": 0.04800918698310852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.400459379714448e-05, + "grad_norm": 30.568161010742188, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8701051473617554, + "num_tokens": 689863649.0, + "step": 18079 + }, + { + "epoch": 2.299961836916423, + "ewc_loss": 0.04804101586341858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4020508135436103e-05, + "grad_norm": 30.400314331054688, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8683083057403564, + "num_tokens": 689901946.0, + "step": 18080 + }, + { + "epoch": 2.3000890471950135, + "ewc_loss": 0.04786175861954689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3930879251565784e-05, + "grad_norm": 30.528703689575195, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8637229204177856, + "num_tokens": 689939968.0, + "step": 18081 + }, + { + "epoch": 2.300216257473604, + "ewc_loss": 0.04793865606188774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3969327230588533e-05, + "grad_norm": 30.329370498657227, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8788207173347473, + "num_tokens": 689979792.0, + "step": 18082 + }, + { + "epoch": 2.3003434677521946, + "ewc_loss": 0.047989409416913986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39947039517574e-05, + "grad_norm": 30.562816619873047, + "learning_rate": 1e-06, + "loss": 0.3781, + 
"mean_token_accuracy": 0.8839699029922485, + "num_tokens": 690013974.0, + "step": 18083 + }, + { + "epoch": 2.300470678030785, + "ewc_loss": 0.048003681004047394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.400183984718751e-05, + "grad_norm": 30.423690795898438, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8783859014511108, + "num_tokens": 690056540.0, + "step": 18084 + }, + { + "epoch": 2.300597888309375, + "ewc_loss": 0.047828469425439835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391423549852334e-05, + "grad_norm": 30.471235275268555, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8640351295471191, + "num_tokens": 690101032.0, + "step": 18085 + }, + { + "epoch": 2.300725098587966, + "ewc_loss": 0.04798563942313194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3992819478735328e-05, + "grad_norm": 30.460689544677734, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8825706839561462, + "num_tokens": 690135417.0, + "step": 18086 + }, + { + "epoch": 2.3008523088665562, + "ewc_loss": 0.047840844839811325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39204218814848e-05, + "grad_norm": 30.566713333129883, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8710647821426392, + "num_tokens": 690178552.0, + "step": 18087 + }, + { + "epoch": 2.3009795191451468, + "ewc_loss": 0.0479406900703907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397034586465452e-05, + "grad_norm": 30.51189422607422, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8670617938041687, + "num_tokens": 690221928.0, + "step": 18088 + }, + { + "epoch": 2.3011067294237373, + "ewc_loss": 0.04779438674449921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.389719338680152e-05, + "grad_norm": 30.533721923828125, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8760988116264343, + "num_tokens": 690257040.0, + "step": 18089 + }, + { + "epoch": 
2.301233939702328, + "ewc_loss": 0.04792401194572449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.396200579823926e-05, + "grad_norm": 30.525238037109375, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8703639507293701, + "num_tokens": 690302720.0, + "step": 18090 + }, + { + "epoch": 2.3013611499809183, + "ewc_loss": 0.04777407646179199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3887037968961522e-05, + "grad_norm": 30.379064559936523, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8642781972885132, + "num_tokens": 690337042.0, + "step": 18091 + }, + { + "epoch": 2.301488360259509, + "ewc_loss": 0.04781990870833397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3909953597467393e-05, + "grad_norm": 30.34477996826172, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8761781454086304, + "num_tokens": 690380247.0, + "step": 18092 + }, + { + "epoch": 2.3016155705380994, + "ewc_loss": 0.04793718829751015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3968594177858904e-05, + "grad_norm": 30.552799224853516, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8744089603424072, + "num_tokens": 690425361.0, + "step": 18093 + }, + { + "epoch": 2.30174278081669, + "ewc_loss": 0.047904353588819504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.39521759795025e-05, + "grad_norm": 30.396942138671875, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8645799160003662, + "num_tokens": 690465137.0, + "step": 18094 + }, + { + "epoch": 2.3018699910952805, + "ewc_loss": 0.04774908348917961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3874541511759162e-05, + "grad_norm": 30.449861526489258, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8687357306480408, + "num_tokens": 690501437.0, + "step": 18095 + }, + { + "epoch": 2.301997201373871, + "ewc_loss": 0.04790743440389633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3953716663527302e-05, + "grad_norm": 30.369232177734375, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8657073974609375, + "num_tokens": 690544134.0, + "step": 18096 + }, + { + "epoch": 2.3021244116524615, + "ewc_loss": 0.047796156257390976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3898077415651642e-05, + "grad_norm": 30.53971290588379, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8798471689224243, + "num_tokens": 690581062.0, + "step": 18097 + }, + { + "epoch": 2.302251621931052, + "ewc_loss": 0.04791593179106712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3957965822773986e-05, + "grad_norm": 30.423816680908203, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8757931590080261, + "num_tokens": 690615145.0, + "step": 18098 + }, + { + "epoch": 2.3023788322096426, + "ewc_loss": 0.047718919813632965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3859460270614363e-05, + "grad_norm": 30.44233512878418, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8770496249198914, + "num_tokens": 690655483.0, + "step": 18099 + }, + { + "epoch": 2.302506042488233, + "ewc_loss": 0.04783857241272926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3919286832096986e-05, + "grad_norm": 30.565622329711914, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8673037886619568, + "num_tokens": 690694133.0, + "step": 18100 + }, + { + "epoch": 2.3026332527668236, + "ewc_loss": 0.04785745590925217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392872738710139e-05, + "grad_norm": 30.479812622070312, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.848797082901001, + "num_tokens": 690729939.0, + "step": 18101 + }, + { + "epoch": 2.302760463045414, + "ewc_loss": 0.04789707809686661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394853981968481e-05, + "grad_norm": 30.516088485717773, + "learning_rate": 1e-06, + "loss": 0.4055, + 
"mean_token_accuracy": 0.8759899735450745, + "num_tokens": 690770139.0, + "step": 18102 + }, + { + "epoch": 2.3028876733240047, + "ewc_loss": 0.0477481447160244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3874072212493047e-05, + "grad_norm": 30.395496368408203, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8725610971450806, + "num_tokens": 690811379.0, + "step": 18103 + }, + { + "epoch": 2.303014883602595, + "ewc_loss": 0.04789084568619728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3945422071847133e-05, + "grad_norm": 30.541927337646484, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8663538694381714, + "num_tokens": 690844194.0, + "step": 18104 + }, + { + "epoch": 2.3031420938811857, + "ewc_loss": 0.04783434793353081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3917173166410066e-05, + "grad_norm": 30.549453735351562, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8637714385986328, + "num_tokens": 690882129.0, + "step": 18105 + }, + { + "epoch": 2.3032693041597763, + "ewc_loss": 0.04782888665795326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3914442863315344e-05, + "grad_norm": 30.38641929626465, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8840370178222656, + "num_tokens": 690915568.0, + "step": 18106 + }, + { + "epoch": 2.303396514438367, + "ewc_loss": 0.04779257997870445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3896289349067956e-05, + "grad_norm": 30.52681541442871, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8677290678024292, + "num_tokens": 690953412.0, + "step": 18107 + }, + { + "epoch": 2.3035237247169573, + "ewc_loss": 0.04792749509215355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3963748390087858e-05, + "grad_norm": 30.456220626831055, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8673375248908997, + "num_tokens": 690990047.0, + "step": 18108 + }, + { + "epoch": 
2.303650934995548, + "ewc_loss": 0.04778579622507095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3892898752819747e-05, + "grad_norm": 30.467357635498047, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8758357167243958, + "num_tokens": 691031010.0, + "step": 18109 + }, + { + "epoch": 2.303778145274138, + "ewc_loss": 0.04789814352989197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3949070964590646e-05, + "grad_norm": 30.36885643005371, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.871617317199707, + "num_tokens": 691065830.0, + "step": 18110 + }, + { + "epoch": 2.303905355552729, + "ewc_loss": 0.04790731146931648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3953656636876985e-05, + "grad_norm": 30.56374740600586, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.881782054901123, + "num_tokens": 691101418.0, + "step": 18111 + }, + { + "epoch": 2.304032565831319, + "ewc_loss": 0.047948163002729416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3974082068889402e-05, + "grad_norm": 30.380464553833008, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8560796976089478, + "num_tokens": 691139447.0, + "step": 18112 + }, + { + "epoch": 2.3041597761099095, + "ewc_loss": 0.04792731627821922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.396365744061768e-05, + "grad_norm": 30.642520904541016, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8506954908370972, + "num_tokens": 691168809.0, + "step": 18113 + }, + { + "epoch": 2.3042869863885, + "ewc_loss": 0.04796086251735687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398043216089718e-05, + "grad_norm": 30.504497528076172, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8764783143997192, + "num_tokens": 691200420.0, + "step": 18114 + }, + { + "epoch": 2.3044141966670906, + "ewc_loss": 0.04786674305796623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3933371267048642e-05, + "grad_norm": 30.538291931152344, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8659022450447083, + "num_tokens": 691238587.0, + "step": 18115 + }, + { + "epoch": 2.304541406945681, + "ewc_loss": 0.04795738682150841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397869320702739e-05, + "grad_norm": 30.503080368041992, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8680777549743652, + "num_tokens": 691277137.0, + "step": 18116 + }, + { + "epoch": 2.3046686172242716, + "ewc_loss": 0.04790044203400612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3950220565893687e-05, + "grad_norm": 30.449962615966797, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8686508536338806, + "num_tokens": 691319564.0, + "step": 18117 + }, + { + "epoch": 2.304795827502862, + "ewc_loss": 0.047918375581502914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395918818365317e-05, + "grad_norm": 30.640783309936523, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8696544170379639, + "num_tokens": 691359652.0, + "step": 18118 + }, + { + "epoch": 2.3049230377814527, + "ewc_loss": 0.047937359660863876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.396867967036087e-05, + "grad_norm": 30.419668197631836, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8708470463752747, + "num_tokens": 691399705.0, + "step": 18119 + }, + { + "epoch": 2.305050248060043, + "ewc_loss": 0.04794112592935562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397056232439354e-05, + "grad_norm": 30.662185668945312, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8634476065635681, + "num_tokens": 691439447.0, + "step": 18120 + }, + { + "epoch": 2.3051774583386337, + "ewc_loss": 0.0479818731546402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3990936824702658e-05, + "grad_norm": 30.537532806396484, + "learning_rate": 1e-06, + "loss": 0.4677, + 
"mean_token_accuracy": 0.8583823442459106, + "num_tokens": 691477453.0, + "step": 18121 + }, + { + "epoch": 2.3053046686172243, + "ewc_loss": 0.047886498272418976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3943248379509896e-05, + "grad_norm": 30.510469436645508, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8701991438865662, + "num_tokens": 691517766.0, + "step": 18122 + }, + { + "epoch": 2.305431878895815, + "ewc_loss": 0.047951839864254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3975919248186983e-05, + "grad_norm": 30.627225875854492, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8695089221000671, + "num_tokens": 691553956.0, + "step": 18123 + }, + { + "epoch": 2.3055590891744053, + "ewc_loss": 0.04794778674840927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3973892893991433e-05, + "grad_norm": 30.544292449951172, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8688440322875977, + "num_tokens": 691592522.0, + "step": 18124 + }, + { + "epoch": 2.305686299452996, + "ewc_loss": 0.047914933413267136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3957467419677414e-05, + "grad_norm": 30.556480407714844, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.869505763053894, + "num_tokens": 691629202.0, + "step": 18125 + }, + { + "epoch": 2.3058135097315864, + "ewc_loss": 0.047902341932058334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3951170078362338e-05, + "grad_norm": 30.551118850708008, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8728495240211487, + "num_tokens": 691665324.0, + "step": 18126 + }, + { + "epoch": 2.305940720010177, + "ewc_loss": 0.047859031707048416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3929515009513125e-05, + "grad_norm": 30.41514778137207, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.880294919013977, + "num_tokens": 691704223.0, + "step": 18127 + }, + { + "epoch": 
2.3060679302887674, + "ewc_loss": 0.047869972884655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.393498652963899e-05, + "grad_norm": 30.592403411865234, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8665035963058472, + "num_tokens": 691745061.0, + "step": 18128 + }, + { + "epoch": 2.306195140567358, + "ewc_loss": 0.04788116365671158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3940581741044298e-05, + "grad_norm": 30.3989315032959, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8810172080993652, + "num_tokens": 691783630.0, + "step": 18129 + }, + { + "epoch": 2.3063223508459485, + "ewc_loss": 0.04787362366914749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3936810976010747e-05, + "grad_norm": 30.671066284179688, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8690622448921204, + "num_tokens": 691818283.0, + "step": 18130 + }, + { + "epoch": 2.306449561124539, + "ewc_loss": 0.04795520007610321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397759999439586e-05, + "grad_norm": 30.38897132873535, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8609497547149658, + "num_tokens": 691854585.0, + "step": 18131 + }, + { + "epoch": 2.3065767714031296, + "ewc_loss": 0.04775306209921837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.387653148616664e-05, + "grad_norm": 30.588375091552734, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8527753353118896, + "num_tokens": 691891784.0, + "step": 18132 + }, + { + "epoch": 2.3067039816817196, + "ewc_loss": 0.047889143228530884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3944570784806274e-05, + "grad_norm": 30.389650344848633, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.871940553188324, + "num_tokens": 691927389.0, + "step": 18133 + }, + { + "epoch": 2.3068311919603106, + "ewc_loss": 0.04779893159866333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3899465304566547e-05, + "grad_norm": 30.5748233795166, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8678814768791199, + "num_tokens": 691962928.0, + "step": 18134 + }, + { + "epoch": 2.3069584022389007, + "ewc_loss": 0.0478937067091465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3946853616507724e-05, + "grad_norm": 30.395368576049805, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8693593740463257, + "num_tokens": 692007064.0, + "step": 18135 + }, + { + "epoch": 2.3070856125174912, + "ewc_loss": 0.04785798862576485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392899477854371e-05, + "grad_norm": 30.613445281982422, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8738181591033936, + "num_tokens": 692040668.0, + "step": 18136 + }, + { + "epoch": 2.3072128227960818, + "ewc_loss": 0.047950293868780136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3975146177690476e-05, + "grad_norm": 30.457021713256836, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8707860112190247, + "num_tokens": 692077710.0, + "step": 18137 + }, + { + "epoch": 2.3073400330746723, + "ewc_loss": 0.04795689135789871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397844582446851e-05, + "grad_norm": 30.656333923339844, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8770879507064819, + "num_tokens": 692112316.0, + "step": 18138 + }, + { + "epoch": 2.307467243353263, + "ewc_loss": 0.04792426899075508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3962134946486913e-05, + "grad_norm": 30.42044448852539, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8753035068511963, + "num_tokens": 692153201.0, + "step": 18139 + }, + { + "epoch": 2.3075944536318533, + "ewc_loss": 0.04783835634589195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3919177692732774e-05, + "grad_norm": 30.477216720581055, + "learning_rate": 1e-06, + "loss": 0.4416, + 
"mean_token_accuracy": 0.8647355437278748, + "num_tokens": 692192942.0, + "step": 18140 + }, + { + "epoch": 2.307721663910444, + "ewc_loss": 0.047949183732271194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3974591385922395e-05, + "grad_norm": 30.471160888671875, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8851226568222046, + "num_tokens": 692235515.0, + "step": 18141 + }, + { + "epoch": 2.3078488741890344, + "ewc_loss": 0.04788581654429436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3942908228491433e-05, + "grad_norm": 30.439624786376953, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8611925840377808, + "num_tokens": 692271338.0, + "step": 18142 + }, + { + "epoch": 2.307976084467625, + "ewc_loss": 0.047890804708004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3945402062963694e-05, + "grad_norm": 30.45182991027832, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8513320088386536, + "num_tokens": 692311469.0, + "step": 18143 + }, + { + "epoch": 2.3081032947462155, + "ewc_loss": 0.04795948788523674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397974458290264e-05, + "grad_norm": 30.604421615600586, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8523688316345215, + "num_tokens": 692352052.0, + "step": 18144 + }, + { + "epoch": 2.308230505024806, + "ewc_loss": 0.04793405905365944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3967029846971855e-05, + "grad_norm": 30.54817008972168, + "learning_rate": 1e-06, + "loss": 0.4711, + "mean_token_accuracy": 0.8543682098388672, + "num_tokens": 692382978.0, + "step": 18145 + }, + { + "epoch": 2.3083577153033965, + "ewc_loss": 0.04793504625558853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3967522793100215e-05, + "grad_norm": 30.473413467407227, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8727679252624512, + "num_tokens": 692415651.0, + "step": 18146 + }, + { + "epoch": 
2.308484925581987, + "ewc_loss": 0.04788536950945854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3942684492794797e-05, + "grad_norm": 30.538820266723633, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8523766994476318, + "num_tokens": 692455937.0, + "step": 18147 + }, + { + "epoch": 2.3086121358605776, + "ewc_loss": 0.047960538417100906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398026845185086e-05, + "grad_norm": 30.409151077270508, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8646824359893799, + "num_tokens": 692496277.0, + "step": 18148 + }, + { + "epoch": 2.308739346139168, + "ewc_loss": 0.047888364642858505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3944181521073915e-05, + "grad_norm": 30.50665855407715, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8705375790596008, + "num_tokens": 692530485.0, + "step": 18149 + }, + { + "epoch": 2.3088665564177586, + "ewc_loss": 0.048018645495176315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40093231695937e-05, + "grad_norm": 30.50028419494629, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8681104183197021, + "num_tokens": 692564794.0, + "step": 18150 + }, + { + "epoch": 2.308993766696349, + "ewc_loss": 0.04791734367609024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395867159066256e-05, + "grad_norm": 30.472583770751953, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8596369028091431, + "num_tokens": 692610359.0, + "step": 18151 + }, + { + "epoch": 2.3091209769749397, + "ewc_loss": 0.048002537339925766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4001268684514798e-05, + "grad_norm": 30.53687286376953, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8764075040817261, + "num_tokens": 692641560.0, + "step": 18152 + }, + { + "epoch": 2.30924818725353, + "ewc_loss": 0.0480363667011261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.401818346697837e-05, + "grad_norm": 30.495378494262695, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8648970127105713, + "num_tokens": 692680114.0, + "step": 18153 + }, + { + "epoch": 2.3093753975321207, + "ewc_loss": 0.047943364828825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3971682821866125e-05, + "grad_norm": 30.493764877319336, + "learning_rate": 1e-06, + "loss": 0.4726, + "mean_token_accuracy": 0.857094943523407, + "num_tokens": 692714199.0, + "step": 18154 + }, + { + "epoch": 2.3095026078107113, + "ewc_loss": 0.047973670065402985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3986835003597662e-05, + "grad_norm": 30.43780517578125, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8719318509101868, + "num_tokens": 692752331.0, + "step": 18155 + }, + { + "epoch": 2.309629818089302, + "ewc_loss": 0.04793575033545494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3967875677044503e-05, + "grad_norm": 30.4103946685791, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8752787113189697, + "num_tokens": 692789167.0, + "step": 18156 + }, + { + "epoch": 2.3097570283678923, + "ewc_loss": 0.04802670702338219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4013354050111957e-05, + "grad_norm": 30.435501098632812, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8693902492523193, + "num_tokens": 692829234.0, + "step": 18157 + }, + { + "epoch": 2.3098842386464824, + "ewc_loss": 0.04806326702237129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403163307462819e-05, + "grad_norm": 30.519020080566406, + "learning_rate": 1e-06, + "loss": 0.488, + "mean_token_accuracy": 0.8486512899398804, + "num_tokens": 692863749.0, + "step": 18158 + }, + { + "epoch": 2.3100114489250734, + "ewc_loss": 0.04802339896559715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.401169876975473e-05, + "grad_norm": 30.433500289916992, + "learning_rate": 1e-06, + "loss": 0.4425, + 
"mean_token_accuracy": 0.8657494783401489, + "num_tokens": 692900972.0, + "step": 18159 + }, + { + "epoch": 2.3101386592036635, + "ewc_loss": 0.048060230910778046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4030116037465632e-05, + "grad_norm": 30.643856048583984, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8640503883361816, + "num_tokens": 692938522.0, + "step": 18160 + }, + { + "epoch": 2.310265869482254, + "ewc_loss": 0.048156168311834335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407808460702654e-05, + "grad_norm": 30.599546432495117, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8788672685623169, + "num_tokens": 692973653.0, + "step": 18161 + }, + { + "epoch": 2.3103930797608445, + "ewc_loss": 0.047963835299015045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3981918275239877e-05, + "grad_norm": 30.53143310546875, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8628149032592773, + "num_tokens": 693016325.0, + "step": 18162 + }, + { + "epoch": 2.310520290039435, + "ewc_loss": 0.04799496382474899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3997481548576616e-05, + "grad_norm": 30.511394500732422, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8809312582015991, + "num_tokens": 693052022.0, + "step": 18163 + }, + { + "epoch": 2.3106475003180256, + "ewc_loss": 0.047985538840293884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399276854703203e-05, + "grad_norm": 30.504405975341797, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8743595480918884, + "num_tokens": 693093936.0, + "step": 18164 + }, + { + "epoch": 2.310774710596616, + "ewc_loss": 0.04811051860451698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4055259927990846e-05, + "grad_norm": 30.682228088378906, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.882572591304779, + "num_tokens": 693130160.0, + "step": 18165 + }, + { + "epoch": 
2.3109019208752066, + "ewc_loss": 0.04787816107273102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3939081074786372e-05, + "grad_norm": 30.5266056060791, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8719160556793213, + "num_tokens": 693172405.0, + "step": 18166 + }, + { + "epoch": 2.311029131153797, + "ewc_loss": 0.04794813320040703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3974067516974173e-05, + "grad_norm": 30.548036575317383, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8726102709770203, + "num_tokens": 693206499.0, + "step": 18167 + }, + { + "epoch": 2.3111563414323877, + "ewc_loss": 0.047903865575790405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3951932234922424e-05, + "grad_norm": 30.4206600189209, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8680906295776367, + "num_tokens": 693244287.0, + "step": 18168 + }, + { + "epoch": 2.311283551710978, + "ewc_loss": 0.04806426167488098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403213147772476e-05, + "grad_norm": 30.610326766967773, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8611947298049927, + "num_tokens": 693286164.0, + "step": 18169 + }, + { + "epoch": 2.3114107619895687, + "ewc_loss": 0.04801373928785324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4006869352888316e-05, + "grad_norm": 30.50950050354004, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8766725063323975, + "num_tokens": 693330961.0, + "step": 18170 + }, + { + "epoch": 2.3115379722681593, + "ewc_loss": 0.04796494543552399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398247306700796e-05, + "grad_norm": 30.536535263061523, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8742754459381104, + "num_tokens": 693375169.0, + "step": 18171 + }, + { + "epoch": 2.31166518254675, + "ewc_loss": 0.04798613116145134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3993065042304806e-05, + "grad_norm": 30.4584903717041, + "learning_rate": 1e-06, + "loss": 0.4639, + "mean_token_accuracy": 0.8640961050987244, + "num_tokens": 693414127.0, + "step": 18172 + }, + { + "epoch": 2.3117923928253403, + "ewc_loss": 0.0479678213596344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398391006863676e-05, + "grad_norm": 30.606372833251953, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8664513826370239, + "num_tokens": 693450446.0, + "step": 18173 + }, + { + "epoch": 2.311919603103931, + "ewc_loss": 0.047980643808841705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399032200628426e-05, + "grad_norm": 30.551593780517578, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8613730669021606, + "num_tokens": 693490973.0, + "step": 18174 + }, + { + "epoch": 2.3120468133825214, + "ewc_loss": 0.047892309725284576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3946155124576762e-05, + "grad_norm": 30.43990707397461, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8717468976974487, + "num_tokens": 693526815.0, + "step": 18175 + }, + { + "epoch": 2.312174023661112, + "ewc_loss": 0.04790777713060379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3953887648531236e-05, + "grad_norm": 30.51224136352539, + "learning_rate": 1e-06, + "loss": 0.4616, + "mean_token_accuracy": 0.8635317087173462, + "num_tokens": 693566949.0, + "step": 18176 + }, + { + "epoch": 2.3123012339397024, + "ewc_loss": 0.047977376729249954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3988688553799875e-05, + "grad_norm": 30.50128746032715, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8628897666931152, + "num_tokens": 693601116.0, + "step": 18177 + }, + { + "epoch": 2.312428444218293, + "ewc_loss": 0.047941043972969055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3970522306626663e-05, + "grad_norm": 30.448144912719727, + "learning_rate": 1e-06, + "loss": 0.3445, + 
"mean_token_accuracy": 0.8998934030532837, + "num_tokens": 693640588.0, + "step": 18178 + }, + { + "epoch": 2.3125556544968835, + "ewc_loss": 0.047993071377277374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3996535674086772e-05, + "grad_norm": 30.671836853027344, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8714206218719482, + "num_tokens": 693683352.0, + "step": 18179 + }, + { + "epoch": 2.312682864775474, + "ewc_loss": 0.04798377305269241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399188633717131e-05, + "grad_norm": 30.451276779174805, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8755301833152771, + "num_tokens": 693722579.0, + "step": 18180 + }, + { + "epoch": 2.3128100750540646, + "ewc_loss": 0.04800223186612129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40011158894049e-05, + "grad_norm": 30.6412410736084, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.870842695236206, + "num_tokens": 693766743.0, + "step": 18181 + }, + { + "epoch": 2.312937285332655, + "ewc_loss": 0.04797033965587616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3985170628293417e-05, + "grad_norm": 30.511415481567383, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8718095421791077, + "num_tokens": 693809699.0, + "step": 18182 + }, + { + "epoch": 2.313064495611245, + "ewc_loss": 0.047928061336278915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3964030333445407e-05, + "grad_norm": 30.618587493896484, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8761920928955078, + "num_tokens": 693845508.0, + "step": 18183 + }, + { + "epoch": 2.313191705889836, + "ewc_loss": 0.04790681228041649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3953405616339296e-05, + "grad_norm": 30.484424591064453, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8692679405212402, + "num_tokens": 693883763.0, + "step": 18184 + }, + { + "epoch": 
2.313318916168426, + "ewc_loss": 0.04780865088105202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3904325644252822e-05, + "grad_norm": 30.516273498535156, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8751606941223145, + "num_tokens": 693917093.0, + "step": 18185 + }, + { + "epoch": 2.3134461264470167, + "ewc_loss": 0.04789280518889427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3946402507135645e-05, + "grad_norm": 30.619537353515625, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.871125340461731, + "num_tokens": 693949219.0, + "step": 18186 + }, + { + "epoch": 2.3135733367256073, + "ewc_loss": 0.047934673726558685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3967337256181054e-05, + "grad_norm": 30.65205192565918, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8638345003128052, + "num_tokens": 693992755.0, + "step": 18187 + }, + { + "epoch": 2.313700547004198, + "ewc_loss": 0.04787759482860565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.393879731243942e-05, + "grad_norm": 30.628650665283203, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8548892140388489, + "num_tokens": 694030386.0, + "step": 18188 + }, + { + "epoch": 2.3138277572827883, + "ewc_loss": 0.04778282344341278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.389141263847705e-05, + "grad_norm": 30.489757537841797, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8784356713294983, + "num_tokens": 694065019.0, + "step": 18189 + }, + { + "epoch": 2.313954967561379, + "ewc_loss": 0.04791584983468056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3957925805007108e-05, + "grad_norm": 30.722164154052734, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8659564256668091, + "num_tokens": 694106645.0, + "step": 18190 + }, + { + "epoch": 2.3140821778399694, + "ewc_loss": 0.04778379201889038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3891896489658393e-05, + "grad_norm": 30.50410270690918, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8741379976272583, + "num_tokens": 694149146.0, + "step": 18191 + }, + { + "epoch": 2.31420938811856, + "ewc_loss": 0.04784687981009483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392343958490528e-05, + "grad_norm": 30.661088943481445, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.854101300239563, + "num_tokens": 694193567.0, + "step": 18192 + }, + { + "epoch": 2.3143365983971504, + "ewc_loss": 0.047943584620952606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3971791961230338e-05, + "grad_norm": 30.544248580932617, + "learning_rate": 1e-06, + "loss": 0.3854, + "mean_token_accuracy": 0.8812742233276367, + "num_tokens": 694232160.0, + "step": 18193 + }, + { + "epoch": 2.314463808675741, + "ewc_loss": 0.04785614833235741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3928074369905517e-05, + "grad_norm": 30.662368774414062, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8705560564994812, + "num_tokens": 694275153.0, + "step": 18194 + }, + { + "epoch": 2.3145910189543315, + "ewc_loss": 0.04791753739118576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395876799710095e-05, + "grad_norm": 30.681779861450195, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8756036162376404, + "num_tokens": 694313722.0, + "step": 18195 + }, + { + "epoch": 2.314718229232922, + "ewc_loss": 0.04780687391757965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3903436158434488e-05, + "grad_norm": 30.512575149536133, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8730698823928833, + "num_tokens": 694350544.0, + "step": 18196 + }, + { + "epoch": 2.3148454395115126, + "ewc_loss": 0.047835398465394974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.391769885434769e-05, + "grad_norm": 30.542091369628906, + "learning_rate": 1e-06, + "loss": 0.4131, + 
"mean_token_accuracy": 0.8718914985656738, + "num_tokens": 694386552.0, + "step": 18197 + }, + { + "epoch": 2.314972649790103, + "ewc_loss": 0.04790184646844864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3950922695803456e-05, + "grad_norm": 30.53927993774414, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8658773303031921, + "num_tokens": 694434368.0, + "step": 18198 + }, + { + "epoch": 2.3150998600686936, + "ewc_loss": 0.04796817526221275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3984088329598308e-05, + "grad_norm": 30.709047317504883, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8698090314865112, + "num_tokens": 694479856.0, + "step": 18199 + }, + { + "epoch": 2.315227070347284, + "ewc_loss": 0.04791620373725891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.395810224697925e-05, + "grad_norm": 30.561355590820312, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8702890872955322, + "num_tokens": 694521107.0, + "step": 18200 + }, + { + "epoch": 2.3153542806258747, + "ewc_loss": 0.047857511788606644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3928756490931846e-05, + "grad_norm": 30.446735382080078, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.870963454246521, + "num_tokens": 694560886.0, + "step": 18201 + }, + { + "epoch": 2.315481490904465, + "ewc_loss": 0.04793162643909454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3965812943060882e-05, + "grad_norm": 30.671865463256836, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8743137121200562, + "num_tokens": 694598404.0, + "step": 18202 + }, + { + "epoch": 2.3156087011830557, + "ewc_loss": 0.047933295369148254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.396664785919711e-05, + "grad_norm": 30.56331443786621, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.865359365940094, + "num_tokens": 694637162.0, + "step": 18203 + }, + { + "epoch": 
2.3157359114616463, + "ewc_loss": 0.04776144400238991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.388072243775241e-05, + "grad_norm": 30.53862762451172, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8731592893600464, + "num_tokens": 694677435.0, + "step": 18204 + }, + { + "epoch": 2.315863121740237, + "ewc_loss": 0.0478462316095829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.392311580479145e-05, + "grad_norm": 30.521221160888672, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.878926157951355, + "num_tokens": 694713980.0, + "step": 18205 + }, + { + "epoch": 2.3159903320188273, + "ewc_loss": 0.04789149388670921, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3945747670950368e-05, + "grad_norm": 30.623498916625977, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8651895523071289, + "num_tokens": 694753884.0, + "step": 18206 + }, + { + "epoch": 2.316117542297418, + "ewc_loss": 0.047819171100854874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3909586161607876e-05, + "grad_norm": 30.35586929321289, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8832715749740601, + "num_tokens": 694785739.0, + "step": 18207 + }, + { + "epoch": 2.316244752576008, + "ewc_loss": 0.047795701771974564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.38978500419762e-05, + "grad_norm": 30.669479370117188, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.868465781211853, + "num_tokens": 694827629.0, + "step": 18208 + }, + { + "epoch": 2.316371962854599, + "ewc_loss": 0.04802429676055908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4012148060137406e-05, + "grad_norm": 30.39004135131836, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8664955496788025, + "num_tokens": 694868241.0, + "step": 18209 + }, + { + "epoch": 2.316499173133189, + "ewc_loss": 0.04773151874542236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.3865759430918843e-05, + "grad_norm": 30.53034782409668, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8611285090446472, + "num_tokens": 694905298.0, + "step": 18210 + }, + { + "epoch": 2.3166263834117795, + "ewc_loss": 0.04803790524601936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.401895289949607e-05, + "grad_norm": 30.557416915893555, + "learning_rate": 1e-06, + "loss": 0.4457, + "mean_token_accuracy": 0.8624047040939331, + "num_tokens": 694938054.0, + "step": 18211 + }, + { + "epoch": 2.31675359369037, + "ewc_loss": 0.047895826399326324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.394791408732999e-05, + "grad_norm": 30.444238662719727, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8651927709579468, + "num_tokens": 694978761.0, + "step": 18212 + }, + { + "epoch": 2.3168808039689606, + "ewc_loss": 0.047974396497011185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398719880147837e-05, + "grad_norm": 30.544374465942383, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8662455677986145, + "num_tokens": 695015073.0, + "step": 18213 + }, + { + "epoch": 2.317008014247551, + "ewc_loss": 0.047986749559640884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399337427050341e-05, + "grad_norm": 30.512039184570312, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8646745085716248, + "num_tokens": 695052288.0, + "step": 18214 + }, + { + "epoch": 2.3171352245261416, + "ewc_loss": 0.04803422465920448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4017112082219683e-05, + "grad_norm": 30.535715103149414, + "learning_rate": 1e-06, + "loss": 0.4697, + "mean_token_accuracy": 0.8587663173675537, + "num_tokens": 695088976.0, + "step": 18215 + }, + { + "epoch": 2.317262434804732, + "ewc_loss": 0.04793650656938553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.396825402684044e-05, + "grad_norm": 30.57210922241211, + "learning_rate": 1e-06, + "loss": 0.4119, + 
"mean_token_accuracy": 0.8732455968856812, + "num_tokens": 695128482.0, + "step": 18216 + }, + { + "epoch": 2.3173896450833227, + "ewc_loss": 0.04801429063081741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4007145839277655e-05, + "grad_norm": 30.563447952270508, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8665893077850342, + "num_tokens": 695164963.0, + "step": 18217 + }, + { + "epoch": 2.317516855361913, + "ewc_loss": 0.047944240272045135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3972119379322976e-05, + "grad_norm": 30.520389556884766, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8916134238243103, + "num_tokens": 695201515.0, + "step": 18218 + }, + { + "epoch": 2.3176440656405037, + "ewc_loss": 0.04793320596218109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.396660238446202e-05, + "grad_norm": 30.436031341552734, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8759267330169678, + "num_tokens": 695242057.0, + "step": 18219 + }, + { + "epoch": 2.3177712759190943, + "ewc_loss": 0.047986529767513275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3993265131139196e-05, + "grad_norm": 30.541715621948242, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8714554309844971, + "num_tokens": 695277128.0, + "step": 18220 + }, + { + "epoch": 2.317898486197685, + "ewc_loss": 0.04807952046394348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4039760319283232e-05, + "grad_norm": 30.495738983154297, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8824353218078613, + "num_tokens": 695311984.0, + "step": 18221 + }, + { + "epoch": 2.3180256964762753, + "ewc_loss": 0.04804924130439758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.402462087047752e-05, + "grad_norm": 30.46439552307129, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8596780300140381, + "num_tokens": 695348457.0, + "step": 18222 + }, + { + "epoch": 
2.318152906754866, + "ewc_loss": 0.04795342683792114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3976714146556333e-05, + "grad_norm": 30.455703735351562, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8718913793563843, + "num_tokens": 695386518.0, + "step": 18223 + }, + { + "epoch": 2.3182801170334564, + "ewc_loss": 0.04811067506670952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40553381445352e-05, + "grad_norm": 30.590377807617188, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8663954138755798, + "num_tokens": 695425365.0, + "step": 18224 + }, + { + "epoch": 2.318407327312047, + "ewc_loss": 0.04810623079538345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4053115339484066e-05, + "grad_norm": 30.639829635620117, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8721268177032471, + "num_tokens": 695459559.0, + "step": 18225 + }, + { + "epoch": 2.3185345375906374, + "ewc_loss": 0.0479595810174942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397979005763773e-05, + "grad_norm": 30.515417098999023, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8660067319869995, + "num_tokens": 695491386.0, + "step": 18226 + }, + { + "epoch": 2.318661747869228, + "ewc_loss": 0.04792708531022072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3963542844285257e-05, + "grad_norm": 30.515716552734375, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8531436324119568, + "num_tokens": 695534660.0, + "step": 18227 + }, + { + "epoch": 2.3187889581478185, + "ewc_loss": 0.0481112077832222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4055603716988117e-05, + "grad_norm": 30.655771255493164, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8624696731567383, + "num_tokens": 695578127.0, + "step": 18228 + }, + { + "epoch": 2.318916168426409, + "ewc_loss": 0.047971248626708984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.39856235566549e-05, + "grad_norm": 30.532644271850586, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.857488751411438, + "num_tokens": 695614181.0, + "step": 18229 + }, + { + "epoch": 2.3190433787049995, + "ewc_loss": 0.048011407256126404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4005703380680643e-05, + "grad_norm": 30.630203247070312, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8687043190002441, + "num_tokens": 695652753.0, + "step": 18230 + }, + { + "epoch": 2.3191705889835896, + "ewc_loss": 0.047965142875909805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398257129243575e-05, + "grad_norm": 30.494503021240234, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8769238591194153, + "num_tokens": 695692449.0, + "step": 18231 + }, + { + "epoch": 2.3192977992621806, + "ewc_loss": 0.04796073958277702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398037031525746e-05, + "grad_norm": 30.58091926574707, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8691310882568359, + "num_tokens": 695730812.0, + "step": 18232 + }, + { + "epoch": 2.3194250095407707, + "ewc_loss": 0.04807114228606224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4035571186686866e-05, + "grad_norm": 30.566986083984375, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8837389945983887, + "num_tokens": 695767577.0, + "step": 18233 + }, + { + "epoch": 2.319552219819361, + "ewc_loss": 0.04804235324263573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4021175704547204e-05, + "grad_norm": 30.493999481201172, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8703548908233643, + "num_tokens": 695812372.0, + "step": 18234 + }, + { + "epoch": 2.3196794300979517, + "ewc_loss": 0.04801611974835396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.400806079094764e-05, + "grad_norm": 30.524913787841797, + "learning_rate": 1e-06, + "loss": 0.3691, + 
"mean_token_accuracy": 0.8858537673950195, + "num_tokens": 695847470.0, + "step": 18235 + }, + { + "epoch": 2.3198066403765423, + "ewc_loss": 0.048067860305309296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4033930458244868e-05, + "grad_norm": 30.526044845581055, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8854472637176514, + "num_tokens": 695888082.0, + "step": 18236 + }, + { + "epoch": 2.319933850655133, + "ewc_loss": 0.04811098426580429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40554927586345e-05, + "grad_norm": 30.582717895507812, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8765984177589417, + "num_tokens": 695928904.0, + "step": 18237 + }, + { + "epoch": 2.3200610609337233, + "ewc_loss": 0.048007577657699585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.400378798483871e-05, + "grad_norm": 30.531661987304688, + "learning_rate": 1e-06, + "loss": 0.455, + "mean_token_accuracy": 0.85956209897995, + "num_tokens": 695967549.0, + "step": 18238 + }, + { + "epoch": 2.320188271212314, + "ewc_loss": 0.04806103929877281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403051985311322e-05, + "grad_norm": 30.582386016845703, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8647820949554443, + "num_tokens": 696005611.0, + "step": 18239 + }, + { + "epoch": 2.3203154814909044, + "ewc_loss": 0.04802238941192627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4011194909689948e-05, + "grad_norm": 30.470212936401367, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8723238706588745, + "num_tokens": 696034864.0, + "step": 18240 + }, + { + "epoch": 2.320442691769495, + "ewc_loss": 0.047907035797834396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3953518393682316e-05, + "grad_norm": 30.511459350585938, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8678077459335327, + "num_tokens": 696072137.0, + "step": 18241 + }, + { + "epoch": 
2.3205699020480854, + "ewc_loss": 0.04806474223732948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403237158432603e-05, + "grad_norm": 30.569385528564453, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8727362155914307, + "num_tokens": 696115744.0, + "step": 18242 + }, + { + "epoch": 2.320697112326676, + "ewc_loss": 0.04795430973172188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397715434199199e-05, + "grad_norm": 30.526657104492188, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8651800751686096, + "num_tokens": 696151293.0, + "step": 18243 + }, + { + "epoch": 2.3208243226052665, + "ewc_loss": 0.04795262590050697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397631214989815e-05, + "grad_norm": 30.50912857055664, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8882125616073608, + "num_tokens": 696190742.0, + "step": 18244 + }, + { + "epoch": 2.320951532883857, + "ewc_loss": 0.04796968400478363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3984841391211376e-05, + "grad_norm": 30.48345947265625, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8593977689743042, + "num_tokens": 696232807.0, + "step": 18245 + }, + { + "epoch": 2.3210787431624476, + "ewc_loss": 0.04801490157842636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.400745142949745e-05, + "grad_norm": 30.522809982299805, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.866417646408081, + "num_tokens": 696269416.0, + "step": 18246 + }, + { + "epoch": 2.321205953441038, + "ewc_loss": 0.04797447472810745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3987237000255845e-05, + "grad_norm": 30.51184844970703, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8713945746421814, + "num_tokens": 696305144.0, + "step": 18247 + }, + { + "epoch": 2.3213331637196286, + "ewc_loss": 0.04798915982246399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.399458026047796e-05, + "grad_norm": 30.43553924560547, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8623378276824951, + "num_tokens": 696344854.0, + "step": 18248 + }, + { + "epoch": 2.321460373998219, + "ewc_loss": 0.04801253601908684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4006267267395742e-05, + "grad_norm": 30.542747497558594, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8881158232688904, + "num_tokens": 696380154.0, + "step": 18249 + }, + { + "epoch": 2.3215875842768097, + "ewc_loss": 0.048061493784189224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4030747226788662e-05, + "grad_norm": 30.57481575012207, + "learning_rate": 1e-06, + "loss": 0.4663, + "mean_token_accuracy": 0.8557046055793762, + "num_tokens": 696416458.0, + "step": 18250 + }, + { + "epoch": 2.3217147945554, + "ewc_loss": 0.047981102019548416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3990551198949106e-05, + "grad_norm": 30.497608184814453, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8729486465454102, + "num_tokens": 696453232.0, + "step": 18251 + }, + { + "epoch": 2.3218420048339907, + "ewc_loss": 0.048000238835811615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4000119083211757e-05, + "grad_norm": 30.43167495727539, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8496601581573486, + "num_tokens": 696487930.0, + "step": 18252 + }, + { + "epoch": 2.3219692151125813, + "ewc_loss": 0.04799560084939003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3997799871722236e-05, + "grad_norm": 30.606260299682617, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8606260418891907, + "num_tokens": 696523824.0, + "step": 18253 + }, + { + "epoch": 2.322096425391172, + "ewc_loss": 0.04796799644827843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398399738012813e-05, + "grad_norm": 30.418272018432617, + "learning_rate": 1e-06, + "loss": 0.4033, + 
"mean_token_accuracy": 0.8805437684059143, + "num_tokens": 696567007.0, + "step": 18254 + }, + { + "epoch": 2.3222236356697623, + "ewc_loss": 0.047953054308891296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3976526790647767e-05, + "grad_norm": 30.59803009033203, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8865526914596558, + "num_tokens": 696610055.0, + "step": 18255 + }, + { + "epoch": 2.3223508459483524, + "ewc_loss": 0.04810195788741112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4050979845924303e-05, + "grad_norm": 30.49703598022461, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8563433289527893, + "num_tokens": 696644058.0, + "step": 18256 + }, + { + "epoch": 2.3224780562269434, + "ewc_loss": 0.04794764146208763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.397382013441529e-05, + "grad_norm": 30.49789810180664, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8789210319519043, + "num_tokens": 696686381.0, + "step": 18257 + }, + { + "epoch": 2.3226052665055335, + "ewc_loss": 0.0481610931456089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4080545699689537e-05, + "grad_norm": 30.451942443847656, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8645563125610352, + "num_tokens": 696727567.0, + "step": 18258 + }, + { + "epoch": 2.322732476784124, + "ewc_loss": 0.04811503738164902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.405751911283005e-05, + "grad_norm": 30.569229125976562, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8595710396766663, + "num_tokens": 696767266.0, + "step": 18259 + }, + { + "epoch": 2.3228596870627145, + "ewc_loss": 0.048098258674144745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4049129933700897e-05, + "grad_norm": 30.552961349487305, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8833479881286621, + "num_tokens": 696806059.0, + "step": 18260 + }, + { + "epoch": 
2.322986897341305, + "ewc_loss": 0.048106372356414795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4053186280070804e-05, + "grad_norm": 30.596769332885742, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8684381246566772, + "num_tokens": 696844232.0, + "step": 18261 + }, + { + "epoch": 2.3231141076198956, + "ewc_loss": 0.048151709139347076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4075854526017793e-05, + "grad_norm": 30.595674514770508, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8723205924034119, + "num_tokens": 696886223.0, + "step": 18262 + }, + { + "epoch": 2.323241317898486, + "ewc_loss": 0.048001185059547424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.400059202045668e-05, + "grad_norm": 30.44247817993164, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8557594418525696, + "num_tokens": 696922625.0, + "step": 18263 + }, + { + "epoch": 2.3233685281770766, + "ewc_loss": 0.048054881393909454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4027440304053016e-05, + "grad_norm": 30.626720428466797, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8584275841712952, + "num_tokens": 696965160.0, + "step": 18264 + }, + { + "epoch": 2.323495738455667, + "ewc_loss": 0.048130664974451065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406533167231828e-05, + "grad_norm": 30.619487762451172, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8749421238899231, + "num_tokens": 697000854.0, + "step": 18265 + }, + { + "epoch": 2.3236229487342577, + "ewc_loss": 0.047985292971134186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399264667474199e-05, + "grad_norm": 30.511016845703125, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8848965167999268, + "num_tokens": 697041264.0, + "step": 18266 + }, + { + "epoch": 2.323750159012848, + "ewc_loss": 0.04802703857421875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.401351957814768e-05, + "grad_norm": 30.528575897216797, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8676549196243286, + "num_tokens": 697080079.0, + "step": 18267 + }, + { + "epoch": 2.3238773692914387, + "ewc_loss": 0.04800280183553696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4001401470741257e-05, + "grad_norm": 30.42936897277832, + "learning_rate": 1e-06, + "loss": 0.437, + "mean_token_accuracy": 0.865688681602478, + "num_tokens": 697123755.0, + "step": 18268 + }, + { + "epoch": 2.3240045795700293, + "ewc_loss": 0.0480169877409935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4008493710425682e-05, + "grad_norm": 30.664169311523438, + "learning_rate": 1e-06, + "loss": 0.5186, + "mean_token_accuracy": 0.8420862555503845, + "num_tokens": 697162012.0, + "step": 18269 + }, + { + "epoch": 2.32413178984862, + "ewc_loss": 0.048184141516685486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4092070816550404e-05, + "grad_norm": 30.5032901763916, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.880034863948822, + "num_tokens": 697200188.0, + "step": 18270 + }, + { + "epoch": 2.3242590001272103, + "ewc_loss": 0.048074640333652496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4037319235503674e-05, + "grad_norm": 30.635908126831055, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8726991415023804, + "num_tokens": 697240536.0, + "step": 18271 + }, + { + "epoch": 2.324386210405801, + "ewc_loss": 0.0479920320212841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3996015443117358e-05, + "grad_norm": 30.46179962158203, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8812583684921265, + "num_tokens": 697279459.0, + "step": 18272 + }, + { + "epoch": 2.3245134206843914, + "ewc_loss": 0.04802263155579567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4011314962990582e-05, + "grad_norm": 30.52022933959961, + "learning_rate": 1e-06, + "loss": 0.3624, + 
"mean_token_accuracy": 0.8871396780014038, + "num_tokens": 697312594.0, + "step": 18273 + }, + { + "epoch": 2.324640630962982, + "ewc_loss": 0.0480882003903389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4044100428000093e-05, + "grad_norm": 30.491474151611328, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.8564952611923218, + "num_tokens": 697357834.0, + "step": 18274 + }, + { + "epoch": 2.3247678412415724, + "ewc_loss": 0.04804325848817825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4021628632908687e-05, + "grad_norm": 30.425451278686523, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8684870600700378, + "num_tokens": 697401522.0, + "step": 18275 + }, + { + "epoch": 2.324895051520163, + "ewc_loss": 0.048093438148498535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40467197727412e-05, + "grad_norm": 30.57567024230957, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8675602078437805, + "num_tokens": 697440415.0, + "step": 18276 + }, + { + "epoch": 2.3250222617987535, + "ewc_loss": 0.0482105128467083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4105256670736708e-05, + "grad_norm": 30.522850036621094, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8536462187767029, + "num_tokens": 697477609.0, + "step": 18277 + }, + { + "epoch": 2.325149472077344, + "ewc_loss": 0.048096202313899994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.404810038569849e-05, + "grad_norm": 30.522485733032227, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.865432620048523, + "num_tokens": 697515566.0, + "step": 18278 + }, + { + "epoch": 2.3252766823559345, + "ewc_loss": 0.04811577871441841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4057890186668374e-05, + "grad_norm": 30.55251693725586, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8741729259490967, + "num_tokens": 697549537.0, + "step": 18279 + }, + { + "epoch": 
2.325403892634525, + "ewc_loss": 0.04806935414671898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403467624390032e-05, + "grad_norm": 30.409303665161133, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8746403455734253, + "num_tokens": 697582618.0, + "step": 18280 + }, + { + "epoch": 2.325531102913115, + "ewc_loss": 0.048170823603868484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4085411496344022e-05, + "grad_norm": 30.604827880859375, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8840026259422302, + "num_tokens": 697622234.0, + "step": 18281 + }, + { + "epoch": 2.325658313191706, + "ewc_loss": 0.048224691301584244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4112345272442326e-05, + "grad_norm": 30.522111892700195, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.861650288105011, + "num_tokens": 697657504.0, + "step": 18282 + }, + { + "epoch": 2.325785523470296, + "ewc_loss": 0.048061344772577286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4030672648223117e-05, + "grad_norm": 30.495695114135742, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.858384370803833, + "num_tokens": 697699496.0, + "step": 18283 + }, + { + "epoch": 2.3259127337488867, + "ewc_loss": 0.04820351302623749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4101756935124286e-05, + "grad_norm": 30.63633155822754, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8825485110282898, + "num_tokens": 697739969.0, + "step": 18284 + }, + { + "epoch": 2.3260399440274773, + "ewc_loss": 0.0480673611164093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4033681256696582e-05, + "grad_norm": 30.48969841003418, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8598249554634094, + "num_tokens": 697774346.0, + "step": 18285 + }, + { + "epoch": 2.326167154306068, + "ewc_loss": 0.0481363981962204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4068198399618268e-05, + "grad_norm": 30.678295135498047, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8555562496185303, + "num_tokens": 697813013.0, + "step": 18286 + }, + { + "epoch": 2.3262943645846583, + "ewc_loss": 0.048121072351932526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4060536816250533e-05, + "grad_norm": 30.483489990234375, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8630321025848389, + "num_tokens": 697854163.0, + "step": 18287 + }, + { + "epoch": 2.326421574863249, + "ewc_loss": 0.04808535799384117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4042679797275923e-05, + "grad_norm": 30.66403579711914, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8776962757110596, + "num_tokens": 697898049.0, + "step": 18288 + }, + { + "epoch": 2.3265487851418394, + "ewc_loss": 0.048254746943712234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4127373762894422e-05, + "grad_norm": 30.48433494567871, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8730453252792358, + "num_tokens": 697936086.0, + "step": 18289 + }, + { + "epoch": 2.32667599542043, + "ewc_loss": 0.04805522784590721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4027613108046353e-05, + "grad_norm": 30.626710891723633, + "learning_rate": 1e-06, + "loss": 0.5195, + "mean_token_accuracy": 0.8453390598297119, + "num_tokens": 697981529.0, + "step": 18290 + }, + { + "epoch": 2.3268032056990204, + "ewc_loss": 0.04820576310157776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4102881070575677e-05, + "grad_norm": 30.475202560424805, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.886825442314148, + "num_tokens": 698016047.0, + "step": 18291 + }, + { + "epoch": 2.326930415977611, + "ewc_loss": 0.04811747744679451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4058737835730426e-05, + "grad_norm": 30.51024055480957, + "learning_rate": 1e-06, + "loss": 0.3904, + 
"mean_token_accuracy": 0.8816786408424377, + "num_tokens": 698052408.0, + "step": 18292 + }, + { + "epoch": 2.3270576262562015, + "ewc_loss": 0.048181723803281784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4090861188597046e-05, + "grad_norm": 30.502037048339844, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8629904389381409, + "num_tokens": 698089751.0, + "step": 18293 + }, + { + "epoch": 2.327184836534792, + "ewc_loss": 0.048133574426174164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4066786863841116e-05, + "grad_norm": 30.43013572692871, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8674545288085938, + "num_tokens": 698121274.0, + "step": 18294 + }, + { + "epoch": 2.3273120468133826, + "ewc_loss": 0.04810090735554695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4050454157986678e-05, + "grad_norm": 30.460975646972656, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8793596029281616, + "num_tokens": 698155880.0, + "step": 18295 + }, + { + "epoch": 2.327439257091973, + "ewc_loss": 0.0481824055314064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4091203158604912e-05, + "grad_norm": 30.513940811157227, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8731904029846191, + "num_tokens": 698191448.0, + "step": 18296 + }, + { + "epoch": 2.3275664673705636, + "ewc_loss": 0.048228710889816284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4114355255733244e-05, + "grad_norm": 30.621511459350586, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8639329075813293, + "num_tokens": 698228437.0, + "step": 18297 + }, + { + "epoch": 2.327693677649154, + "ewc_loss": 0.04819981008768082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4099905203911476e-05, + "grad_norm": 30.444866180419922, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8630509972572327, + "num_tokens": 698268714.0, + "step": 18298 + }, + { + "epoch": 
2.3278208879277447, + "ewc_loss": 0.048227496445178986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411374771327246e-05, + "grad_norm": 30.6895751953125, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8703930377960205, + "num_tokens": 698305575.0, + "step": 18299 + }, + { + "epoch": 2.327948098206335, + "ewc_loss": 0.04825996235013008, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4129980374709703e-05, + "grad_norm": 30.451364517211914, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.865243673324585, + "num_tokens": 698347701.0, + "step": 18300 + }, + { + "epoch": 2.3280753084849257, + "ewc_loss": 0.04816251993179321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408126056252513e-05, + "grad_norm": 30.547426223754883, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8847545981407166, + "num_tokens": 698385906.0, + "step": 18301 + }, + { + "epoch": 2.3282025187635162, + "ewc_loss": 0.04818517714738846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4092589228530414e-05, + "grad_norm": 30.506832122802734, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.869143009185791, + "num_tokens": 698415511.0, + "step": 18302 + }, + { + "epoch": 2.3283297290421068, + "ewc_loss": 0.04820474237203598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4102371753542684e-05, + "grad_norm": 30.499074935913086, + "learning_rate": 1e-06, + "loss": 0.4381, + "mean_token_accuracy": 0.8652112483978271, + "num_tokens": 698460528.0, + "step": 18303 + }, + { + "epoch": 2.3284569393206973, + "ewc_loss": 0.04825272038578987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4126360585796647e-05, + "grad_norm": 30.47598648071289, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.8559276461601257, + "num_tokens": 698500800.0, + "step": 18304 + }, + { + "epoch": 2.328584149599288, + "ewc_loss": 0.04822684824466705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.411342393315863e-05, + "grad_norm": 30.564939498901367, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8680602312088013, + "num_tokens": 698539373.0, + "step": 18305 + }, + { + "epoch": 2.328711359877878, + "ewc_loss": 0.04825285077095032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4126426069415174e-05, + "grad_norm": 30.599109649658203, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8914418816566467, + "num_tokens": 698577067.0, + "step": 18306 + }, + { + "epoch": 2.328838570156469, + "ewc_loss": 0.04813075810670853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4065378966042772e-05, + "grad_norm": 30.41361427307129, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8682352304458618, + "num_tokens": 698613515.0, + "step": 18307 + }, + { + "epoch": 2.328965780435059, + "ewc_loss": 0.048261720687150955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.413086076558102e-05, + "grad_norm": 30.565532684326172, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.8721407651901245, + "num_tokens": 698647278.0, + "step": 18308 + }, + { + "epoch": 2.3290929907136495, + "ewc_loss": 0.04818536713719368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40926838159794e-05, + "grad_norm": 30.416091918945312, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8771975040435791, + "num_tokens": 698688738.0, + "step": 18309 + }, + { + "epoch": 2.32922020099224, + "ewc_loss": 0.04817947372794151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4089737053145654e-05, + "grad_norm": 30.441856384277344, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8738926649093628, + "num_tokens": 698725003.0, + "step": 18310 + }, + { + "epoch": 2.3293474112708306, + "ewc_loss": 0.04829367250204086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4146836949512362e-05, + "grad_norm": 30.5076961517334, + "learning_rate": 1e-06, + "loss": 0.4723, + 
"mean_token_accuracy": 0.8523068428039551, + "num_tokens": 698768205.0, + "step": 18311 + }, + { + "epoch": 2.329474621549421, + "ewc_loss": 0.04829123988747597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414562004560139e-05, + "grad_norm": 30.614145278930664, + "learning_rate": 1e-06, + "loss": 0.4868, + "mean_token_accuracy": 0.850459098815918, + "num_tokens": 698805352.0, + "step": 18312 + }, + { + "epoch": 2.3296018318280116, + "ewc_loss": 0.04827431961894035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41371599258855e-05, + "grad_norm": 30.58713150024414, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8794168829917908, + "num_tokens": 698836610.0, + "step": 18313 + }, + { + "epoch": 2.329729042106602, + "ewc_loss": 0.048237159848213196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411858076811768e-05, + "grad_norm": 30.54300308227539, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8779113292694092, + "num_tokens": 698877514.0, + "step": 18314 + }, + { + "epoch": 2.3298562523851927, + "ewc_loss": 0.0482400618493557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4120030502672307e-05, + "grad_norm": 30.528112411499023, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8731040954589844, + "num_tokens": 698916699.0, + "step": 18315 + }, + { + "epoch": 2.329983462663783, + "ewc_loss": 0.04822220280766487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41111010836903e-05, + "grad_norm": 30.45355987548828, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8704044222831726, + "num_tokens": 698953568.0, + "step": 18316 + }, + { + "epoch": 2.3301106729423737, + "ewc_loss": 0.04824927821755409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4124639821820892e-05, + "grad_norm": 30.644073486328125, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8657171130180359, + "num_tokens": 698990562.0, + "step": 18317 + }, + { + "epoch": 
2.3302378832209643, + "ewc_loss": 0.04826033487915993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4130167730618268e-05, + "grad_norm": 30.345523834228516, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8721632957458496, + "num_tokens": 699024095.0, + "step": 18318 + }, + { + "epoch": 2.330365093499555, + "ewc_loss": 0.04813964664936066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4069822757155634e-05, + "grad_norm": 30.732589721679688, + "learning_rate": 1e-06, + "loss": 0.4824, + "mean_token_accuracy": 0.8500267267227173, + "num_tokens": 699062327.0, + "step": 18319 + }, + { + "epoch": 2.3304923037781453, + "ewc_loss": 0.048265039920806885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4132519683917053e-05, + "grad_norm": 30.441953659057617, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8754065036773682, + "num_tokens": 699101591.0, + "step": 18320 + }, + { + "epoch": 2.330619514056736, + "ewc_loss": 0.04812036454677582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406018211331684e-05, + "grad_norm": 30.744165420532227, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8610262274742126, + "num_tokens": 699134836.0, + "step": 18321 + }, + { + "epoch": 2.3307467243353264, + "ewc_loss": 0.04829578101634979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4147890144377016e-05, + "grad_norm": 30.43643569946289, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8616259694099426, + "num_tokens": 699171218.0, + "step": 18322 + }, + { + "epoch": 2.330873934613917, + "ewc_loss": 0.04818453639745712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409226908639539e-05, + "grad_norm": 30.655113220214844, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.887326717376709, + "num_tokens": 699200891.0, + "step": 18323 + }, + { + "epoch": 2.3310011448925074, + "ewc_loss": 0.04832090437412262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4160452085197903e-05, + "grad_norm": 30.53477668762207, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8852550983428955, + "num_tokens": 699238080.0, + "step": 18324 + }, + { + "epoch": 2.331128355171098, + "ewc_loss": 0.04813249036669731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406624480499886e-05, + "grad_norm": 30.513084411621094, + "learning_rate": 1e-06, + "loss": 0.4685, + "mean_token_accuracy": 0.8585551977157593, + "num_tokens": 699276051.0, + "step": 18325 + }, + { + "epoch": 2.3312555654496885, + "ewc_loss": 0.04824810102581978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4124050469254144e-05, + "grad_norm": 30.5277099609375, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8768244981765747, + "num_tokens": 699315721.0, + "step": 18326 + }, + { + "epoch": 2.331382775728279, + "ewc_loss": 0.048226840794086456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411342029517982e-05, + "grad_norm": 30.450557708740234, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8823559880256653, + "num_tokens": 699350342.0, + "step": 18327 + }, + { + "epoch": 2.3315099860068695, + "ewc_loss": 0.04817406088113785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4087030396913178e-05, + "grad_norm": 30.596370697021484, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.875442385673523, + "num_tokens": 699390301.0, + "step": 18328 + }, + { + "epoch": 2.3316371962854596, + "ewc_loss": 0.04820089414715767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4100447262753733e-05, + "grad_norm": 30.493671417236328, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8552936911582947, + "num_tokens": 699427135.0, + "step": 18329 + }, + { + "epoch": 2.3317644065640506, + "ewc_loss": 0.04820476844906807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.410238448646851e-05, + "grad_norm": 30.543174743652344, + "learning_rate": 1e-06, + "loss": 0.4004, + 
"mean_token_accuracy": 0.8748089075088501, + "num_tokens": 699466461.0, + "step": 18330 + }, + { + "epoch": 2.3318916168426407, + "ewc_loss": 0.048278845846652985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.413942274870351e-05, + "grad_norm": 30.532711029052734, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8628175258636475, + "num_tokens": 699502446.0, + "step": 18331 + }, + { + "epoch": 2.332018827121231, + "ewc_loss": 0.04817301034927368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4086504708975554e-05, + "grad_norm": 30.516698837280273, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8695652484893799, + "num_tokens": 699546357.0, + "step": 18332 + }, + { + "epoch": 2.3321460373998217, + "ewc_loss": 0.048179153352975845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4089576982078142e-05, + "grad_norm": 30.632675170898438, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.869193971157074, + "num_tokens": 699581879.0, + "step": 18333 + }, + { + "epoch": 2.3322732476784123, + "ewc_loss": 0.048194024711847305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4097013010759838e-05, + "grad_norm": 30.487722396850586, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8836341500282288, + "num_tokens": 699616287.0, + "step": 18334 + }, + { + "epoch": 2.332400457957003, + "ewc_loss": 0.048121172934770584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406058592896443e-05, + "grad_norm": 30.557924270629883, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8690275549888611, + "num_tokens": 699655409.0, + "step": 18335 + }, + { + "epoch": 2.3325276682355933, + "ewc_loss": 0.048332732170820236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.416636561974883e-05, + "grad_norm": 30.63591766357422, + "learning_rate": 1e-06, + "loss": 0.4969, + "mean_token_accuracy": 0.844575822353363, + "num_tokens": 699701420.0, + "step": 18336 + }, + { + "epoch": 
2.332654878514184, + "ewc_loss": 0.048166122287511826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408306136203464e-05, + "grad_norm": 30.60084342956543, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8712624311447144, + "num_tokens": 699736151.0, + "step": 18337 + }, + { + "epoch": 2.3327820887927744, + "ewc_loss": 0.048154741525650024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407737156318035e-05, + "grad_norm": 30.528865814208984, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8622552752494812, + "num_tokens": 699770111.0, + "step": 18338 + }, + { + "epoch": 2.332909299071365, + "ewc_loss": 0.0481150858104229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4057542759692296e-05, + "grad_norm": 30.559412002563477, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8663339614868164, + "num_tokens": 699807442.0, + "step": 18339 + }, + { + "epoch": 2.3330365093499554, + "ewc_loss": 0.04819526895880699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409763510513585e-05, + "grad_norm": 30.56090545654297, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8674989938735962, + "num_tokens": 699849364.0, + "step": 18340 + }, + { + "epoch": 2.333163719628546, + "ewc_loss": 0.04805637151002884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.402818608970847e-05, + "grad_norm": 30.42300033569336, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8833562135696411, + "num_tokens": 699886234.0, + "step": 18341 + }, + { + "epoch": 2.3332909299071365, + "ewc_loss": 0.04818170890212059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409085391263943e-05, + "grad_norm": 30.602081298828125, + "learning_rate": 1e-06, + "loss": 0.4669, + "mean_token_accuracy": 0.8524670600891113, + "num_tokens": 699925660.0, + "step": 18342 + }, + { + "epoch": 2.333418140185727, + "ewc_loss": 0.04824044555425644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4120223315549083e-05, + "grad_norm": 30.570058822631836, + "learning_rate": 1e-06, + "loss": 0.5089, + "mean_token_accuracy": 0.8432452082633972, + "num_tokens": 699967894.0, + "step": 18343 + }, + { + "epoch": 2.3335453504643175, + "ewc_loss": 0.048158641904592514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407932151982095e-05, + "grad_norm": 30.524715423583984, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8637630939483643, + "num_tokens": 699999211.0, + "step": 18344 + }, + { + "epoch": 2.333672560742908, + "ewc_loss": 0.04823444038629532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411722016404383e-05, + "grad_norm": 30.596412658691406, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8778782486915588, + "num_tokens": 700037639.0, + "step": 18345 + }, + { + "epoch": 2.3337997710214986, + "ewc_loss": 0.04817568510770798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408784166618716e-05, + "grad_norm": 30.554542541503906, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8631011843681335, + "num_tokens": 700075677.0, + "step": 18346 + }, + { + "epoch": 2.333926981300089, + "ewc_loss": 0.04807696118950844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4038479750743136e-05, + "grad_norm": 30.442138671875, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.876724123954773, + "num_tokens": 700115697.0, + "step": 18347 + }, + { + "epoch": 2.3340541915786797, + "ewc_loss": 0.04820552095770836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4102761017275043e-05, + "grad_norm": 30.572397232055664, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8517495393753052, + "num_tokens": 700152553.0, + "step": 18348 + }, + { + "epoch": 2.33418140185727, + "ewc_loss": 0.04822749271988869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4113745894283056e-05, + "grad_norm": 30.613073348999023, + "learning_rate": 1e-06, + "loss": 0.3982, + 
"mean_token_accuracy": 0.8757216334342957, + "num_tokens": 700190958.0, + "step": 18349 + }, + { + "epoch": 2.3343086121358607, + "ewc_loss": 0.04814568907022476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4072844098554924e-05, + "grad_norm": 30.591291427612305, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8678606748580933, + "num_tokens": 700224244.0, + "step": 18350 + }, + { + "epoch": 2.3344358224144512, + "ewc_loss": 0.04813062772154808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4065313482424244e-05, + "grad_norm": 30.650869369506836, + "learning_rate": 1e-06, + "loss": 0.343, + "mean_token_accuracy": 0.8952952027320862, + "num_tokens": 700259736.0, + "step": 18351 + }, + { + "epoch": 2.3345630326930418, + "ewc_loss": 0.048180047422647476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4090024453471415e-05, + "grad_norm": 30.587711334228516, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8700835704803467, + "num_tokens": 700293624.0, + "step": 18352 + }, + { + "epoch": 2.3346902429716323, + "ewc_loss": 0.04806121066212654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4030605345615186e-05, + "grad_norm": 30.578357696533203, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8738580942153931, + "num_tokens": 700328052.0, + "step": 18353 + }, + { + "epoch": 2.3348174532502224, + "ewc_loss": 0.04816744104027748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4083719836198725e-05, + "grad_norm": 30.61107635498047, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8621224761009216, + "num_tokens": 700364652.0, + "step": 18354 + }, + { + "epoch": 2.3349446635288134, + "ewc_loss": 0.04811451584100723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.405725717835594e-05, + "grad_norm": 30.623682022094727, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8820233345031738, + "num_tokens": 700403698.0, + "step": 18355 + }, + { + "epoch": 
2.3350718738074034, + "ewc_loss": 0.04809970408678055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4049852072494105e-05, + "grad_norm": 30.67386245727539, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8842782378196716, + "num_tokens": 700439391.0, + "step": 18356 + }, + { + "epoch": 2.335199084085994, + "ewc_loss": 0.048104140907526016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.405207123956643e-05, + "grad_norm": 30.649856567382812, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8722546100616455, + "num_tokens": 700475095.0, + "step": 18357 + }, + { + "epoch": 2.3353262943645845, + "ewc_loss": 0.04805442690849304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4027212930377573e-05, + "grad_norm": 30.58961296081543, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8744665384292603, + "num_tokens": 700514167.0, + "step": 18358 + }, + { + "epoch": 2.335453504643175, + "ewc_loss": 0.048085879534482956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.404293991276063e-05, + "grad_norm": 30.575088500976562, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8791236877441406, + "num_tokens": 700548333.0, + "step": 18359 + }, + { + "epoch": 2.3355807149217656, + "ewc_loss": 0.048129647970199585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406482417427469e-05, + "grad_norm": 30.565446853637695, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8744082450866699, + "num_tokens": 700594261.0, + "step": 18360 + }, + { + "epoch": 2.335707925200356, + "ewc_loss": 0.048065558075904846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4032779037952423e-05, + "grad_norm": 30.61674690246582, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8755292892456055, + "num_tokens": 700632513.0, + "step": 18361 + }, + { + "epoch": 2.3358351354789466, + "ewc_loss": 0.04810657352209091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4053286324488e-05, + "grad_norm": 30.554515838623047, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8765039443969727, + "num_tokens": 700672430.0, + "step": 18362 + }, + { + "epoch": 2.335962345757537, + "ewc_loss": 0.048082247376441956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.404112456133589e-05, + "grad_norm": 30.54574966430664, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8765113949775696, + "num_tokens": 700714321.0, + "step": 18363 + }, + { + "epoch": 2.3360895560361277, + "ewc_loss": 0.04803542047739029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.401771052973345e-05, + "grad_norm": 30.5167236328125, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8707107901573181, + "num_tokens": 700751714.0, + "step": 18364 + }, + { + "epoch": 2.336216766314718, + "ewc_loss": 0.04807806387543678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4039032723521814e-05, + "grad_norm": 30.555660247802734, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8669708967208862, + "num_tokens": 700794087.0, + "step": 18365 + }, + { + "epoch": 2.3363439765933087, + "ewc_loss": 0.0480847992002964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4042399672907777e-05, + "grad_norm": 30.596302032470703, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8647503852844238, + "num_tokens": 700832031.0, + "step": 18366 + }, + { + "epoch": 2.3364711868718993, + "ewc_loss": 0.04802366718649864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4011833374970593e-05, + "grad_norm": 30.583040237426758, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8732765316963196, + "num_tokens": 700867175.0, + "step": 18367 + }, + { + "epoch": 2.33659839715049, + "ewc_loss": 0.04804711416363716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4023556761676446e-05, + "grad_norm": 30.578989028930664, + "learning_rate": 1e-06, + "loss": 0.4025, + 
"mean_token_accuracy": 0.8765097856521606, + "num_tokens": 700906762.0, + "step": 18368 + }, + { + "epoch": 2.3367256074290803, + "ewc_loss": 0.048006102442741394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4003051294130273e-05, + "grad_norm": 30.69762420654297, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8732466697692871, + "num_tokens": 700944316.0, + "step": 18369 + }, + { + "epoch": 2.336852817707671, + "ewc_loss": 0.047998253256082535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399912591499742e-05, + "grad_norm": 30.559959411621094, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8673949241638184, + "num_tokens": 700979369.0, + "step": 18370 + }, + { + "epoch": 2.3369800279862614, + "ewc_loss": 0.047980669885873795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3990334739210084e-05, + "grad_norm": 30.662017822265625, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8664793968200684, + "num_tokens": 701016213.0, + "step": 18371 + }, + { + "epoch": 2.337107238264852, + "ewc_loss": 0.04810555651783943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.405277882644441e-05, + "grad_norm": 30.620601654052734, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8673962354660034, + "num_tokens": 701055270.0, + "step": 18372 + }, + { + "epoch": 2.3372344485434424, + "ewc_loss": 0.04802219569683075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.401109850325156e-05, + "grad_norm": 30.523574829101562, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8684642910957336, + "num_tokens": 701091058.0, + "step": 18373 + }, + { + "epoch": 2.337361658822033, + "ewc_loss": 0.04806840792298317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40342033066554e-05, + "grad_norm": 30.59482192993164, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8785689473152161, + "num_tokens": 701129446.0, + "step": 18374 + }, + { + "epoch": 
2.3374888691006235, + "ewc_loss": 0.04815845564007759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4079226932371967e-05, + "grad_norm": 30.51468276977539, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8736594915390015, + "num_tokens": 701168726.0, + "step": 18375 + }, + { + "epoch": 2.337616079379214, + "ewc_loss": 0.04813551530241966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406775820418261e-05, + "grad_norm": 30.65069007873535, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8635671138763428, + "num_tokens": 701204041.0, + "step": 18376 + }, + { + "epoch": 2.3377432896578045, + "ewc_loss": 0.048084039241075516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4042019504122436e-05, + "grad_norm": 30.532880783081055, + "learning_rate": 1e-06, + "loss": 0.4898, + "mean_token_accuracy": 0.8478103280067444, + "num_tokens": 701244660.0, + "step": 18377 + }, + { + "epoch": 2.337870499936395, + "ewc_loss": 0.04806488752365112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403244434390217e-05, + "grad_norm": 30.669876098632812, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8620296716690063, + "num_tokens": 701284830.0, + "step": 18378 + }, + { + "epoch": 2.337997710214985, + "ewc_loss": 0.04814418405294418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407209285593126e-05, + "grad_norm": 30.466964721679688, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8770641088485718, + "num_tokens": 701319720.0, + "step": 18379 + }, + { + "epoch": 2.338124920493576, + "ewc_loss": 0.04807417094707489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403708640486002e-05, + "grad_norm": 30.658720016479492, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.8497930765151978, + "num_tokens": 701361770.0, + "step": 18380 + }, + { + "epoch": 2.338252130772166, + "ewc_loss": 0.048252467066049576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.41262332565384e-05, + "grad_norm": 30.597736358642578, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8674290776252747, + "num_tokens": 701399777.0, + "step": 18381 + }, + { + "epoch": 2.3383793410507567, + "ewc_loss": 0.048010971397161484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4005485101952218e-05, + "grad_norm": 30.55902099609375, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8698407411575317, + "num_tokens": 701432680.0, + "step": 18382 + }, + { + "epoch": 2.3385065513293473, + "ewc_loss": 0.04809881001710892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4049404601100832e-05, + "grad_norm": 30.519243240356445, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8800873756408691, + "num_tokens": 701474101.0, + "step": 18383 + }, + { + "epoch": 2.338633761607938, + "ewc_loss": 0.04814523458480835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407261672487948e-05, + "grad_norm": 30.626806259155273, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8825183510780334, + "num_tokens": 701517568.0, + "step": 18384 + }, + { + "epoch": 2.3387609718865283, + "ewc_loss": 0.04818110913038254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4090553779387847e-05, + "grad_norm": 30.596338272094727, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8632035255432129, + "num_tokens": 701558466.0, + "step": 18385 + }, + { + "epoch": 2.338888182165119, + "ewc_loss": 0.04806838929653168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4034194211708382e-05, + "grad_norm": 30.4644775390625, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8645654916763306, + "num_tokens": 701591104.0, + "step": 18386 + }, + { + "epoch": 2.3390153924437094, + "ewc_loss": 0.04817113280296326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4085566110443324e-05, + "grad_norm": 30.74810028076172, + "learning_rate": 1e-06, + "loss": 0.4457, + 
"mean_token_accuracy": 0.8623408079147339, + "num_tokens": 701633242.0, + "step": 18387 + }, + { + "epoch": 2.3391426027223, + "ewc_loss": 0.04820955544710159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4104778276523575e-05, + "grad_norm": 30.6738338470459, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.869279146194458, + "num_tokens": 701668864.0, + "step": 18388 + }, + { + "epoch": 2.3392698130008904, + "ewc_loss": 0.048031579703092575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4015789676923305e-05, + "grad_norm": 30.536453247070312, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8737192153930664, + "num_tokens": 701703648.0, + "step": 18389 + }, + { + "epoch": 2.339397023279481, + "ewc_loss": 0.048202089965343475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41010457102675e-05, + "grad_norm": 30.78762435913086, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.866257905960083, + "num_tokens": 701744417.0, + "step": 18390 + }, + { + "epoch": 2.3395242335580715, + "ewc_loss": 0.0481833815574646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4091690647765063e-05, + "grad_norm": 30.483491897583008, + "learning_rate": 1e-06, + "loss": 0.5102, + "mean_token_accuracy": 0.8382403254508972, + "num_tokens": 701783541.0, + "step": 18391 + }, + { + "epoch": 2.339651443836662, + "ewc_loss": 0.04817529395222664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408764703432098e-05, + "grad_norm": 30.71773338317871, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8659694194793701, + "num_tokens": 701825054.0, + "step": 18392 + }, + { + "epoch": 2.3397786541152525, + "ewc_loss": 0.048228517174720764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4114258849294856e-05, + "grad_norm": 30.585336685180664, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8767305016517639, + "num_tokens": 701866173.0, + "step": 18393 + }, + { + "epoch": 
2.339905864393843, + "ewc_loss": 0.04806835204362869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4034176021814346e-05, + "grad_norm": 30.561824798583984, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8658793568611145, + "num_tokens": 701904812.0, + "step": 18394 + }, + { + "epoch": 2.3400330746724336, + "ewc_loss": 0.048210758715867996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4105378543026745e-05, + "grad_norm": 30.76158332824707, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8597913384437561, + "num_tokens": 701942590.0, + "step": 18395 + }, + { + "epoch": 2.340160284951024, + "ewc_loss": 0.04817748814821243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408874388493132e-05, + "grad_norm": 30.56416130065918, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.872994601726532, + "num_tokens": 701980773.0, + "step": 18396 + }, + { + "epoch": 2.3402874952296147, + "ewc_loss": 0.04809238389134407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.404619226581417e-05, + "grad_norm": 30.640316009521484, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8830094337463379, + "num_tokens": 702014508.0, + "step": 18397 + }, + { + "epoch": 2.340414705508205, + "ewc_loss": 0.04816574975848198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4082874006126076e-05, + "grad_norm": 30.688438415527344, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8816206455230713, + "num_tokens": 702055754.0, + "step": 18398 + }, + { + "epoch": 2.3405419157867957, + "ewc_loss": 0.04815630987286568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4078155547613278e-05, + "grad_norm": 30.599966049194336, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8687057495117188, + "num_tokens": 702100022.0, + "step": 18399 + }, + { + "epoch": 2.3406691260653862, + "ewc_loss": 0.04804622009396553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4023109290283173e-05, + "grad_norm": 30.69657325744629, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8586361408233643, + "num_tokens": 702133217.0, + "step": 18400 + }, + { + "epoch": 2.3407963363439768, + "ewc_loss": 0.04821045324206352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4105225747916847e-05, + "grad_norm": 30.709566116333008, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8831343650817871, + "num_tokens": 702176998.0, + "step": 18401 + }, + { + "epoch": 2.3409235466225673, + "ewc_loss": 0.04798653721809387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3993268769118004e-05, + "grad_norm": 30.616418838500977, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.87617027759552, + "num_tokens": 702216219.0, + "step": 18402 + }, + { + "epoch": 2.341050756901158, + "ewc_loss": 0.04811018705368042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4055094399955124e-05, + "grad_norm": 30.66276741027832, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8655250072479248, + "num_tokens": 702246502.0, + "step": 18403 + }, + { + "epoch": 2.341177967179748, + "ewc_loss": 0.04809413105249405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4047065380727872e-05, + "grad_norm": 30.60553741455078, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8769341111183167, + "num_tokens": 702277990.0, + "step": 18404 + }, + { + "epoch": 2.341305177458339, + "ewc_loss": 0.048113517463207245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4056758775259368e-05, + "grad_norm": 30.632753372192383, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8720398545265198, + "num_tokens": 702312742.0, + "step": 18405 + }, + { + "epoch": 2.341432387736929, + "ewc_loss": 0.04811425507068634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4057128030108288e-05, + "grad_norm": 30.686521530151367, + "learning_rate": 1e-06, + "loss": 0.3754, + 
"mean_token_accuracy": 0.8853697776794434, + "num_tokens": 702348575.0, + "step": 18406 + }, + { + "epoch": 2.3415595980155195, + "ewc_loss": 0.048048511147499084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4024255253607407e-05, + "grad_norm": 30.434328079223633, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8591800332069397, + "num_tokens": 702385956.0, + "step": 18407 + }, + { + "epoch": 2.34168680829411, + "ewc_loss": 0.04810773581266403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4053868401097134e-05, + "grad_norm": 30.76191520690918, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8742654323577881, + "num_tokens": 702423973.0, + "step": 18408 + }, + { + "epoch": 2.3418140185727006, + "ewc_loss": 0.048226453363895416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4113227482303046e-05, + "grad_norm": 30.615812301635742, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8671934008598328, + "num_tokens": 702455508.0, + "step": 18409 + }, + { + "epoch": 2.341941228851291, + "ewc_loss": 0.04803549125790596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4017745090532117e-05, + "grad_norm": 30.531307220458984, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8805856704711914, + "num_tokens": 702493365.0, + "step": 18410 + }, + { + "epoch": 2.3420684391298816, + "ewc_loss": 0.048144929111003876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4072463929769583e-05, + "grad_norm": 30.509485244750977, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8718817830085754, + "num_tokens": 702528036.0, + "step": 18411 + }, + { + "epoch": 2.342195649408472, + "ewc_loss": 0.04816056042909622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408028012723662e-05, + "grad_norm": 30.630537033081055, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.864657998085022, + "num_tokens": 702571238.0, + "step": 18412 + }, + { + "epoch": 
2.3423228596870627, + "ewc_loss": 0.04818073287606239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409036642347928e-05, + "grad_norm": 30.589252471923828, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8660858273506165, + "num_tokens": 702611247.0, + "step": 18413 + }, + { + "epoch": 2.342450069965653, + "ewc_loss": 0.04811056703329086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4055283574853092e-05, + "grad_norm": 30.567527770996094, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8714278936386108, + "num_tokens": 702649661.0, + "step": 18414 + }, + { + "epoch": 2.3425772802442437, + "ewc_loss": 0.048150334507226944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4075166948023252e-05, + "grad_norm": 30.615869522094727, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8626830577850342, + "num_tokens": 702693573.0, + "step": 18415 + }, + { + "epoch": 2.3427044905228342, + "ewc_loss": 0.04813582822680473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4067914637271315e-05, + "grad_norm": 30.562360763549805, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8814709186553955, + "num_tokens": 702735351.0, + "step": 18416 + }, + { + "epoch": 2.3428317008014248, + "ewc_loss": 0.04817274957895279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4086375560727902e-05, + "grad_norm": 30.525222778320312, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8759215474128723, + "num_tokens": 702773406.0, + "step": 18417 + }, + { + "epoch": 2.3429589110800153, + "ewc_loss": 0.04812059924006462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4060300347628072e-05, + "grad_norm": 30.615692138671875, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8762611150741577, + "num_tokens": 702809939.0, + "step": 18418 + }, + { + "epoch": 2.343086121358606, + "ewc_loss": 0.048165880143642426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4082939489744604e-05, + "grad_norm": 30.536142349243164, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8676677942276001, + "num_tokens": 702851120.0, + "step": 18419 + }, + { + "epoch": 2.3432133316371964, + "ewc_loss": 0.048199594020843506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4099797883536667e-05, + "grad_norm": 30.563404083251953, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8782802820205688, + "num_tokens": 702889492.0, + "step": 18420 + }, + { + "epoch": 2.343340541915787, + "ewc_loss": 0.04820239543914795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4101198505377397e-05, + "grad_norm": 30.694673538208008, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8638317584991455, + "num_tokens": 702930486.0, + "step": 18421 + }, + { + "epoch": 2.3434677521943774, + "ewc_loss": 0.04814603924751282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4073020540527068e-05, + "grad_norm": 30.532873153686523, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8616776466369629, + "num_tokens": 702969866.0, + "step": 18422 + }, + { + "epoch": 2.343594962472968, + "ewc_loss": 0.0481577105820179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4078855858533643e-05, + "grad_norm": 30.58677101135254, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8612926602363586, + "num_tokens": 703013703.0, + "step": 18423 + }, + { + "epoch": 2.3437221727515585, + "ewc_loss": 0.048164207488298416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4082104573608376e-05, + "grad_norm": 30.574840545654297, + "learning_rate": 1e-06, + "loss": 0.4564, + "mean_token_accuracy": 0.8595871925354004, + "num_tokens": 703054849.0, + "step": 18424 + }, + { + "epoch": 2.343849383030149, + "ewc_loss": 0.04814968258142471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4074841348920017e-05, + "grad_norm": 30.634437561035156, + "learning_rate": 1e-06, + "loss": 0.4408, + 
"mean_token_accuracy": 0.8642914295196533, + "num_tokens": 703088250.0, + "step": 18425 + }, + { + "epoch": 2.3439765933087395, + "ewc_loss": 0.04815787822008133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4078939532046206e-05, + "grad_norm": 30.59210968017578, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8667469024658203, + "num_tokens": 703124072.0, + "step": 18426 + }, + { + "epoch": 2.3441038035873296, + "ewc_loss": 0.04808895289897919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4044476958806626e-05, + "grad_norm": 30.65514373779297, + "learning_rate": 1e-06, + "loss": 0.4776, + "mean_token_accuracy": 0.8530283570289612, + "num_tokens": 703158869.0, + "step": 18427 + }, + { + "epoch": 2.3442310138659206, + "ewc_loss": 0.04819417744874954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4097089408314787e-05, + "grad_norm": 30.646211624145508, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.872679591178894, + "num_tokens": 703193342.0, + "step": 18428 + }, + { + "epoch": 2.3443582241445107, + "ewc_loss": 0.04813648387789726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4068242055363953e-05, + "grad_norm": 30.666915893554688, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8823180198669434, + "num_tokens": 703229899.0, + "step": 18429 + }, + { + "epoch": 2.344485434423101, + "ewc_loss": 0.04806094616651535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4030472559388727e-05, + "grad_norm": 30.513198852539062, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.869297981262207, + "num_tokens": 703266735.0, + "step": 18430 + }, + { + "epoch": 2.3446126447016917, + "ewc_loss": 0.04820021241903305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4100105292745866e-05, + "grad_norm": 30.674837112426758, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8656641244888306, + "num_tokens": 703307515.0, + "step": 18431 + }, + { + "epoch": 
2.3447398549802823, + "ewc_loss": 0.04805297777056694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.402648897259496e-05, + "grad_norm": 30.547607421875, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8805392384529114, + "num_tokens": 703341237.0, + "step": 18432 + }, + { + "epoch": 2.344867065258873, + "ewc_loss": 0.04808882996439934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4044415113166906e-05, + "grad_norm": 30.58241081237793, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.880622148513794, + "num_tokens": 703378900.0, + "step": 18433 + }, + { + "epoch": 2.3449942755374633, + "ewc_loss": 0.048161689192056656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408084401395172e-05, + "grad_norm": 30.633899688720703, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8777022957801819, + "num_tokens": 703415260.0, + "step": 18434 + }, + { + "epoch": 2.345121485816054, + "ewc_loss": 0.048161279410123825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408064028713852e-05, + "grad_norm": 30.52130699157715, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8757106065750122, + "num_tokens": 703452399.0, + "step": 18435 + }, + { + "epoch": 2.3452486960946444, + "ewc_loss": 0.04815263673663139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4076318368315697e-05, + "grad_norm": 30.743459701538086, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.871070384979248, + "num_tokens": 703492700.0, + "step": 18436 + }, + { + "epoch": 2.345375906373235, + "ewc_loss": 0.04820441082119942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.410220622550696e-05, + "grad_norm": 30.568559646606445, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8814541101455688, + "num_tokens": 703530805.0, + "step": 18437 + }, + { + "epoch": 2.3455031166518254, + "ewc_loss": 0.04807277023792267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.403638427495025e-05, + "grad_norm": 30.59494400024414, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8788068294525146, + "num_tokens": 703571894.0, + "step": 18438 + }, + { + "epoch": 2.345630326930416, + "ewc_loss": 0.04812358692288399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4061793737928383e-05, + "grad_norm": 30.621212005615234, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8757736682891846, + "num_tokens": 703612583.0, + "step": 18439 + }, + { + "epoch": 2.3457575372090065, + "ewc_loss": 0.04815363511443138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4076816771412268e-05, + "grad_norm": 30.484373092651367, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8637499809265137, + "num_tokens": 703649100.0, + "step": 18440 + }, + { + "epoch": 2.345884747487597, + "ewc_loss": 0.04816696047782898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4083479729597457e-05, + "grad_norm": 30.6663875579834, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.885995626449585, + "num_tokens": 703680425.0, + "step": 18441 + }, + { + "epoch": 2.3460119577661875, + "ewc_loss": 0.048281703144311905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4140852474374697e-05, + "grad_norm": 30.683605194091797, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8769102096557617, + "num_tokens": 703721819.0, + "step": 18442 + }, + { + "epoch": 2.346139168044778, + "ewc_loss": 0.048178281635046005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408914042462129e-05, + "grad_norm": 30.56145668029785, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8669500350952148, + "num_tokens": 703755432.0, + "step": 18443 + }, + { + "epoch": 2.3462663783233686, + "ewc_loss": 0.04819350689649582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409675289527513e-05, + "grad_norm": 30.647951126098633, + "learning_rate": 1e-06, + "loss": 0.4184, + 
"mean_token_accuracy": 0.8692260980606079, + "num_tokens": 703795903.0, + "step": 18444 + }, + { + "epoch": 2.346393588601959, + "ewc_loss": 0.048176199197769165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4088099962682463e-05, + "grad_norm": 30.594114303588867, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8812467455863953, + "num_tokens": 703836074.0, + "step": 18445 + }, + { + "epoch": 2.3465207988805497, + "ewc_loss": 0.04815270006656647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4076349291135557e-05, + "grad_norm": 30.704803466796875, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.875078022480011, + "num_tokens": 703870609.0, + "step": 18446 + }, + { + "epoch": 2.34664800915914, + "ewc_loss": 0.04816707596182823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408353793725837e-05, + "grad_norm": 30.538799285888672, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8751218914985657, + "num_tokens": 703909034.0, + "step": 18447 + }, + { + "epoch": 2.3467752194377307, + "ewc_loss": 0.04822627454996109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411313653283287e-05, + "grad_norm": 30.614803314208984, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8632417321205139, + "num_tokens": 703946976.0, + "step": 18448 + }, + { + "epoch": 2.3469024297163212, + "ewc_loss": 0.048181790858507156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4090895749395713e-05, + "grad_norm": 30.5888729095459, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8738241195678711, + "num_tokens": 703986379.0, + "step": 18449 + }, + { + "epoch": 2.3470296399949118, + "ewc_loss": 0.048239566385746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4119783120113425e-05, + "grad_norm": 30.47343635559082, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8896582722663879, + "num_tokens": 704019461.0, + "step": 18450 + }, + { + "epoch": 
2.3471568502735023, + "ewc_loss": 0.04818321764469147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4091608793241903e-05, + "grad_norm": 30.601110458374023, + "learning_rate": 1e-06, + "loss": 0.4602, + "mean_token_accuracy": 0.8572107553482056, + "num_tokens": 704053411.0, + "step": 18451 + }, + { + "epoch": 2.3472840605520924, + "ewc_loss": 0.048254307359457016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4127153665176593e-05, + "grad_norm": 30.50042152404785, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.875173807144165, + "num_tokens": 704092633.0, + "step": 18452 + }, + { + "epoch": 2.3474112708306833, + "ewc_loss": 0.048207733780145645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41038669628324e-05, + "grad_norm": 30.45829963684082, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8746927380561829, + "num_tokens": 704126955.0, + "step": 18453 + }, + { + "epoch": 2.3475384811092734, + "ewc_loss": 0.04838460683822632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419230258965399e-05, + "grad_norm": 30.641582489013672, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8876364231109619, + "num_tokens": 704162222.0, + "step": 18454 + }, + { + "epoch": 2.347665691387864, + "ewc_loss": 0.04834722355008125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4173612473532557e-05, + "grad_norm": 30.591053009033203, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8714944124221802, + "num_tokens": 704198949.0, + "step": 18455 + }, + { + "epoch": 2.3477929016664545, + "ewc_loss": 0.048246853053569794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4123426555888727e-05, + "grad_norm": 30.681480407714844, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8651984930038452, + "num_tokens": 704235327.0, + "step": 18456 + }, + { + "epoch": 2.347920111945045, + "ewc_loss": 0.048271581530570984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.413579022686463e-05, + "grad_norm": 30.545106887817383, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8764849901199341, + "num_tokens": 704272783.0, + "step": 18457 + }, + { + "epoch": 2.3480473222236355, + "ewc_loss": 0.048208367079496384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4104183466988616e-05, + "grad_norm": 30.60883331298828, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.8532783389091492, + "num_tokens": 704314396.0, + "step": 18458 + }, + { + "epoch": 2.348174532502226, + "ewc_loss": 0.04835890233516693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417945142951794e-05, + "grad_norm": 30.63016700744629, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8877376317977905, + "num_tokens": 704346903.0, + "step": 18459 + }, + { + "epoch": 2.3483017427808166, + "ewc_loss": 0.04826134070754051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4130669771693647e-05, + "grad_norm": 30.702882766723633, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8787297010421753, + "num_tokens": 704380456.0, + "step": 18460 + }, + { + "epoch": 2.348428953059407, + "ewc_loss": 0.04825009033083916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4125045456457883e-05, + "grad_norm": 30.598024368286133, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8829560279846191, + "num_tokens": 704416403.0, + "step": 18461 + }, + { + "epoch": 2.3485561633379977, + "ewc_loss": 0.04817299544811249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408649743301794e-05, + "grad_norm": 30.6953182220459, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.871819257736206, + "num_tokens": 704460431.0, + "step": 18462 + }, + { + "epoch": 2.348683373616588, + "ewc_loss": 0.04825349897146225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4126749849529006e-05, + "grad_norm": 30.572458267211914, + "learning_rate": 1e-06, + "loss": 0.3893, + 
"mean_token_accuracy": 0.8790735006332397, + "num_tokens": 704497872.0, + "step": 18463 + }, + { + "epoch": 2.3488105838951787, + "ewc_loss": 0.04820655286312103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4103275791276246e-05, + "grad_norm": 30.720609664916992, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8729580640792847, + "num_tokens": 704538180.0, + "step": 18464 + }, + { + "epoch": 2.3489377941737692, + "ewc_loss": 0.0481884591281414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4094229956972413e-05, + "grad_norm": 30.720361709594727, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8834565877914429, + "num_tokens": 704578342.0, + "step": 18465 + }, + { + "epoch": 2.3490650044523598, + "ewc_loss": 0.04807264357805252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403632242931053e-05, + "grad_norm": 30.610925674438477, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8824719190597534, + "num_tokens": 704611792.0, + "step": 18466 + }, + { + "epoch": 2.3491922147309503, + "ewc_loss": 0.048166465014219284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4083232347038575e-05, + "grad_norm": 30.699190139770508, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8746309280395508, + "num_tokens": 704649898.0, + "step": 18467 + }, + { + "epoch": 2.349319425009541, + "ewc_loss": 0.04810919612646103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4054597815847956e-05, + "grad_norm": 30.58049964904785, + "learning_rate": 1e-06, + "loss": 0.3937, + "mean_token_accuracy": 0.8795737028121948, + "num_tokens": 704690632.0, + "step": 18468 + }, + { + "epoch": 2.3494466352881314, + "ewc_loss": 0.04807184636592865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403592225164175e-05, + "grad_norm": 30.5443172454834, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8611748218536377, + "num_tokens": 704722955.0, + "step": 18469 + }, + { + "epoch": 
2.349573845566722, + "ewc_loss": 0.04821101576089859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4105507691274397e-05, + "grad_norm": 30.568235397338867, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8697725534439087, + "num_tokens": 704762678.0, + "step": 18470 + }, + { + "epoch": 2.3497010558453124, + "ewc_loss": 0.048230767250061035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4115382984746248e-05, + "grad_norm": 30.74660301208496, + "learning_rate": 1e-06, + "loss": 0.3437, + "mean_token_accuracy": 0.8961679935455322, + "num_tokens": 704805440.0, + "step": 18471 + }, + { + "epoch": 2.349828266123903, + "ewc_loss": 0.04817935451865196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4089677026495337e-05, + "grad_norm": 30.514694213867188, + "learning_rate": 1e-06, + "loss": 0.4841, + "mean_token_accuracy": 0.8524109721183777, + "num_tokens": 704848153.0, + "step": 18472 + }, + { + "epoch": 2.3499554764024935, + "ewc_loss": 0.04808267578482628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4041337383096106e-05, + "grad_norm": 30.693416595458984, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8694225549697876, + "num_tokens": 704886739.0, + "step": 18473 + }, + { + "epoch": 2.350082686681084, + "ewc_loss": 0.04812829568982124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406414751021657e-05, + "grad_norm": 30.672569274902344, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8711055517196655, + "num_tokens": 704926145.0, + "step": 18474 + }, + { + "epoch": 2.3502098969596745, + "ewc_loss": 0.048094797879457474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.404739825578872e-05, + "grad_norm": 30.61817741394043, + "learning_rate": 1e-06, + "loss": 0.4612, + "mean_token_accuracy": 0.85489821434021, + "num_tokens": 704964135.0, + "step": 18475 + }, + { + "epoch": 2.350337107238265, + "ewc_loss": 0.048203885555267334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4101942472043447e-05, + "grad_norm": 30.694950103759766, + "learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8600747585296631, + "num_tokens": 705002171.0, + "step": 18476 + }, + { + "epoch": 2.350464317516855, + "ewc_loss": 0.04816354438662529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4081771698547527e-05, + "grad_norm": 30.660226821899414, + "learning_rate": 1e-06, + "loss": 0.3552, + "mean_token_accuracy": 0.8905590772628784, + "num_tokens": 705040660.0, + "step": 18477 + }, + { + "epoch": 2.350591527795446, + "ewc_loss": 0.048154428601264954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4077215130091645e-05, + "grad_norm": 30.638935089111328, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.877051830291748, + "num_tokens": 705073655.0, + "step": 18478 + }, + { + "epoch": 2.350718738074036, + "ewc_loss": 0.04810380935668945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4051903892541304e-05, + "grad_norm": 30.551265716552734, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8680148124694824, + "num_tokens": 705115497.0, + "step": 18479 + }, + { + "epoch": 2.3508459483526267, + "ewc_loss": 0.048088010400533676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4044005840551108e-05, + "grad_norm": 30.657804489135742, + "learning_rate": 1e-06, + "loss": 0.3674, + "mean_token_accuracy": 0.8887192010879517, + "num_tokens": 705152642.0, + "step": 18480 + }, + { + "epoch": 2.3509731586312173, + "ewc_loss": 0.048133350908756256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40666759054875e-05, + "grad_norm": 30.584423065185547, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8565665483474731, + "num_tokens": 705187282.0, + "step": 18481 + }, + { + "epoch": 2.351100368909808, + "ewc_loss": 0.04811171814799309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4055858375504613e-05, + "grad_norm": 30.710432052612305, + "learning_rate": 1e-06, + "loss": 0.4096, + 
"mean_token_accuracy": 0.8743922710418701, + "num_tokens": 705225635.0, + "step": 18482 + }, + { + "epoch": 2.3512275791883983, + "ewc_loss": 0.0482335090637207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411675450275652e-05, + "grad_norm": 30.70815086364746, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8806238174438477, + "num_tokens": 705260376.0, + "step": 18483 + }, + { + "epoch": 2.351354789466989, + "ewc_loss": 0.04809457063674927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40472854784457e-05, + "grad_norm": 30.672555923461914, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8694459199905396, + "num_tokens": 705298799.0, + "step": 18484 + }, + { + "epoch": 2.3514819997455794, + "ewc_loss": 0.048138707876205444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406935345788952e-05, + "grad_norm": 30.671504974365234, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.853344738483429, + "num_tokens": 705332570.0, + "step": 18485 + }, + { + "epoch": 2.35160921002417, + "ewc_loss": 0.04811948165297508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.405974009889178e-05, + "grad_norm": 30.811487197875977, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8725008964538574, + "num_tokens": 705371214.0, + "step": 18486 + }, + { + "epoch": 2.3517364203027604, + "ewc_loss": 0.048131946474313736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406597377557773e-05, + "grad_norm": 30.592512130737305, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8682543039321899, + "num_tokens": 705405051.0, + "step": 18487 + }, + { + "epoch": 2.351863630581351, + "ewc_loss": 0.047953054308891296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3976526790647767e-05, + "grad_norm": 30.606689453125, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.869515597820282, + "num_tokens": 705442460.0, + "step": 18488 + }, + { + "epoch": 
2.3519908408599415, + "ewc_loss": 0.04814957082271576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4074784960248508e-05, + "grad_norm": 30.717605590820312, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8708951473236084, + "num_tokens": 705482929.0, + "step": 18489 + }, + { + "epoch": 2.352118051138532, + "ewc_loss": 0.04816833883523941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.40841691265814e-05, + "grad_norm": 30.78434181213379, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8669958114624023, + "num_tokens": 705525313.0, + "step": 18490 + }, + { + "epoch": 2.3522452614171225, + "ewc_loss": 0.04815166816115379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4075834517134354e-05, + "grad_norm": 30.827909469604492, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8661932945251465, + "num_tokens": 705563730.0, + "step": 18491 + }, + { + "epoch": 2.352372471695713, + "ewc_loss": 0.04806329682469368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4031647626543418e-05, + "grad_norm": 30.693740844726562, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.875003457069397, + "num_tokens": 705602459.0, + "step": 18492 + }, + { + "epoch": 2.3524996819743036, + "ewc_loss": 0.047996558248996735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.399827826593537e-05, + "grad_norm": 30.48284912109375, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8681091070175171, + "num_tokens": 705636223.0, + "step": 18493 + }, + { + "epoch": 2.352626892252894, + "ewc_loss": 0.04811848700046539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4059243514784612e-05, + "grad_norm": 30.888540267944336, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8692260980606079, + "num_tokens": 705676178.0, + "step": 18494 + }, + { + "epoch": 2.3527541025314846, + "ewc_loss": 0.04823790490627289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4118951841956005e-05, + "grad_norm": 30.711101531982422, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8776930570602417, + "num_tokens": 705716687.0, + "step": 18495 + }, + { + "epoch": 2.352881312810075, + "ewc_loss": 0.04799007624387741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.3995038645807654e-05, + "grad_norm": 30.688154220581055, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8785292506217957, + "num_tokens": 705759163.0, + "step": 18496 + }, + { + "epoch": 2.3530085230886657, + "ewc_loss": 0.04805939644575119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4029697669902816e-05, + "grad_norm": 30.672334671020508, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8728491067886353, + "num_tokens": 705795982.0, + "step": 18497 + }, + { + "epoch": 2.3531357333672562, + "ewc_loss": 0.048073913902044296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4036957256612368e-05, + "grad_norm": 30.81365394592285, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8748198747634888, + "num_tokens": 705835601.0, + "step": 18498 + }, + { + "epoch": 2.3532629436458468, + "ewc_loss": 0.04807329922914505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.403664984740317e-05, + "grad_norm": 30.607929229736328, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8688173294067383, + "num_tokens": 705876858.0, + "step": 18499 + }, + { + "epoch": 2.353390153924437, + "ewc_loss": 0.04804399609565735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4021997887757607e-05, + "grad_norm": 30.726991653442383, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8625818490982056, + "num_tokens": 705921944.0, + "step": 18500 + }, + { + "epoch": 2.353517364203028, + "ewc_loss": 0.04810106009244919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4050530555541627e-05, + "grad_norm": 30.699819564819336, + "learning_rate": 1e-06, + "loss": 0.4117, + 
"mean_token_accuracy": 0.8716768622398376, + "num_tokens": 705967999.0, + "step": 18501 + }, + { + "epoch": 2.353644574481618, + "ewc_loss": 0.047970373183488846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.398518699919805e-05, + "grad_norm": 30.55885887145996, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8629586696624756, + "num_tokens": 706003370.0, + "step": 18502 + }, + { + "epoch": 2.353771784760209, + "ewc_loss": 0.04801778122782707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4008890250115655e-05, + "grad_norm": 30.684432983398438, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8603609800338745, + "num_tokens": 706036551.0, + "step": 18503 + }, + { + "epoch": 2.353898995038799, + "ewc_loss": 0.048066895455121994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4033448426052928e-05, + "grad_norm": 30.65663719177246, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8874503970146179, + "num_tokens": 706075965.0, + "step": 18504 + }, + { + "epoch": 2.3540262053173895, + "ewc_loss": 0.04808083921670914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4040418793447316e-05, + "grad_norm": 30.740556716918945, + "learning_rate": 1e-06, + "loss": 0.4547, + "mean_token_accuracy": 0.8588989973068237, + "num_tokens": 706117943.0, + "step": 18505 + }, + { + "epoch": 2.35415341559598, + "ewc_loss": 0.04808640107512474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4043200028245337e-05, + "grad_norm": 30.7371826171875, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8747670650482178, + "num_tokens": 706157183.0, + "step": 18506 + }, + { + "epoch": 2.3542806258745705, + "ewc_loss": 0.04803971201181412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.401985511824023e-05, + "grad_norm": 30.635026931762695, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8799877166748047, + "num_tokens": 706196878.0, + "step": 18507 + }, + { + "epoch": 
2.354407836153161, + "ewc_loss": 0.04815538972616196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407769534329418e-05, + "grad_norm": 30.824758529663086, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8589291572570801, + "num_tokens": 706231102.0, + "step": 18508 + }, + { + "epoch": 2.3545350464317516, + "ewc_loss": 0.04813046008348465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.406522980891168e-05, + "grad_norm": 30.71205711364746, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8783287405967712, + "num_tokens": 706273372.0, + "step": 18509 + }, + { + "epoch": 2.354662256710342, + "ewc_loss": 0.04801485687494278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4007427782635204e-05, + "grad_norm": 30.685169219970703, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8658303022384644, + "num_tokens": 706308106.0, + "step": 18510 + }, + { + "epoch": 2.3547894669889327, + "ewc_loss": 0.0481615848839283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408079308224842e-05, + "grad_norm": 30.803274154663086, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.87856125831604, + "num_tokens": 706344267.0, + "step": 18511 + }, + { + "epoch": 2.354916677267523, + "ewc_loss": 0.04813924804329872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4069624487310648e-05, + "grad_norm": 30.81169319152832, + "learning_rate": 1e-06, + "loss": 0.3711, + "mean_token_accuracy": 0.8894088268280029, + "num_tokens": 706385019.0, + "step": 18512 + }, + { + "epoch": 2.3550438875461137, + "ewc_loss": 0.048013798892498016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4006900275708176e-05, + "grad_norm": 30.55426788330078, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8700372576713562, + "num_tokens": 706424913.0, + "step": 18513 + }, + { + "epoch": 2.3551710978247042, + "ewc_loss": 0.04806559532880783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.403279722784646e-05, + "grad_norm": 30.743898391723633, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.872013509273529, + "num_tokens": 706466064.0, + "step": 18514 + }, + { + "epoch": 2.3552983081032948, + "ewc_loss": 0.04810916259884834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4054581444943324e-05, + "grad_norm": 30.650821685791016, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.877945601940155, + "num_tokens": 706504572.0, + "step": 18515 + }, + { + "epoch": 2.3554255183818853, + "ewc_loss": 0.04807066172361374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4035331080085598e-05, + "grad_norm": 30.706430435180664, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8650797605514526, + "num_tokens": 706543901.0, + "step": 18516 + }, + { + "epoch": 2.355552728660476, + "ewc_loss": 0.04821060597896576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4105302145471796e-05, + "grad_norm": 30.64192771911621, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8642919659614563, + "num_tokens": 706579095.0, + "step": 18517 + }, + { + "epoch": 2.3556799389390664, + "ewc_loss": 0.04806077107787132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4030385247897357e-05, + "grad_norm": 30.6008358001709, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8640625476837158, + "num_tokens": 706614078.0, + "step": 18518 + }, + { + "epoch": 2.355807149217657, + "ewc_loss": 0.048141609877347946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407080501143355e-05, + "grad_norm": 30.658212661743164, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8787346482276917, + "num_tokens": 706652596.0, + "step": 18519 + }, + { + "epoch": 2.3559343594962474, + "ewc_loss": 0.04818587377667427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409293665550649e-05, + "grad_norm": 30.551776885986328, + "learning_rate": 1e-06, + "loss": 0.3482, + 
"mean_token_accuracy": 0.8945409059524536, + "num_tokens": 706691297.0, + "step": 18520 + }, + { + "epoch": 2.356061569774838, + "ewc_loss": 0.04816403239965439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4082015443127602e-05, + "grad_norm": 30.650484085083008, + "learning_rate": 1e-06, + "loss": 0.4635, + "mean_token_accuracy": 0.8599004745483398, + "num_tokens": 706727344.0, + "step": 18521 + }, + { + "epoch": 2.3561887800534285, + "ewc_loss": 0.04823141172528267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4115704945870675e-05, + "grad_norm": 30.586986541748047, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.875085175037384, + "num_tokens": 706770411.0, + "step": 18522 + }, + { + "epoch": 2.356315990332019, + "ewc_loss": 0.04816427454352379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408213731541764e-05, + "grad_norm": 30.736913681030273, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8739227056503296, + "num_tokens": 706816215.0, + "step": 18523 + }, + { + "epoch": 2.3564432006106095, + "ewc_loss": 0.048308115452528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.415405833744444e-05, + "grad_norm": 30.672794342041016, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.865888237953186, + "num_tokens": 706857400.0, + "step": 18524 + }, + { + "epoch": 2.3565704108891996, + "ewc_loss": 0.0480923168361187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4046157705015503e-05, + "grad_norm": 30.527807235717773, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8639201521873474, + "num_tokens": 706892715.0, + "step": 18525 + }, + { + "epoch": 2.3566976211677906, + "ewc_loss": 0.04816233739256859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408116779406555e-05, + "grad_norm": 30.691497802734375, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8713891506195068, + "num_tokens": 706934373.0, + "step": 18526 + }, + { + "epoch": 
2.3568248314463807, + "ewc_loss": 0.04818684235215187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4093420506687835e-05, + "grad_norm": 30.646753311157227, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.872298002243042, + "num_tokens": 706976885.0, + "step": 18527 + }, + { + "epoch": 2.356952041724971, + "ewc_loss": 0.04812169075012207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4060846044449136e-05, + "grad_norm": 30.659879684448242, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8769814968109131, + "num_tokens": 707016957.0, + "step": 18528 + }, + { + "epoch": 2.3570792520035617, + "ewc_loss": 0.048140984028577805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407049214525614e-05, + "grad_norm": 30.61305046081543, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8809244632720947, + "num_tokens": 707053687.0, + "step": 18529 + }, + { + "epoch": 2.3572064622821522, + "ewc_loss": 0.04814564809203148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4072824089671485e-05, + "grad_norm": 30.806316375732422, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8650834560394287, + "num_tokens": 707087327.0, + "step": 18530 + }, + { + "epoch": 2.3573336725607428, + "ewc_loss": 0.048174768686294556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.408738509984687e-05, + "grad_norm": 30.558738708496094, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8833601474761963, + "num_tokens": 707124479.0, + "step": 18531 + }, + { + "epoch": 2.3574608828393333, + "ewc_loss": 0.04815259948372841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407630017842166e-05, + "grad_norm": 30.74366569519043, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8763031959533691, + "num_tokens": 707162452.0, + "step": 18532 + }, + { + "epoch": 2.357588093117924, + "ewc_loss": 0.04815182462334633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4075912733678706e-05, + "grad_norm": 30.587482452392578, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8803694844245911, + "num_tokens": 707196765.0, + "step": 18533 + }, + { + "epoch": 2.3577153033965144, + "ewc_loss": 0.048162996768951416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4081498850136995e-05, + "grad_norm": 30.63100242614746, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8772875666618347, + "num_tokens": 707230151.0, + "step": 18534 + }, + { + "epoch": 2.357842513675105, + "ewc_loss": 0.048168595880270004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4084298274829052e-05, + "grad_norm": 30.690561294555664, + "learning_rate": 1e-06, + "loss": 0.474, + "mean_token_accuracy": 0.8542628288269043, + "num_tokens": 707264373.0, + "step": 18535 + }, + { + "epoch": 2.3579697239536954, + "ewc_loss": 0.04822836071252823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4114180632750504e-05, + "grad_norm": 30.69020652770996, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8658826351165771, + "num_tokens": 707304844.0, + "step": 18536 + }, + { + "epoch": 2.358096934232286, + "ewc_loss": 0.04810227081179619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4051136279013008e-05, + "grad_norm": 30.68562889099121, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.873687207698822, + "num_tokens": 707347081.0, + "step": 18537 + }, + { + "epoch": 2.3582241445108765, + "ewc_loss": 0.04812771826982498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4063858290901408e-05, + "grad_norm": 30.67582130432129, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.864427387714386, + "num_tokens": 707388243.0, + "step": 18538 + }, + { + "epoch": 2.358351354789467, + "ewc_loss": 0.04820486158132553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4102429961203597e-05, + "grad_norm": 30.720712661743164, + "learning_rate": 1e-06, + "loss": 0.4556, + 
"mean_token_accuracy": 0.8581441640853882, + "num_tokens": 707427109.0, + "step": 18539 + }, + { + "epoch": 2.3584785650680575, + "ewc_loss": 0.04813132435083389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4065662728389725e-05, + "grad_norm": 30.575544357299805, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8723774552345276, + "num_tokens": 707462502.0, + "step": 18540 + }, + { + "epoch": 2.358605775346648, + "ewc_loss": 0.048267610371112823, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.413380570942536e-05, + "grad_norm": 30.680749893188477, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8656771183013916, + "num_tokens": 707502861.0, + "step": 18541 + }, + { + "epoch": 2.3587329856252386, + "ewc_loss": 0.04822823777794838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4114118787110783e-05, + "grad_norm": 30.56817054748535, + "learning_rate": 1e-06, + "loss": 0.4827, + "mean_token_accuracy": 0.8510067462921143, + "num_tokens": 707535868.0, + "step": 18542 + }, + { + "epoch": 2.358860195903829, + "ewc_loss": 0.0482432059943676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412160210951697e-05, + "grad_norm": 30.794729232788086, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8749474883079529, + "num_tokens": 707566128.0, + "step": 18543 + }, + { + "epoch": 2.3589874061824196, + "ewc_loss": 0.048292357474565506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414617847534828e-05, + "grad_norm": 30.570636749267578, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8777203559875488, + "num_tokens": 707605113.0, + "step": 18544 + }, + { + "epoch": 2.35911461646101, + "ewc_loss": 0.048180438578128815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4090219085337594e-05, + "grad_norm": 30.6064395904541, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8690668344497681, + "num_tokens": 707647968.0, + "step": 18545 + }, + { + "epoch": 
2.3592418267396007, + "ewc_loss": 0.048386216163635254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4193108401959762e-05, + "grad_norm": 30.654449462890625, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8783557415008545, + "num_tokens": 707682183.0, + "step": 18546 + }, + { + "epoch": 2.3593690370181912, + "ewc_loss": 0.04841040074825287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4205201043514535e-05, + "grad_norm": 30.760278701782227, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8800041675567627, + "num_tokens": 707720938.0, + "step": 18547 + }, + { + "epoch": 2.3594962472967818, + "ewc_loss": 0.048258326947689056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412916364846751e-05, + "grad_norm": 30.57963752746582, + "learning_rate": 1e-06, + "loss": 0.4618, + "mean_token_accuracy": 0.8594141006469727, + "num_tokens": 707762624.0, + "step": 18548 + }, + { + "epoch": 2.3596234575753723, + "ewc_loss": 0.04829384759068489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4146924261003733e-05, + "grad_norm": 30.69498634338379, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8639124035835266, + "num_tokens": 707798406.0, + "step": 18549 + }, + { + "epoch": 2.3597506678539624, + "ewc_loss": 0.048302892595529556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4151446268660948e-05, + "grad_norm": 30.670156478881836, + "learning_rate": 1e-06, + "loss": 0.4238, + "mean_token_accuracy": 0.873165488243103, + "num_tokens": 707835935.0, + "step": 18550 + }, + { + "epoch": 2.3598778781325533, + "ewc_loss": 0.04829063639044762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41453180933604e-05, + "grad_norm": 30.69020652770996, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8821142911911011, + "num_tokens": 707872971.0, + "step": 18551 + }, + { + "epoch": 2.3600050884111434, + "ewc_loss": 0.04823129624128342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4115648557199165e-05, + "grad_norm": 30.496110916137695, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8816267848014832, + "num_tokens": 707909798.0, + "step": 18552 + }, + { + "epoch": 2.360132298689734, + "ewc_loss": 0.048216767609119415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4108383513521403e-05, + "grad_norm": 30.682161331176758, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8740373849868774, + "num_tokens": 707951130.0, + "step": 18553 + }, + { + "epoch": 2.3602595089683245, + "ewc_loss": 0.048383865505456924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419193333480507e-05, + "grad_norm": 30.706926345825195, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.859694242477417, + "num_tokens": 707989502.0, + "step": 18554 + }, + { + "epoch": 2.360386719246915, + "ewc_loss": 0.048160575330257416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4080287403194234e-05, + "grad_norm": 30.567609786987305, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8700337409973145, + "num_tokens": 708023521.0, + "step": 18555 + }, + { + "epoch": 2.3605139295255055, + "ewc_loss": 0.04827766865491867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.413883339613676e-05, + "grad_norm": 30.789602279663086, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8767372369766235, + "num_tokens": 708057594.0, + "step": 18556 + }, + { + "epoch": 2.360641139804096, + "ewc_loss": 0.048274800181388855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4137400032486767e-05, + "grad_norm": 30.624467849731445, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8691772222518921, + "num_tokens": 708101393.0, + "step": 18557 + }, + { + "epoch": 2.3607683500826866, + "ewc_loss": 0.048184797167778015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4092398234643042e-05, + "grad_norm": 30.744272232055664, + "learning_rate": 1e-06, + "loss": 0.3868, + 
"mean_token_accuracy": 0.8821638822555542, + "num_tokens": 708137141.0, + "step": 18558 + }, + { + "epoch": 2.360895560361277, + "ewc_loss": 0.04824931174516678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4124656192725524e-05, + "grad_norm": 30.71141242980957, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.876907229423523, + "num_tokens": 708173497.0, + "step": 18559 + }, + { + "epoch": 2.3610227706398677, + "ewc_loss": 0.04817941039800644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4089706130325794e-05, + "grad_norm": 30.676166534423828, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8710979223251343, + "num_tokens": 708209412.0, + "step": 18560 + }, + { + "epoch": 2.361149980918458, + "ewc_loss": 0.04819214716553688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4096074412227608e-05, + "grad_norm": 30.73417091369629, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8794903755187988, + "num_tokens": 708239675.0, + "step": 18561 + }, + { + "epoch": 2.3612771911970487, + "ewc_loss": 0.04824502393603325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4122511604218744e-05, + "grad_norm": 30.727977752685547, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8656979203224182, + "num_tokens": 708274918.0, + "step": 18562 + }, + { + "epoch": 2.3614044014756392, + "ewc_loss": 0.04821901395916939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4109507648972794e-05, + "grad_norm": 30.68126678466797, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.854815661907196, + "num_tokens": 708313645.0, + "step": 18563 + }, + { + "epoch": 2.3615316117542298, + "ewc_loss": 0.048201099038124084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.410054912616033e-05, + "grad_norm": 30.66907501220703, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8720828294754028, + "num_tokens": 708357985.0, + "step": 18564 + }, + { + "epoch": 
2.3616588220328203, + "ewc_loss": 0.048247236758470535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41236175497761e-05, + "grad_norm": 30.696619033813477, + "learning_rate": 1e-06, + "loss": 0.4978, + "mean_token_accuracy": 0.8511773347854614, + "num_tokens": 708393606.0, + "step": 18565 + }, + { + "epoch": 2.361786032311411, + "ewc_loss": 0.04820556938648224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.410278466413729e-05, + "grad_norm": 30.633256912231445, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8773667812347412, + "num_tokens": 708428941.0, + "step": 18566 + }, + { + "epoch": 2.3619132425900013, + "ewc_loss": 0.04824594780802727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4122973627527244e-05, + "grad_norm": 30.738656997680664, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8759427070617676, + "num_tokens": 708467847.0, + "step": 18567 + }, + { + "epoch": 2.362040452868592, + "ewc_loss": 0.04829513281583786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4147566364263184e-05, + "grad_norm": 30.658370971679688, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8875308036804199, + "num_tokens": 708503212.0, + "step": 18568 + }, + { + "epoch": 2.3621676631471824, + "ewc_loss": 0.048246387392282486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4123193725245073e-05, + "grad_norm": 30.691650390625, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8715173602104187, + "num_tokens": 708542897.0, + "step": 18569 + }, + { + "epoch": 2.362294873425773, + "ewc_loss": 0.04833928868174553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4169645257643424e-05, + "grad_norm": 30.563940048217773, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.85987389087677, + "num_tokens": 708574725.0, + "step": 18570 + }, + { + "epoch": 2.3624220837043635, + "ewc_loss": 0.04817817360162735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.408908767392859e-05, + "grad_norm": 30.71255874633789, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8815370798110962, + "num_tokens": 708613557.0, + "step": 18571 + }, + { + "epoch": 2.362549293982954, + "ewc_loss": 0.048384081572294235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419204065517988e-05, + "grad_norm": 30.654788970947266, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8636587858200073, + "num_tokens": 708651872.0, + "step": 18572 + }, + { + "epoch": 2.3626765042615445, + "ewc_loss": 0.04824872687458992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4124363335431553e-05, + "grad_norm": 30.675949096679688, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8596986532211304, + "num_tokens": 708688654.0, + "step": 18573 + }, + { + "epoch": 2.362803714540135, + "ewc_loss": 0.04842659831047058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.421329918433912e-05, + "grad_norm": 30.838790893554688, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8782836198806763, + "num_tokens": 708723285.0, + "step": 18574 + }, + { + "epoch": 2.362930924818725, + "ewc_loss": 0.048284951597452164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414247501292266e-05, + "grad_norm": 30.73122787475586, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8810909390449524, + "num_tokens": 708760617.0, + "step": 18575 + }, + { + "epoch": 2.363058135097316, + "ewc_loss": 0.04824947938323021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4124739866238087e-05, + "grad_norm": 30.794105529785156, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8756351470947266, + "num_tokens": 708790792.0, + "step": 18576 + }, + { + "epoch": 2.363185345375906, + "ewc_loss": 0.04815701022744179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.407850479357876e-05, + "grad_norm": 30.57952880859375, + "learning_rate": 1e-06, + "loss": 0.4522, + 
"mean_token_accuracy": 0.8620998859405518, + "num_tokens": 708829935.0, + "step": 18577 + }, + { + "epoch": 2.3633125556544967, + "ewc_loss": 0.048341818153858185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417090945527889e-05, + "grad_norm": 30.69804573059082, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8658415675163269, + "num_tokens": 708870578.0, + "step": 18578 + }, + { + "epoch": 2.3634397659330872, + "ewc_loss": 0.04828836768865585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414418304397259e-05, + "grad_norm": 30.740070343017578, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8694373369216919, + "num_tokens": 708907044.0, + "step": 18579 + }, + { + "epoch": 2.3635669762116778, + "ewc_loss": 0.04840992018580437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4204960936913267e-05, + "grad_norm": 30.861915588378906, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8536348938941956, + "num_tokens": 708947443.0, + "step": 18580 + }, + { + "epoch": 2.3636941864902683, + "ewc_loss": 0.04828784242272377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414392110949848e-05, + "grad_norm": 30.667362213134766, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8728790283203125, + "num_tokens": 708983174.0, + "step": 18581 + }, + { + "epoch": 2.363821396768859, + "ewc_loss": 0.04824959859251976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4124799892888404e-05, + "grad_norm": 30.73675537109375, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8750559091567993, + "num_tokens": 709021950.0, + "step": 18582 + }, + { + "epoch": 2.3639486070474494, + "ewc_loss": 0.048432160168886185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4216080419137143e-05, + "grad_norm": 30.731382369995117, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8776755928993225, + "num_tokens": 709057236.0, + "step": 18583 + }, + { + "epoch": 
2.36407581732604, + "ewc_loss": 0.0482603944838047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4130196834448725e-05, + "grad_norm": 30.81895637512207, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.872524082660675, + "num_tokens": 709093376.0, + "step": 18584 + }, + { + "epoch": 2.3642030276046304, + "ewc_loss": 0.04831188917160034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4155944629455917e-05, + "grad_norm": 30.583480834960938, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8671838045120239, + "num_tokens": 709129502.0, + "step": 18585 + }, + { + "epoch": 2.364330237883221, + "ewc_loss": 0.04832356423139572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4161781766451895e-05, + "grad_norm": 30.809711456298828, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8704748153686523, + "num_tokens": 709162538.0, + "step": 18586 + }, + { + "epoch": 2.3644574481618115, + "ewc_loss": 0.048481572419404984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4240785933216102e-05, + "grad_norm": 30.68824005126953, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8605897426605225, + "num_tokens": 709201690.0, + "step": 18587 + }, + { + "epoch": 2.364584658440402, + "ewc_loss": 0.048238545656204224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411927198409103e-05, + "grad_norm": 30.694265365600586, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8642287254333496, + "num_tokens": 709240593.0, + "step": 18588 + }, + { + "epoch": 2.3647118687189925, + "ewc_loss": 0.048457007855176926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.422850411676336e-05, + "grad_norm": 30.868179321289062, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8822851777076721, + "num_tokens": 709278192.0, + "step": 18589 + }, + { + "epoch": 2.364839078997583, + "ewc_loss": 0.04826868325471878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4134342311299406e-05, + "grad_norm": 30.687421798706055, + "learning_rate": 1e-06, + "loss": 0.3557, + "mean_token_accuracy": 0.8885203003883362, + "num_tokens": 709311797.0, + "step": 18590 + }, + { + "epoch": 2.3649662892761736, + "ewc_loss": 0.04832145944237709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4160730390576646e-05, + "grad_norm": 30.708728790283203, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.869868278503418, + "num_tokens": 709350379.0, + "step": 18591 + }, + { + "epoch": 2.365093499554764, + "ewc_loss": 0.04829220473766327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414610207779333e-05, + "grad_norm": 30.768878936767578, + "learning_rate": 1e-06, + "loss": 0.3428, + "mean_token_accuracy": 0.8972347974777222, + "num_tokens": 709389030.0, + "step": 18592 + }, + { + "epoch": 2.3652207098333546, + "ewc_loss": 0.04836784675717354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4183922505471855e-05, + "grad_norm": 30.789047241210938, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8637676239013672, + "num_tokens": 709433692.0, + "step": 18593 + }, + { + "epoch": 2.365347920111945, + "ewc_loss": 0.04836711660027504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4183558707591146e-05, + "grad_norm": 30.700424194335938, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8713444471359253, + "num_tokens": 709470773.0, + "step": 18594 + }, + { + "epoch": 2.3654751303905357, + "ewc_loss": 0.04824600741267204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4123004550347105e-05, + "grad_norm": 30.682470321655273, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8672374486923218, + "num_tokens": 709506764.0, + "step": 18595 + }, + { + "epoch": 2.3656023406691262, + "ewc_loss": 0.04829968139529228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4149840101017617e-05, + "grad_norm": 30.75807762145996, + "learning_rate": 1e-06, + "loss": 0.4379, + 
"mean_token_accuracy": 0.8696609735488892, + "num_tokens": 709546931.0, + "step": 18596 + }, + { + "epoch": 2.3657295509477168, + "ewc_loss": 0.04830145835876465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.415072958683595e-05, + "grad_norm": 30.598581314086914, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8719701766967773, + "num_tokens": 709582842.0, + "step": 18597 + }, + { + "epoch": 2.365856761226307, + "ewc_loss": 0.048260606825351715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4130304154823534e-05, + "grad_norm": 30.67989158630371, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8713043928146362, + "num_tokens": 709621574.0, + "step": 18598 + }, + { + "epoch": 2.365983971504898, + "ewc_loss": 0.048208966851234436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41044836002402e-05, + "grad_norm": 30.509784698486328, + "learning_rate": 1e-06, + "loss": 0.3632, + "mean_token_accuracy": 0.8886135816574097, + "num_tokens": 709662437.0, + "step": 18599 + }, + { + "epoch": 2.366111181783488, + "ewc_loss": 0.04839232936501503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419616430415772e-05, + "grad_norm": 30.801679611206055, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8841730356216431, + "num_tokens": 709701318.0, + "step": 18600 + }, + { + "epoch": 2.3662383920620784, + "ewc_loss": 0.048350539058446884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4175269572879188e-05, + "grad_norm": 30.595304489135742, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8713388442993164, + "num_tokens": 709745156.0, + "step": 18601 + }, + { + "epoch": 2.366365602340669, + "ewc_loss": 0.04831905663013458, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4159528038580902e-05, + "grad_norm": 30.762737274169922, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8659795522689819, + "num_tokens": 709785849.0, + "step": 18602 + }, + { + "epoch": 
2.3664928126192595, + "ewc_loss": 0.04832164943218231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.416082497802563e-05, + "grad_norm": 30.611209869384766, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8753237128257751, + "num_tokens": 709830038.0, + "step": 18603 + }, + { + "epoch": 2.36662002289785, + "ewc_loss": 0.04826134815812111, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4130673409672454e-05, + "grad_norm": 30.725950241088867, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8694902658462524, + "num_tokens": 709871174.0, + "step": 18604 + }, + { + "epoch": 2.3667472331764405, + "ewc_loss": 0.04839860275387764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4199302060878836e-05, + "grad_norm": 30.734249114990234, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.892352283000946, + "num_tokens": 709905921.0, + "step": 18605 + }, + { + "epoch": 2.366874443455031, + "ewc_loss": 0.04814315587282181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4071578081930056e-05, + "grad_norm": 30.674766540527344, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8874477744102478, + "num_tokens": 709939856.0, + "step": 18606 + }, + { + "epoch": 2.3670016537336216, + "ewc_loss": 0.048287685960531235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4143842892954126e-05, + "grad_norm": 30.73586082458496, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8851451873779297, + "num_tokens": 709978805.0, + "step": 18607 + }, + { + "epoch": 2.367128864012212, + "ewc_loss": 0.04818945378065109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.409472654107958e-05, + "grad_norm": 30.613346099853516, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.885562539100647, + "num_tokens": 710015015.0, + "step": 18608 + }, + { + "epoch": 2.3672560742908026, + "ewc_loss": 0.048250243067741394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4125121854012832e-05, + "grad_norm": 30.663270950317383, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8625023365020752, + "num_tokens": 710054887.0, + "step": 18609 + }, + { + "epoch": 2.367383284569393, + "ewc_loss": 0.04823978990316391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411989407846704e-05, + "grad_norm": 30.612300872802734, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8722099661827087, + "num_tokens": 710097012.0, + "step": 18610 + }, + { + "epoch": 2.3675104948479837, + "ewc_loss": 0.04828013852238655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414006848994177e-05, + "grad_norm": 30.680438995361328, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8555170893669128, + "num_tokens": 710130912.0, + "step": 18611 + }, + { + "epoch": 2.3676377051265742, + "ewc_loss": 0.04836370795965195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4181854314520024e-05, + "grad_norm": 30.740848541259766, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8679319620132446, + "num_tokens": 710168880.0, + "step": 18612 + }, + { + "epoch": 2.3677649154051648, + "ewc_loss": 0.048248570412397385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41242851188872e-05, + "grad_norm": 30.62899398803711, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8804186582565308, + "num_tokens": 710205619.0, + "step": 18613 + }, + { + "epoch": 2.3678921256837553, + "ewc_loss": 0.04818996787071228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4094984837574884e-05, + "grad_norm": 30.60464859008789, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.87766033411026, + "num_tokens": 710239475.0, + "step": 18614 + }, + { + "epoch": 2.368019335962346, + "ewc_loss": 0.04830329492688179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.415164817648474e-05, + "grad_norm": 30.719484329223633, + "learning_rate": 1e-06, + "loss": 0.403, + 
"mean_token_accuracy": 0.8743612766265869, + "num_tokens": 710276533.0, + "step": 18615 + }, + { + "epoch": 2.3681465462409363, + "ewc_loss": 0.04831526800990105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.415763447061181e-05, + "grad_norm": 30.614425659179688, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8873692750930786, + "num_tokens": 710316731.0, + "step": 18616 + }, + { + "epoch": 2.368273756519527, + "ewc_loss": 0.04830058291554451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4150291210389696e-05, + "grad_norm": 30.709562301635742, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8686420917510986, + "num_tokens": 710352310.0, + "step": 18617 + }, + { + "epoch": 2.3684009667981174, + "ewc_loss": 0.04835270717740059, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41763536905637e-05, + "grad_norm": 30.5762996673584, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.867993950843811, + "num_tokens": 710400203.0, + "step": 18618 + }, + { + "epoch": 2.368528177076708, + "ewc_loss": 0.04825877770781517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412938920315355e-05, + "grad_norm": 30.695323944091797, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8778346180915833, + "num_tokens": 710433054.0, + "step": 18619 + }, + { + "epoch": 2.3686553873552985, + "ewc_loss": 0.048416562378406525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4208280592574738e-05, + "grad_norm": 30.767967224121094, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8759146332740784, + "num_tokens": 710466879.0, + "step": 18620 + }, + { + "epoch": 2.368782597633889, + "ewc_loss": 0.048300765454769135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4150382159859873e-05, + "grad_norm": 30.570085525512695, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8655920624732971, + "num_tokens": 710502808.0, + "step": 18621 + }, + { + "epoch": 
2.3689098079124795, + "ewc_loss": 0.04837420955300331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4187103917938657e-05, + "grad_norm": 30.861186981201172, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8610602021217346, + "num_tokens": 710546190.0, + "step": 18622 + }, + { + "epoch": 2.3690370181910696, + "ewc_loss": 0.048375993967056274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4187997041735798e-05, + "grad_norm": 30.733169555664062, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8731502294540405, + "num_tokens": 710585556.0, + "step": 18623 + }, + { + "epoch": 2.3691642284696606, + "ewc_loss": 0.048260170966386795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.413008587609511e-05, + "grad_norm": 30.71041488647461, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8745449185371399, + "num_tokens": 710621366.0, + "step": 18624 + }, + { + "epoch": 2.3692914387482507, + "ewc_loss": 0.04835529625415802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4177648811019026e-05, + "grad_norm": 30.734800338745117, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8768371939659119, + "num_tokens": 710660217.0, + "step": 18625 + }, + { + "epoch": 2.369418649026841, + "ewc_loss": 0.048257358372211456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4128679797286168e-05, + "grad_norm": 30.650054931640625, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8683178424835205, + "num_tokens": 710695798.0, + "step": 18626 + }, + { + "epoch": 2.3695458593054317, + "ewc_loss": 0.048288412392139435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4144206690834835e-05, + "grad_norm": 30.64938735961914, + "learning_rate": 1e-06, + "loss": 0.4802, + "mean_token_accuracy": 0.854873538017273, + "num_tokens": 710732739.0, + "step": 18627 + }, + { + "epoch": 2.3696730695840222, + "ewc_loss": 0.04840593412518501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.420296732452698e-05, + "grad_norm": 30.620182037353516, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8639870285987854, + "num_tokens": 710769651.0, + "step": 18628 + }, + { + "epoch": 2.3698002798626128, + "ewc_loss": 0.04832156002521515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.416077950329054e-05, + "grad_norm": 30.71625328063965, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8703678846359253, + "num_tokens": 710809487.0, + "step": 18629 + }, + { + "epoch": 2.3699274901412033, + "ewc_loss": 0.048397939652204514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4198969185817987e-05, + "grad_norm": 30.72636604309082, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.874968409538269, + "num_tokens": 710849152.0, + "step": 18630 + }, + { + "epoch": 2.370054700419794, + "ewc_loss": 0.04831644147634506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4158220185199752e-05, + "grad_norm": 30.676408767700195, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8581212759017944, + "num_tokens": 710890933.0, + "step": 18631 + }, + { + "epoch": 2.3701819106983844, + "ewc_loss": 0.04834923893213272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4174620193662122e-05, + "grad_norm": 30.743921279907227, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8756899833679199, + "num_tokens": 710928262.0, + "step": 18632 + }, + { + "epoch": 2.370309120976975, + "ewc_loss": 0.0483795627951622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4189781470340677e-05, + "grad_norm": 30.743728637695312, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8770825862884521, + "num_tokens": 710962969.0, + "step": 18633 + }, + { + "epoch": 2.3704363312555654, + "ewc_loss": 0.04828774556517601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4143871996784583e-05, + "grad_norm": 30.60755729675293, + "learning_rate": 1e-06, + "loss": 0.42, + 
"mean_token_accuracy": 0.8674144744873047, + "num_tokens": 710998717.0, + "step": 18634 + }, + { + "epoch": 2.370563541534156, + "ewc_loss": 0.04828055948019028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414027949271258e-05, + "grad_norm": 30.68004035949707, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8572812080383301, + "num_tokens": 711032819.0, + "step": 18635 + }, + { + "epoch": 2.3706907518127465, + "ewc_loss": 0.0483471043407917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417355244688224e-05, + "grad_norm": 30.595687866210938, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8613249659538269, + "num_tokens": 711069395.0, + "step": 18636 + }, + { + "epoch": 2.370817962091337, + "ewc_loss": 0.04830204322934151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.415102244412992e-05, + "grad_norm": 30.61524772644043, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8737059831619263, + "num_tokens": 711105754.0, + "step": 18637 + }, + { + "epoch": 2.3709451723699275, + "ewc_loss": 0.04846363887190819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4231820134446025e-05, + "grad_norm": 30.62498664855957, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8665384650230408, + "num_tokens": 711145291.0, + "step": 18638 + }, + { + "epoch": 2.371072382648518, + "ewc_loss": 0.048380061984062195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4190030671888962e-05, + "grad_norm": 30.594892501831055, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8725374937057495, + "num_tokens": 711183431.0, + "step": 18639 + }, + { + "epoch": 2.3711995929271086, + "ewc_loss": 0.04844806343317032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.422403122182004e-05, + "grad_norm": 30.67828369140625, + "learning_rate": 1e-06, + "loss": 0.4619, + "mean_token_accuracy": 0.8572564125061035, + "num_tokens": 711229754.0, + "step": 18640 + }, + { + "epoch": 
2.371326803205699, + "ewc_loss": 0.04847009479999542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4235047021647915e-05, + "grad_norm": 30.764585494995117, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8729926347732544, + "num_tokens": 711269071.0, + "step": 18641 + }, + { + "epoch": 2.3714540134842896, + "ewc_loss": 0.04838811606168747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4194057914428413e-05, + "grad_norm": 30.688358306884766, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8683046102523804, + "num_tokens": 711312036.0, + "step": 18642 + }, + { + "epoch": 2.37158122376288, + "ewc_loss": 0.04836597293615341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418298572592903e-05, + "grad_norm": 30.713741302490234, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8790931701660156, + "num_tokens": 711347945.0, + "step": 18643 + }, + { + "epoch": 2.3717084340414707, + "ewc_loss": 0.048370447009801865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418522308289539e-05, + "grad_norm": 30.652318954467773, + "learning_rate": 1e-06, + "loss": 0.4897, + "mean_token_accuracy": 0.8500317335128784, + "num_tokens": 711388433.0, + "step": 18644 + }, + { + "epoch": 2.371835644320061, + "ewc_loss": 0.04831154644489288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4155773644451983e-05, + "grad_norm": 30.63043212890625, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8756218552589417, + "num_tokens": 711425821.0, + "step": 18645 + }, + { + "epoch": 2.3719628545986517, + "ewc_loss": 0.04834086075425148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4170431061065756e-05, + "grad_norm": 30.62056541442871, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.8652163743972778, + "num_tokens": 711462725.0, + "step": 18646 + }, + { + "epoch": 2.3720900648772423, + "ewc_loss": 0.04838726669549942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4193634089897387e-05, + "grad_norm": 30.738494873046875, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8857423663139343, + "num_tokens": 711499987.0, + "step": 18647 + }, + { + "epoch": 2.3722172751558324, + "ewc_loss": 0.0483613982796669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4180699256248772e-05, + "grad_norm": 30.590654373168945, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.85673588514328, + "num_tokens": 711543658.0, + "step": 18648 + }, + { + "epoch": 2.3723444854344233, + "ewc_loss": 0.0482938177883625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4146909709088504e-05, + "grad_norm": 30.619827270507812, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8834895491600037, + "num_tokens": 711583875.0, + "step": 18649 + }, + { + "epoch": 2.3724716957130134, + "ewc_loss": 0.04841415956616402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4207080059568398e-05, + "grad_norm": 30.738277435302734, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8539432287216187, + "num_tokens": 711625543.0, + "step": 18650 + }, + { + "epoch": 2.372598905991604, + "ewc_loss": 0.04839792475104332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4198961909860373e-05, + "grad_norm": 30.63033103942871, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.874654233455658, + "num_tokens": 711660528.0, + "step": 18651 + }, + { + "epoch": 2.3727261162701945, + "ewc_loss": 0.048388902097940445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419445081613958e-05, + "grad_norm": 30.622865676879883, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8872517347335815, + "num_tokens": 711695646.0, + "step": 18652 + }, + { + "epoch": 2.372853326548785, + "ewc_loss": 0.04845895245671272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4229475457104854e-05, + "grad_norm": 30.70152473449707, + "learning_rate": 1e-06, + "loss": 0.4676, + 
"mean_token_accuracy": 0.8570317029953003, + "num_tokens": 711734691.0, + "step": 18653 + }, + { + "epoch": 2.3729805368273755, + "ewc_loss": 0.048283252865076065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4141625544871204e-05, + "grad_norm": 30.631250381469727, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8743187785148621, + "num_tokens": 711771518.0, + "step": 18654 + }, + { + "epoch": 2.373107747105966, + "ewc_loss": 0.04839172586798668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419586235191673e-05, + "grad_norm": 30.745609283447266, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8614935874938965, + "num_tokens": 711808968.0, + "step": 18655 + }, + { + "epoch": 2.3732349573845566, + "ewc_loss": 0.04833647608757019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.416823735984508e-05, + "grad_norm": 30.55024528503418, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8798207640647888, + "num_tokens": 711842944.0, + "step": 18656 + }, + { + "epoch": 2.373362167663147, + "ewc_loss": 0.048347778618335724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4173888959921896e-05, + "grad_norm": 30.695552825927734, + "learning_rate": 1e-06, + "loss": 0.4268, + "mean_token_accuracy": 0.8679912686347961, + "num_tokens": 711882740.0, + "step": 18657 + }, + { + "epoch": 2.3734893779417376, + "ewc_loss": 0.048445820808410645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4222910724347457e-05, + "grad_norm": 30.65203857421875, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8646246194839478, + "num_tokens": 711925362.0, + "step": 18658 + }, + { + "epoch": 2.373616588220328, + "ewc_loss": 0.048351678997278214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4175838916562498e-05, + "grad_norm": 30.711912155151367, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8760021924972534, + "num_tokens": 711971285.0, + "step": 18659 + }, + { + "epoch": 
2.3737437984989187, + "ewc_loss": 0.04838642105460167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419321026536636e-05, + "grad_norm": 30.638792037963867, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8668256402015686, + "num_tokens": 712013306.0, + "step": 18660 + }, + { + "epoch": 2.3738710087775092, + "ewc_loss": 0.04835077375173569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4175387807190418e-05, + "grad_norm": 30.668899536132812, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8728432655334473, + "num_tokens": 712050837.0, + "step": 18661 + }, + { + "epoch": 2.3739982190560998, + "ewc_loss": 0.04842573404312134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4212866264861077e-05, + "grad_norm": 30.784318923950195, + "learning_rate": 1e-06, + "loss": 0.3718, + "mean_token_accuracy": 0.887652575969696, + "num_tokens": 712085140.0, + "step": 18662 + }, + { + "epoch": 2.3741254293346903, + "ewc_loss": 0.04836022108793259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4180109903682023e-05, + "grad_norm": 30.676244735717773, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8844864368438721, + "num_tokens": 712122495.0, + "step": 18663 + }, + { + "epoch": 2.374252639613281, + "ewc_loss": 0.048332542181015015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4166271032299846e-05, + "grad_norm": 30.668968200683594, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8655434250831604, + "num_tokens": 712161312.0, + "step": 18664 + }, + { + "epoch": 2.3743798498918713, + "ewc_loss": 0.0484030619263649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.420153032289818e-05, + "grad_norm": 30.766239166259766, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8701416850090027, + "num_tokens": 712197275.0, + "step": 18665 + }, + { + "epoch": 2.374507060170462, + "ewc_loss": 0.04833661764860153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4168308300431818e-05, + "grad_norm": 30.678068161010742, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.858141303062439, + "num_tokens": 712231309.0, + "step": 18666 + }, + { + "epoch": 2.3746342704490524, + "ewc_loss": 0.0483865812420845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4193290300900117e-05, + "grad_norm": 30.846851348876953, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8670409917831421, + "num_tokens": 712264864.0, + "step": 18667 + }, + { + "epoch": 2.374761480727643, + "ewc_loss": 0.048357076942920685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417853829683736e-05, + "grad_norm": 30.752243041992188, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8715289831161499, + "num_tokens": 712303564.0, + "step": 18668 + }, + { + "epoch": 2.3748886910062335, + "ewc_loss": 0.04824594408273697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412297180853784e-05, + "grad_norm": 30.69962501525879, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8822774887084961, + "num_tokens": 712341872.0, + "step": 18669 + }, + { + "epoch": 2.375015901284824, + "ewc_loss": 0.04831521585583687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4157607185770757e-05, + "grad_norm": 30.718748092651367, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8602533340454102, + "num_tokens": 712379835.0, + "step": 18670 + }, + { + "epoch": 2.3751431115634145, + "ewc_loss": 0.04825252667069435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412626417935826e-05, + "grad_norm": 30.7110538482666, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8679475784301758, + "num_tokens": 712417462.0, + "step": 18671 + }, + { + "epoch": 2.375270321842005, + "ewc_loss": 0.04834999889135361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4175000362447463e-05, + "grad_norm": 30.76812171936035, + "learning_rate": 1e-06, + "loss": 0.5101, + 
"mean_token_accuracy": 0.8454686403274536, + "num_tokens": 712461581.0, + "step": 18672 + }, + { + "epoch": 2.375397532120595, + "ewc_loss": 0.04832470044493675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4162351110135205e-05, + "grad_norm": 30.75094985961914, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.870510458946228, + "num_tokens": 712499615.0, + "step": 18673 + }, + { + "epoch": 2.375524742399186, + "ewc_loss": 0.04828455671668053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4142278562067077e-05, + "grad_norm": 30.706369400024414, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8823444843292236, + "num_tokens": 712533264.0, + "step": 18674 + }, + { + "epoch": 2.375651952677776, + "ewc_loss": 0.04832882434129715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4164412025129423e-05, + "grad_norm": 30.806013107299805, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8639516830444336, + "num_tokens": 712571496.0, + "step": 18675 + }, + { + "epoch": 2.3757791629563667, + "ewc_loss": 0.04828345775604248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4141729227267206e-05, + "grad_norm": 30.629819869995117, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8781839609146118, + "num_tokens": 712606510.0, + "step": 18676 + }, + { + "epoch": 2.3759063732349572, + "ewc_loss": 0.04833351820707321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4166758521459997e-05, + "grad_norm": 30.83234214782715, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8800867795944214, + "num_tokens": 712646218.0, + "step": 18677 + }, + { + "epoch": 2.3760335835135478, + "ewc_loss": 0.048305101692676544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.41525503952289e-05, + "grad_norm": 30.717796325683594, + "learning_rate": 1e-06, + "loss": 0.5218, + "mean_token_accuracy": 0.8402865529060364, + "num_tokens": 712681905.0, + "step": 18678 + }, + { + "epoch": 
2.3761607937921383, + "ewc_loss": 0.04829036444425583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4145181669155136e-05, + "grad_norm": 30.722148895263672, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8837566375732422, + "num_tokens": 712719916.0, + "step": 18679 + }, + { + "epoch": 2.376288004070729, + "ewc_loss": 0.04847349599003792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423674777674023e-05, + "grad_norm": 30.776355743408203, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8715084791183472, + "num_tokens": 712760849.0, + "step": 18680 + }, + { + "epoch": 2.3764152143493193, + "ewc_loss": 0.0482349656522274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.411748209851794e-05, + "grad_norm": 30.62583351135254, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8830101490020752, + "num_tokens": 712801199.0, + "step": 18681 + }, + { + "epoch": 2.37654242462791, + "ewc_loss": 0.048376310616731644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4188155293813907e-05, + "grad_norm": 30.811431884765625, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8656959533691406, + "num_tokens": 712843471.0, + "step": 18682 + }, + { + "epoch": 2.3766696349065004, + "ewc_loss": 0.048322394490242004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4161197870853357e-05, + "grad_norm": 30.634288787841797, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8722532391548157, + "num_tokens": 712880649.0, + "step": 18683 + }, + { + "epoch": 2.376796845185091, + "ewc_loss": 0.04836426302790642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4182130800909363e-05, + "grad_norm": 30.700925827026367, + "learning_rate": 1e-06, + "loss": 0.4858, + "mean_token_accuracy": 0.8482472896575928, + "num_tokens": 712920574.0, + "step": 18684 + }, + { + "epoch": 2.3769240554636815, + "ewc_loss": 0.04844556003808975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4222779757110402e-05, + "grad_norm": 30.792110443115234, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8804389238357544, + "num_tokens": 712962544.0, + "step": 18685 + }, + { + "epoch": 2.377051265742272, + "ewc_loss": 0.04829683154821396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.414841583231464e-05, + "grad_norm": 30.655559539794922, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8751509189605713, + "num_tokens": 712999138.0, + "step": 18686 + }, + { + "epoch": 2.3771784760208625, + "ewc_loss": 0.048318419605493546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.415920971543528e-05, + "grad_norm": 30.612363815307617, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8726492524147034, + "num_tokens": 713041799.0, + "step": 18687 + }, + { + "epoch": 2.377305686299453, + "ewc_loss": 0.04832058399915695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.416029201413039e-05, + "grad_norm": 30.71458625793457, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8638712763786316, + "num_tokens": 713088248.0, + "step": 18688 + }, + { + "epoch": 2.3774328965780436, + "ewc_loss": 0.04836685210466385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4183425921364687e-05, + "grad_norm": 30.490758895874023, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8807612061500549, + "num_tokens": 713127239.0, + "step": 18689 + }, + { + "epoch": 2.377560106856634, + "ewc_loss": 0.04837292805314064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418646363366861e-05, + "grad_norm": 30.772815704345703, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.8749262094497681, + "num_tokens": 713168389.0, + "step": 18690 + }, + { + "epoch": 2.3776873171352246, + "ewc_loss": 0.04852859303355217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4264296371256933e-05, + "grad_norm": 30.714717864990234, + "learning_rate": 1e-06, + "loss": 0.4431, + 
"mean_token_accuracy": 0.8643108606338501, + "num_tokens": 713208137.0, + "step": 18691 + }, + { + "epoch": 2.377814527413815, + "ewc_loss": 0.048243556171655655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4121778551489115e-05, + "grad_norm": 30.676130294799805, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8621976375579834, + "num_tokens": 713249344.0, + "step": 18692 + }, + { + "epoch": 2.3779417376924057, + "ewc_loss": 0.04840857908129692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.420428972982336e-05, + "grad_norm": 30.663145065307617, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8684385418891907, + "num_tokens": 713291376.0, + "step": 18693 + }, + { + "epoch": 2.378068947970996, + "ewc_loss": 0.048344168812036514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417208452243358e-05, + "grad_norm": 30.777908325195312, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8694906234741211, + "num_tokens": 713329925.0, + "step": 18694 + }, + { + "epoch": 2.3781961582495867, + "ewc_loss": 0.04829162731766701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4145812858478166e-05, + "grad_norm": 30.512042999267578, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8726022839546204, + "num_tokens": 713372509.0, + "step": 18695 + }, + { + "epoch": 2.378323368528177, + "ewc_loss": 0.0483318530023098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4165927243302576e-05, + "grad_norm": 30.822059631347656, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8660380840301514, + "num_tokens": 713410055.0, + "step": 18696 + }, + { + "epoch": 2.378450578806768, + "ewc_loss": 0.04845612868666649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4228063921327703e-05, + "grad_norm": 30.658164978027344, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8696291446685791, + "num_tokens": 713453958.0, + "step": 18697 + }, + { + "epoch": 
2.378577789085358, + "ewc_loss": 0.04825849086046219, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412924550299067e-05, + "grad_norm": 30.74603271484375, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8713247179985046, + "num_tokens": 713485683.0, + "step": 18698 + }, + { + "epoch": 2.3787049993639484, + "ewc_loss": 0.048339955508708954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4169978132704273e-05, + "grad_norm": 30.65230941772461, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.871854841709137, + "num_tokens": 713528565.0, + "step": 18699 + }, + { + "epoch": 2.378832209642539, + "ewc_loss": 0.04830361530184746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4151808247552253e-05, + "grad_norm": 30.71257972717285, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8682201504707336, + "num_tokens": 713560932.0, + "step": 18700 + }, + { + "epoch": 2.3789594199211295, + "ewc_loss": 0.04833687096834183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4168435629690066e-05, + "grad_norm": 30.71381950378418, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8677271604537964, + "num_tokens": 713592891.0, + "step": 18701 + }, + { + "epoch": 2.37908663019972, + "ewc_loss": 0.04832202568650246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4161012333934195e-05, + "grad_norm": 30.642396926879883, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8682408332824707, + "num_tokens": 713631285.0, + "step": 18702 + }, + { + "epoch": 2.3792138404783105, + "ewc_loss": 0.04828193783760071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4140968889696524e-05, + "grad_norm": 30.60161018371582, + "learning_rate": 1e-06, + "loss": 0.4875, + "mean_token_accuracy": 0.8538320660591125, + "num_tokens": 713675143.0, + "step": 18703 + }, + { + "epoch": 2.379341050756901, + "ewc_loss": 0.04840869829058647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4204349756473675e-05, + "grad_norm": 30.76187515258789, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8754493594169617, + "num_tokens": 713717556.0, + "step": 18704 + }, + { + "epoch": 2.3794682610354916, + "ewc_loss": 0.0483512207865715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417560972389765e-05, + "grad_norm": 30.565380096435547, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8636196851730347, + "num_tokens": 713763515.0, + "step": 18705 + }, + { + "epoch": 2.379595471314082, + "ewc_loss": 0.04838920012116432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419459997327067e-05, + "grad_norm": 30.72905158996582, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8756873607635498, + "num_tokens": 713801064.0, + "step": 18706 + }, + { + "epoch": 2.3797226815926726, + "ewc_loss": 0.048459142446517944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4229571863543242e-05, + "grad_norm": 30.691287994384766, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8682780861854553, + "num_tokens": 713841662.0, + "step": 18707 + }, + { + "epoch": 2.379849891871263, + "ewc_loss": 0.04836927354335785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418463736830745e-05, + "grad_norm": 30.67275047302246, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8739075660705566, + "num_tokens": 713883646.0, + "step": 18708 + }, + { + "epoch": 2.3799771021498537, + "ewc_loss": 0.0483703538775444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4185177608160302e-05, + "grad_norm": 30.606069564819336, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8913713693618774, + "num_tokens": 713925365.0, + "step": 18709 + }, + { + "epoch": 2.3801043124284442, + "ewc_loss": 0.04842442646622658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4212213247665204e-05, + "grad_norm": 30.706661224365234, + "learning_rate": 1e-06, + "loss": 0.4309, + 
"mean_token_accuracy": 0.86726975440979, + "num_tokens": 713961142.0, + "step": 18710 + }, + { + "epoch": 2.3802315227070348, + "ewc_loss": 0.04843536391854286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4217681129812263e-05, + "grad_norm": 30.656145095825195, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.878874659538269, + "num_tokens": 714001652.0, + "step": 18711 + }, + { + "epoch": 2.3803587329856253, + "ewc_loss": 0.048504605889320374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4252303774119355e-05, + "grad_norm": 30.760047912597656, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8736053705215454, + "num_tokens": 714043023.0, + "step": 18712 + }, + { + "epoch": 2.380485943264216, + "ewc_loss": 0.048487339168787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4243669031420723e-05, + "grad_norm": 30.731353759765625, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8708651661872864, + "num_tokens": 714080389.0, + "step": 18713 + }, + { + "epoch": 2.3806131535428063, + "ewc_loss": 0.04849148914217949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.424574449833017e-05, + "grad_norm": 30.738975524902344, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8753641247749329, + "num_tokens": 714117695.0, + "step": 18714 + }, + { + "epoch": 2.380740363821397, + "ewc_loss": 0.04844995215535164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4224975277320482e-05, + "grad_norm": 30.77631950378418, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8782839775085449, + "num_tokens": 714154204.0, + "step": 18715 + }, + { + "epoch": 2.3808675740999874, + "ewc_loss": 0.04834938049316406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417469113424886e-05, + "grad_norm": 30.743669509887695, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8542687892913818, + "num_tokens": 714194857.0, + "step": 18716 + }, + { + "epoch": 
2.380994784378578, + "ewc_loss": 0.048461396247148514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4230697817984037e-05, + "grad_norm": 30.757047653198242, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8822721838951111, + "num_tokens": 714228034.0, + "step": 18717 + }, + { + "epoch": 2.3811219946571685, + "ewc_loss": 0.04838702827692032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4193514036596753e-05, + "grad_norm": 30.78640365600586, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8769108057022095, + "num_tokens": 714269688.0, + "step": 18718 + }, + { + "epoch": 2.381249204935759, + "ewc_loss": 0.04836566746234894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418283293081913e-05, + "grad_norm": 30.711009979248047, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8656081557273865, + "num_tokens": 714309039.0, + "step": 18719 + }, + { + "epoch": 2.3813764152143495, + "ewc_loss": 0.048348963260650635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4174481950467452e-05, + "grad_norm": 30.75245475769043, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8770820498466492, + "num_tokens": 714347453.0, + "step": 18720 + }, + { + "epoch": 2.3815036254929396, + "ewc_loss": 0.048386164009571075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4193082936108112e-05, + "grad_norm": 30.705020904541016, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8688080310821533, + "num_tokens": 714385850.0, + "step": 18721 + }, + { + "epoch": 2.3816308357715306, + "ewc_loss": 0.048307787626981735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.415389462839812e-05, + "grad_norm": 30.77985382080078, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8658205270767212, + "num_tokens": 714426735.0, + "step": 18722 + }, + { + "epoch": 2.3817580460501206, + "ewc_loss": 0.04835567995905876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4177839804906398e-05, + "grad_norm": 30.70001983642578, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8832728862762451, + "num_tokens": 714465279.0, + "step": 18723 + }, + { + "epoch": 2.381885256328711, + "ewc_loss": 0.04824763163924217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4123815819621086e-05, + "grad_norm": 30.701509475708008, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8605272173881531, + "num_tokens": 714505703.0, + "step": 18724 + }, + { + "epoch": 2.3820124666073017, + "ewc_loss": 0.04835149645805359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417574796709232e-05, + "grad_norm": 30.719192504882812, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8648431301116943, + "num_tokens": 714543642.0, + "step": 18725 + }, + { + "epoch": 2.3821396768858922, + "ewc_loss": 0.0482589416205883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412947105767671e-05, + "grad_norm": 30.664257049560547, + "learning_rate": 1e-06, + "loss": 0.3696, + "mean_token_accuracy": 0.8877383470535278, + "num_tokens": 714583533.0, + "step": 18726 + }, + { + "epoch": 2.3822668871644828, + "ewc_loss": 0.0483328141272068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4166407456505112e-05, + "grad_norm": 30.741600036621094, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8759580254554749, + "num_tokens": 714625577.0, + "step": 18727 + }, + { + "epoch": 2.3823940974430733, + "ewc_loss": 0.048269566148519516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4134782506735064e-05, + "grad_norm": 30.736570358276367, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8676744699478149, + "num_tokens": 714664987.0, + "step": 18728 + }, + { + "epoch": 2.382521307721664, + "ewc_loss": 0.04838837310671806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4194187062676065e-05, + "grad_norm": 30.771312713623047, + "learning_rate": 1e-06, + "loss": 0.4315, + 
"mean_token_accuracy": 0.8652133941650391, + "num_tokens": 714705881.0, + "step": 18729 + }, + { + "epoch": 2.3826485180002543, + "ewc_loss": 0.0481925904750824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4096294509945437e-05, + "grad_norm": 30.763938903808594, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8619083762168884, + "num_tokens": 714746012.0, + "step": 18730 + }, + { + "epoch": 2.382775728278845, + "ewc_loss": 0.04832954704761505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4164774004020728e-05, + "grad_norm": 30.633939743041992, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8737292289733887, + "num_tokens": 714790001.0, + "step": 18731 + }, + { + "epoch": 2.3829029385574354, + "ewc_loss": 0.04821804165840149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4109020159812644e-05, + "grad_norm": 30.84811782836914, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.878868579864502, + "num_tokens": 714824515.0, + "step": 18732 + }, + { + "epoch": 2.383030148836026, + "ewc_loss": 0.04834670200943947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4173350539058447e-05, + "grad_norm": 30.677396774291992, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.864643931388855, + "num_tokens": 714860841.0, + "step": 18733 + }, + { + "epoch": 2.3831573591146165, + "ewc_loss": 0.04817396402359009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4086981284199283e-05, + "grad_norm": 30.771596908569336, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.873564600944519, + "num_tokens": 714896373.0, + "step": 18734 + }, + { + "epoch": 2.383284569393207, + "ewc_loss": 0.048400480300188065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4200240659411065e-05, + "grad_norm": 30.72010040283203, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8737170696258545, + "num_tokens": 714926684.0, + "step": 18735 + }, + { + "epoch": 
2.3834117796717975, + "ewc_loss": 0.04824487119913101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4122435206663795e-05, + "grad_norm": 30.817358016967773, + "learning_rate": 1e-06, + "loss": 0.3793, + "mean_token_accuracy": 0.8859255313873291, + "num_tokens": 714961057.0, + "step": 18736 + }, + { + "epoch": 2.383538989950388, + "ewc_loss": 0.048259884119033813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412994217593223e-05, + "grad_norm": 30.625873565673828, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8767511248588562, + "num_tokens": 715001215.0, + "step": 18737 + }, + { + "epoch": 2.3836662002289786, + "ewc_loss": 0.04825557395815849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4127786673489027e-05, + "grad_norm": 30.790481567382812, + "learning_rate": 1e-06, + "loss": 0.4817, + "mean_token_accuracy": 0.8525583744049072, + "num_tokens": 715038252.0, + "step": 18738 + }, + { + "epoch": 2.383793410507569, + "ewc_loss": 0.048295360058546066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4147679141606204e-05, + "grad_norm": 30.606903076171875, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8707730770111084, + "num_tokens": 715078320.0, + "step": 18739 + }, + { + "epoch": 2.3839206207861596, + "ewc_loss": 0.04822274670004845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4111373932100832e-05, + "grad_norm": 30.744375228881836, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8649629950523376, + "num_tokens": 715115628.0, + "step": 18740 + }, + { + "epoch": 2.38404783106475, + "ewc_loss": 0.04834730550646782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4173652491299435e-05, + "grad_norm": 30.705995559692383, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8814963102340698, + "num_tokens": 715153294.0, + "step": 18741 + }, + { + "epoch": 2.3841750413433407, + "ewc_loss": 0.048268552869558334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.413427682768088e-05, + "grad_norm": 30.556856155395508, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8717200756072998, + "num_tokens": 715193736.0, + "step": 18742 + }, + { + "epoch": 2.384302251621931, + "ewc_loss": 0.048396214842796326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4198106984840706e-05, + "grad_norm": 30.676462173461914, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8725085854530334, + "num_tokens": 715234539.0, + "step": 18743 + }, + { + "epoch": 2.3844294619005217, + "ewc_loss": 0.048418596386909485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.420929740765132e-05, + "grad_norm": 30.630435943603516, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.883517861366272, + "num_tokens": 715271567.0, + "step": 18744 + }, + { + "epoch": 2.3845566721791123, + "ewc_loss": 0.04835744947195053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417872383375652e-05, + "grad_norm": 30.5915584564209, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8832336068153381, + "num_tokens": 715314850.0, + "step": 18745 + }, + { + "epoch": 2.3846838824577024, + "ewc_loss": 0.04837951436638832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418975782347843e-05, + "grad_norm": 30.664308547973633, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8782839775085449, + "num_tokens": 715353353.0, + "step": 18746 + }, + { + "epoch": 2.3848110927362933, + "ewc_loss": 0.048335473984479904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4167737137759104e-05, + "grad_norm": 30.852636337280273, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8826563358306885, + "num_tokens": 715387843.0, + "step": 18747 + }, + { + "epoch": 2.3849383030148834, + "ewc_loss": 0.04833970591425896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4169852622435428e-05, + "grad_norm": 30.600749969482422, + "learning_rate": 1e-06, + "loss": 0.4486, + 
"mean_token_accuracy": 0.8592960834503174, + "num_tokens": 715428037.0, + "step": 18748 + }, + { + "epoch": 2.385065513293474, + "ewc_loss": 0.04838850721716881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4194254365283996e-05, + "grad_norm": 30.879369735717773, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.885054349899292, + "num_tokens": 715464403.0, + "step": 18749 + }, + { + "epoch": 2.3851927235720645, + "ewc_loss": 0.0484565831720829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4228291295003146e-05, + "grad_norm": 30.723628997802734, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8807104825973511, + "num_tokens": 715501820.0, + "step": 18750 + }, + { + "epoch": 2.385319933850655, + "ewc_loss": 0.048231467604637146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4115734049701132e-05, + "grad_norm": 30.85624885559082, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8822429180145264, + "num_tokens": 715543287.0, + "step": 18751 + }, + { + "epoch": 2.3854471441292455, + "ewc_loss": 0.04830548167228699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4152741389116272e-05, + "grad_norm": 30.6414794921875, + "learning_rate": 1e-06, + "loss": 0.382, + "mean_token_accuracy": 0.8807514905929565, + "num_tokens": 715578189.0, + "step": 18752 + }, + { + "epoch": 2.385574354407836, + "ewc_loss": 0.04834374785423279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4171873519662768e-05, + "grad_norm": 30.819101333618164, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.862175464630127, + "num_tokens": 715616447.0, + "step": 18753 + }, + { + "epoch": 2.3857015646864266, + "ewc_loss": 0.04834899306297302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417449650238268e-05, + "grad_norm": 30.701038360595703, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8760828971862793, + "num_tokens": 715651692.0, + "step": 18754 + }, + { + "epoch": 
2.385828774965017, + "ewc_loss": 0.0482746884226799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4137343643815257e-05, + "grad_norm": 30.714458465576172, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8642212748527527, + "num_tokens": 715687733.0, + "step": 18755 + }, + { + "epoch": 2.3859559852436076, + "ewc_loss": 0.04834364354610443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417182258795947e-05, + "grad_norm": 30.680992126464844, + "learning_rate": 1e-06, + "loss": 0.4771, + "mean_token_accuracy": 0.8490564823150635, + "num_tokens": 715727481.0, + "step": 18756 + }, + { + "epoch": 2.386083195522198, + "ewc_loss": 0.048355210572481155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417760515527334e-05, + "grad_norm": 30.65739631652832, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.8539628386497498, + "num_tokens": 715767549.0, + "step": 18757 + }, + { + "epoch": 2.3862104058007887, + "ewc_loss": 0.048370346426963806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4185173970181495e-05, + "grad_norm": 30.67796516418457, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8643723726272583, + "num_tokens": 715807251.0, + "step": 18758 + }, + { + "epoch": 2.386337616079379, + "ewc_loss": 0.048286400735378265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4143200789694674e-05, + "grad_norm": 30.666513442993164, + "learning_rate": 1e-06, + "loss": 0.476, + "mean_token_accuracy": 0.8558390140533447, + "num_tokens": 715846893.0, + "step": 18759 + }, + { + "epoch": 2.3864648263579697, + "ewc_loss": 0.0483480840921402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4174041755031794e-05, + "grad_norm": 30.650754928588867, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8583939075469971, + "num_tokens": 715883611.0, + "step": 18760 + }, + { + "epoch": 2.3865920366365603, + "ewc_loss": 0.048459503799676895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4229751943494193e-05, + "grad_norm": 30.698341369628906, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8727326989173889, + "num_tokens": 715915598.0, + "step": 18761 + }, + { + "epoch": 2.386719246915151, + "ewc_loss": 0.0483907088637352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419535485387314e-05, + "grad_norm": 30.673851013183594, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8790502548217773, + "num_tokens": 715956647.0, + "step": 18762 + }, + { + "epoch": 2.3868464571937413, + "ewc_loss": 0.04854442924261093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4272214432130568e-05, + "grad_norm": 30.862707138061523, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.878344714641571, + "num_tokens": 715996627.0, + "step": 18763 + }, + { + "epoch": 2.386973667472332, + "ewc_loss": 0.04843589663505554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4217948521254584e-05, + "grad_norm": 30.7016658782959, + "learning_rate": 1e-06, + "loss": 0.3507, + "mean_token_accuracy": 0.8923828601837158, + "num_tokens": 716027823.0, + "step": 18764 + }, + { + "epoch": 2.3871008777509224, + "ewc_loss": 0.04845646396279335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.422823126835283e-05, + "grad_norm": 30.86198616027832, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8576284646987915, + "num_tokens": 716067430.0, + "step": 18765 + }, + { + "epoch": 2.387228088029513, + "ewc_loss": 0.04838992655277252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419496377115138e-05, + "grad_norm": 30.71720314025879, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8644310832023621, + "num_tokens": 716109436.0, + "step": 18766 + }, + { + "epoch": 2.3873552983081034, + "ewc_loss": 0.04832446575164795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4162232875823975e-05, + "grad_norm": 30.742300033569336, + "learning_rate": 1e-06, + "loss": 0.4376, + 
"mean_token_accuracy": 0.8675835132598877, + "num_tokens": 716148157.0, + "step": 18767 + }, + { + "epoch": 2.387482508586694, + "ewc_loss": 0.04833092913031578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4165465219994076e-05, + "grad_norm": 30.601654052734375, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8728965520858765, + "num_tokens": 716184662.0, + "step": 18768 + }, + { + "epoch": 2.3876097188652845, + "ewc_loss": 0.048395391553640366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4197695893235505e-05, + "grad_norm": 30.766244888305664, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.864665687084198, + "num_tokens": 716227382.0, + "step": 18769 + }, + { + "epoch": 2.387736929143875, + "ewc_loss": 0.04845675081014633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.422837496851571e-05, + "grad_norm": 30.72710609436035, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8749855756759644, + "num_tokens": 716262236.0, + "step": 18770 + }, + { + "epoch": 2.387864139422465, + "ewc_loss": 0.04843074083328247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.421537101326976e-05, + "grad_norm": 30.667457580566406, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8796723484992981, + "num_tokens": 716295541.0, + "step": 18771 + }, + { + "epoch": 2.387991349701056, + "ewc_loss": 0.048539143055677414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4269571440527216e-05, + "grad_norm": 30.809833526611328, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8747153878211975, + "num_tokens": 716331796.0, + "step": 18772 + }, + { + "epoch": 2.388118559979646, + "ewc_loss": 0.048452865332365036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4226432287832722e-05, + "grad_norm": 30.581462860107422, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8618585467338562, + "num_tokens": 716369539.0, + "step": 18773 + }, + { + "epoch": 
2.3882457702582367, + "ewc_loss": 0.048439349979162216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.421967474219855e-05, + "grad_norm": 30.871801376342773, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8889316320419312, + "num_tokens": 716409806.0, + "step": 18774 + }, + { + "epoch": 2.3883729805368272, + "ewc_loss": 0.048499006778001785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4249502530437894e-05, + "grad_norm": 30.65826988220215, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8642010688781738, + "num_tokens": 716448191.0, + "step": 18775 + }, + { + "epoch": 2.3885001908154178, + "ewc_loss": 0.04839816689491272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419908378215041e-05, + "grad_norm": 30.89759063720703, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8730813264846802, + "num_tokens": 716486522.0, + "step": 18776 + }, + { + "epoch": 2.3886274010940083, + "ewc_loss": 0.04841751605272293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4208757167798467e-05, + "grad_norm": 30.67567253112793, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8728248476982117, + "num_tokens": 716519383.0, + "step": 18777 + }, + { + "epoch": 2.388754611372599, + "ewc_loss": 0.04824768751859665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.412384310446214e-05, + "grad_norm": 30.825603485107422, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8737526535987854, + "num_tokens": 716556136.0, + "step": 18778 + }, + { + "epoch": 2.3888818216511893, + "ewc_loss": 0.04860597103834152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4302986275870353e-05, + "grad_norm": 30.84547996520996, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8640406727790833, + "num_tokens": 716592346.0, + "step": 18779 + }, + { + "epoch": 2.38900903192978, + "ewc_loss": 0.048367299139499664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4183649657061324e-05, + "grad_norm": 30.653703689575195, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8655248880386353, + "num_tokens": 716633996.0, + "step": 18780 + }, + { + "epoch": 2.3891362422083704, + "ewc_loss": 0.048413388431072235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4206694433814846e-05, + "grad_norm": 30.74724769592285, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8672837018966675, + "num_tokens": 716669638.0, + "step": 18781 + }, + { + "epoch": 2.389263452486961, + "ewc_loss": 0.04843258112668991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4216291421907954e-05, + "grad_norm": 30.686851501464844, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8624551296234131, + "num_tokens": 716711194.0, + "step": 18782 + }, + { + "epoch": 2.3893906627655515, + "ewc_loss": 0.0484255850315094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.421279168629553e-05, + "grad_norm": 30.69965934753418, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8784515857696533, + "num_tokens": 716753402.0, + "step": 18783 + }, + { + "epoch": 2.389517873044142, + "ewc_loss": 0.04842768982052803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4213844881160185e-05, + "grad_norm": 30.675119400024414, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8734827041625977, + "num_tokens": 716794638.0, + "step": 18784 + }, + { + "epoch": 2.3896450833227325, + "ewc_loss": 0.04846832901239395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4234164811787196e-05, + "grad_norm": 30.73289680480957, + "learning_rate": 1e-06, + "loss": 0.4988, + "mean_token_accuracy": 0.8431532382965088, + "num_tokens": 716834003.0, + "step": 18785 + }, + { + "epoch": 2.389772293601323, + "ewc_loss": 0.048501282930374146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4250641217804514e-05, + "grad_norm": 30.687538146972656, + "learning_rate": 1e-06, + "loss": 0.4022, + 
"mean_token_accuracy": 0.8770753145217896, + "num_tokens": 716867349.0, + "step": 18786 + }, + { + "epoch": 2.3898995038799136, + "ewc_loss": 0.04857634752988815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428817424515728e-05, + "grad_norm": 30.805410385131836, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8705872297286987, + "num_tokens": 716904271.0, + "step": 18787 + }, + { + "epoch": 2.390026714158504, + "ewc_loss": 0.04853563383221626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4267816115752794e-05, + "grad_norm": 30.74224090576172, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8732912540435791, + "num_tokens": 716936165.0, + "step": 18788 + }, + { + "epoch": 2.3901539244370946, + "ewc_loss": 0.04833238944411278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4166194634744897e-05, + "grad_norm": 30.646263122558594, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8699297308921814, + "num_tokens": 716974011.0, + "step": 18789 + }, + { + "epoch": 2.390281134715685, + "ewc_loss": 0.04855496436357498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4277482225443237e-05, + "grad_norm": 30.748554229736328, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8709681630134583, + "num_tokens": 717012438.0, + "step": 18790 + }, + { + "epoch": 2.3904083449942757, + "ewc_loss": 0.04847268387675285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423634214210324e-05, + "grad_norm": 30.819408416748047, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.861907958984375, + "num_tokens": 717051755.0, + "step": 18791 + }, + { + "epoch": 2.390535555272866, + "ewc_loss": 0.04849417880177498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.424708873149939e-05, + "grad_norm": 30.691810607910156, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8710433840751648, + "num_tokens": 717090740.0, + "step": 18792 + }, + { + "epoch": 
2.3906627655514567, + "ewc_loss": 0.0484258197247982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4212909920606762e-05, + "grad_norm": 30.781021118164062, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8462802171707153, + "num_tokens": 717132033.0, + "step": 18793 + }, + { + "epoch": 2.390789975830047, + "ewc_loss": 0.0485113263130188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4255663447547704e-05, + "grad_norm": 30.655445098876953, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8777009844779968, + "num_tokens": 717168255.0, + "step": 18794 + }, + { + "epoch": 2.390917186108638, + "ewc_loss": 0.04845859482884407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4229297196143307e-05, + "grad_norm": 30.772235870361328, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8695591688156128, + "num_tokens": 717206863.0, + "step": 18795 + }, + { + "epoch": 2.391044396387228, + "ewc_loss": 0.048611756414175034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430587846902199e-05, + "grad_norm": 30.800451278686523, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8504042029380798, + "num_tokens": 717245374.0, + "step": 18796 + }, + { + "epoch": 2.3911716066658184, + "ewc_loss": 0.048434533178806305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4217266400228254e-05, + "grad_norm": 30.769865036010742, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8890057802200317, + "num_tokens": 717279821.0, + "step": 18797 + }, + { + "epoch": 2.391298816944409, + "ewc_loss": 0.048430029302835464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4215014491346665e-05, + "grad_norm": 30.69266700744629, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.863335371017456, + "num_tokens": 717318896.0, + "step": 18798 + }, + { + "epoch": 2.3914260272229995, + "ewc_loss": 0.04846508428454399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4232542273239233e-05, + "grad_norm": 30.739227294921875, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8617546558380127, + "num_tokens": 717361493.0, + "step": 18799 + }, + { + "epoch": 2.39155323750159, + "ewc_loss": 0.04846341535449028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4231707357103005e-05, + "grad_norm": 30.767396926879883, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8630790710449219, + "num_tokens": 717402801.0, + "step": 18800 + }, + { + "epoch": 2.3916804477801805, + "ewc_loss": 0.048391811549663544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4195906007662416e-05, + "grad_norm": 30.6212215423584, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8884563446044922, + "num_tokens": 717441275.0, + "step": 18801 + }, + { + "epoch": 2.391807658058771, + "ewc_loss": 0.04853720963001251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4268605557153933e-05, + "grad_norm": 30.853776931762695, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8585593700408936, + "num_tokens": 717477423.0, + "step": 18802 + }, + { + "epoch": 2.3919348683373616, + "ewc_loss": 0.048572637140750885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4286318875965662e-05, + "grad_norm": 30.76235008239746, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8641819953918457, + "num_tokens": 717512978.0, + "step": 18803 + }, + { + "epoch": 2.392062078615952, + "ewc_loss": 0.048376865684986115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418843359919265e-05, + "grad_norm": 30.584426879882812, + "learning_rate": 1e-06, + "loss": 0.3612, + "mean_token_accuracy": 0.8893222808837891, + "num_tokens": 717549710.0, + "step": 18804 + }, + { + "epoch": 2.3921892888945426, + "ewc_loss": 0.04849421605467796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4247108740382828e-05, + "grad_norm": 30.833267211914062, + "learning_rate": 1e-06, + "loss": 0.4349, + 
"mean_token_accuracy": 0.8664455413818359, + "num_tokens": 717586588.0, + "step": 18805 + }, + { + "epoch": 2.392316499173133, + "ewc_loss": 0.048489656299352646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.424482772767078e-05, + "grad_norm": 30.669557571411133, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8828913569450378, + "num_tokens": 717621507.0, + "step": 18806 + }, + { + "epoch": 2.3924437094517237, + "ewc_loss": 0.04849410057067871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4247050532721914e-05, + "grad_norm": 30.67569351196289, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8715707063674927, + "num_tokens": 717655915.0, + "step": 18807 + }, + { + "epoch": 2.392570919730314, + "ewc_loss": 0.048528026789426804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426401260890998e-05, + "grad_norm": 30.81640625, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.867715060710907, + "num_tokens": 717696357.0, + "step": 18808 + }, + { + "epoch": 2.3926981300089047, + "ewc_loss": 0.04843026399612427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4215132725657895e-05, + "grad_norm": 30.65488624572754, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8847612142562866, + "num_tokens": 717735397.0, + "step": 18809 + }, + { + "epoch": 2.3928253402874953, + "ewc_loss": 0.04845353588461876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4226768800872378e-05, + "grad_norm": 30.813261032104492, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8752487897872925, + "num_tokens": 717770081.0, + "step": 18810 + }, + { + "epoch": 2.392952550566086, + "ewc_loss": 0.048538170754909515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426908577035647e-05, + "grad_norm": 30.72075653076172, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8873978853225708, + "num_tokens": 717805262.0, + "step": 18811 + }, + { + "epoch": 
2.3930797608446763, + "ewc_loss": 0.04839639365673065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.419819611532148e-05, + "grad_norm": 30.620079040527344, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8701769709587097, + "num_tokens": 717841877.0, + "step": 18812 + }, + { + "epoch": 2.393206971123267, + "ewc_loss": 0.04846928268671036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4234641387010925e-05, + "grad_norm": 30.716842651367188, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8553372621536255, + "num_tokens": 717881435.0, + "step": 18813 + }, + { + "epoch": 2.3933341814018574, + "ewc_loss": 0.048482317477464676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.424115882604383e-05, + "grad_norm": 30.653169631958008, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8899751305580139, + "num_tokens": 717922240.0, + "step": 18814 + }, + { + "epoch": 2.393461391680448, + "ewc_loss": 0.04852119833230972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4260598365799524e-05, + "grad_norm": 30.7672061920166, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8677477836608887, + "num_tokens": 717958438.0, + "step": 18815 + }, + { + "epoch": 2.3935886019590384, + "ewc_loss": 0.04849463701248169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4247317924164236e-05, + "grad_norm": 30.664030075073242, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.862829864025116, + "num_tokens": 717994451.0, + "step": 18816 + }, + { + "epoch": 2.393715812237629, + "ewc_loss": 0.04852084815502167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4260423742816783e-05, + "grad_norm": 30.738624572753906, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8725121021270752, + "num_tokens": 718034318.0, + "step": 18817 + }, + { + "epoch": 2.3938430225162195, + "ewc_loss": 0.04849659278988838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4248296540463343e-05, + "grad_norm": 30.66903305053711, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8701475858688354, + "num_tokens": 718074994.0, + "step": 18818 + }, + { + "epoch": 2.3939702327948096, + "ewc_loss": 0.048573531210422516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4286766347358935e-05, + "grad_norm": 30.830425262451172, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8689352869987488, + "num_tokens": 718118849.0, + "step": 18819 + }, + { + "epoch": 2.3940974430734006, + "ewc_loss": 0.04845153167843819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4225766537711024e-05, + "grad_norm": 30.643108367919922, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8769981861114502, + "num_tokens": 718152874.0, + "step": 18820 + }, + { + "epoch": 2.3942246533519906, + "ewc_loss": 0.04854222759604454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4271113943541422e-05, + "grad_norm": 30.842960357666016, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8635830283164978, + "num_tokens": 718194206.0, + "step": 18821 + }, + { + "epoch": 2.394351863630581, + "ewc_loss": 0.04861481487751007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4307408239110373e-05, + "grad_norm": 30.84398078918457, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8741492033004761, + "num_tokens": 718227408.0, + "step": 18822 + }, + { + "epoch": 2.3944790739091717, + "ewc_loss": 0.0484609454870224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4230472263297997e-05, + "grad_norm": 30.719871520996094, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8650712966918945, + "num_tokens": 718266962.0, + "step": 18823 + }, + { + "epoch": 2.3946062841877622, + "ewc_loss": 0.04852711781859398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4263559680548497e-05, + "grad_norm": 30.846513748168945, + "learning_rate": 1e-06, + "loss": 0.4014, + 
"mean_token_accuracy": 0.8758704662322998, + "num_tokens": 718307873.0, + "step": 18824 + }, + { + "epoch": 2.3947334944663528, + "ewc_loss": 0.04846079275012016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423039586574305e-05, + "grad_norm": 30.669870376586914, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8767702579498291, + "num_tokens": 718350296.0, + "step": 18825 + }, + { + "epoch": 2.3948607047449433, + "ewc_loss": 0.04844777658581734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.422388752165716e-05, + "grad_norm": 30.770410537719727, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8736268877983093, + "num_tokens": 718390734.0, + "step": 18826 + }, + { + "epoch": 2.394987915023534, + "ewc_loss": 0.04853735491633415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4268678316730075e-05, + "grad_norm": 30.775127410888672, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8869056701660156, + "num_tokens": 718435630.0, + "step": 18827 + }, + { + "epoch": 2.3951151253021243, + "ewc_loss": 0.048468925058841705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4234463126049377e-05, + "grad_norm": 30.812652587890625, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8780950307846069, + "num_tokens": 718474791.0, + "step": 18828 + }, + { + "epoch": 2.395242335580715, + "ewc_loss": 0.04847513884305954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4237569959950633e-05, + "grad_norm": 30.843908309936523, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8536449670791626, + "num_tokens": 718514957.0, + "step": 18829 + }, + { + "epoch": 2.3953695458593054, + "ewc_loss": 0.04846208915114403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4231045244960114e-05, + "grad_norm": 30.86347770690918, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8832184076309204, + "num_tokens": 718551761.0, + "step": 18830 + }, + { + "epoch": 
2.395496756137896, + "ewc_loss": 0.04834868386387825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.417434188828338e-05, + "grad_norm": 30.888126373291016, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8697143197059631, + "num_tokens": 718586445.0, + "step": 18831 + }, + { + "epoch": 2.3956239664164865, + "ewc_loss": 0.04836808145046234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4184040739783086e-05, + "grad_norm": 30.6817684173584, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8666189908981323, + "num_tokens": 718631105.0, + "step": 18832 + }, + { + "epoch": 2.395751176695077, + "ewc_loss": 0.04837029427289963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4185146685340442e-05, + "grad_norm": 30.768014907836914, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.867766261100769, + "num_tokens": 718672278.0, + "step": 18833 + }, + { + "epoch": 2.3958783869736675, + "ewc_loss": 0.04848560318350792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.424280137347523e-05, + "grad_norm": 30.898521423339844, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.87682044506073, + "num_tokens": 718713191.0, + "step": 18834 + }, + { + "epoch": 2.396005597252258, + "ewc_loss": 0.048330724239349365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4165361537598073e-05, + "grad_norm": 30.85240364074707, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8656065464019775, + "num_tokens": 718745012.0, + "step": 18835 + }, + { + "epoch": 2.3961328075308486, + "ewc_loss": 0.048365991562604904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.418299663986545e-05, + "grad_norm": 30.709686279296875, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8714603781700134, + "num_tokens": 718780149.0, + "step": 18836 + }, + { + "epoch": 2.396260017809439, + "ewc_loss": 0.04836491495370865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4182458219002e-05, + "grad_norm": 30.858585357666016, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8666232228279114, + "num_tokens": 718812831.0, + "step": 18837 + }, + { + "epoch": 2.3963872280880296, + "ewc_loss": 0.04846265912055969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4231329007307068e-05, + "grad_norm": 30.942012786865234, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8808174133300781, + "num_tokens": 718847650.0, + "step": 18838 + }, + { + "epoch": 2.39651443836662, + "ewc_loss": 0.04837816581130028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4189082978409715e-05, + "grad_norm": 30.711448669433594, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8726320266723633, + "num_tokens": 718883417.0, + "step": 18839 + }, + { + "epoch": 2.3966416486452107, + "ewc_loss": 0.04837248474359512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4186241716961376e-05, + "grad_norm": 30.740495681762695, + "learning_rate": 1e-06, + "loss": 0.3368, + "mean_token_accuracy": 0.8960779309272766, + "num_tokens": 718915501.0, + "step": 18840 + }, + { + "epoch": 2.396768858923801, + "ewc_loss": 0.04850368946790695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425184538878966e-05, + "grad_norm": 30.90621566772461, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.885042130947113, + "num_tokens": 718954485.0, + "step": 18841 + }, + { + "epoch": 2.3968960692023917, + "ewc_loss": 0.048488717526197433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4244358428404666e-05, + "grad_norm": 30.733409881591797, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.857799768447876, + "num_tokens": 718995815.0, + "step": 18842 + }, + { + "epoch": 2.3970232794809823, + "ewc_loss": 0.04843517020344734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4217584723373875e-05, + "grad_norm": 30.881155014038086, + "learning_rate": 1e-06, + "loss": 0.3561, + 
"mean_token_accuracy": 0.8855571150779724, + "num_tokens": 719032820.0, + "step": 18843 + }, + { + "epoch": 2.3971504897595723, + "ewc_loss": 0.04846004769206047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423002297291532e-05, + "grad_norm": 30.7534122467041, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8697523474693298, + "num_tokens": 719066966.0, + "step": 18844 + }, + { + "epoch": 2.3972777000381633, + "ewc_loss": 0.04834600165486336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4173001293092966e-05, + "grad_norm": 30.681529998779297, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8706740140914917, + "num_tokens": 719108768.0, + "step": 18845 + }, + { + "epoch": 2.3974049103167534, + "ewc_loss": 0.0484820231795311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4241011487902142e-05, + "grad_norm": 30.753610610961914, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8683156967163086, + "num_tokens": 719151451.0, + "step": 18846 + }, + { + "epoch": 2.397532120595344, + "ewc_loss": 0.04847746342420578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.42387322941795e-05, + "grad_norm": 30.804203033447266, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8637881875038147, + "num_tokens": 719192103.0, + "step": 18847 + }, + { + "epoch": 2.3976593308739345, + "ewc_loss": 0.04848930984735489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4244654923677444e-05, + "grad_norm": 30.790874481201172, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8672393560409546, + "num_tokens": 719226514.0, + "step": 18848 + }, + { + "epoch": 2.397786541152525, + "ewc_loss": 0.04840657487511635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4203287466662005e-05, + "grad_norm": 30.627920150756836, + "learning_rate": 1e-06, + "loss": 0.3422, + "mean_token_accuracy": 0.8950204849243164, + "num_tokens": 719262944.0, + "step": 18849 + }, + { + "epoch": 
2.3979137514311155, + "ewc_loss": 0.048448216170072556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.422410761937499e-05, + "grad_norm": 30.720169067382812, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8766836524009705, + "num_tokens": 719306707.0, + "step": 18850 + }, + { + "epoch": 2.398040961709706, + "ewc_loss": 0.04853177070617676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4265886167995632e-05, + "grad_norm": 30.825048446655273, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8782564401626587, + "num_tokens": 719347523.0, + "step": 18851 + }, + { + "epoch": 2.3981681719882966, + "ewc_loss": 0.04846341907978058, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423170917609241e-05, + "grad_norm": 30.721424102783203, + "learning_rate": 1e-06, + "loss": 0.3499, + "mean_token_accuracy": 0.8903261423110962, + "num_tokens": 719379474.0, + "step": 18852 + }, + { + "epoch": 2.398295382266887, + "ewc_loss": 0.048509903252124786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4254952222690918e-05, + "grad_norm": 30.823631286621094, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8566787242889404, + "num_tokens": 719421624.0, + "step": 18853 + }, + { + "epoch": 2.3984225925454776, + "ewc_loss": 0.048558615148067474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4279306671814993e-05, + "grad_norm": 30.86061668395996, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8673714995384216, + "num_tokens": 719455570.0, + "step": 18854 + }, + { + "epoch": 2.398549802824068, + "ewc_loss": 0.048461515456438065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4230757844634354e-05, + "grad_norm": 30.89024543762207, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8878301382064819, + "num_tokens": 719500300.0, + "step": 18855 + }, + { + "epoch": 2.3986770131026587, + "ewc_loss": 0.04846465587615967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4232327632489614e-05, + "grad_norm": 30.96908187866211, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8560752272605896, + "num_tokens": 719538119.0, + "step": 18856 + }, + { + "epoch": 2.398804223381249, + "ewc_loss": 0.04848552495241165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4242763174697757e-05, + "grad_norm": 30.83713722229004, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8604273200035095, + "num_tokens": 719574012.0, + "step": 18857 + }, + { + "epoch": 2.3989314336598397, + "ewc_loss": 0.04840385541319847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4201926862588152e-05, + "grad_norm": 30.730098724365234, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8834025263786316, + "num_tokens": 719609548.0, + "step": 18858 + }, + { + "epoch": 2.3990586439384303, + "ewc_loss": 0.04843844473361969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4219221813837066e-05, + "grad_norm": 30.811906814575195, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8649874925613403, + "num_tokens": 719647280.0, + "step": 18859 + }, + { + "epoch": 2.399185854217021, + "ewc_loss": 0.04852893203496933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4264465537271462e-05, + "grad_norm": 30.8199462890625, + "learning_rate": 1e-06, + "loss": 0.3683, + "mean_token_accuracy": 0.8866415619850159, + "num_tokens": 719686987.0, + "step": 18860 + }, + { + "epoch": 2.3993130644956113, + "ewc_loss": 0.04851987957954407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425993989163544e-05, + "grad_norm": 30.85446548461914, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8899222016334534, + "num_tokens": 719727974.0, + "step": 18861 + }, + { + "epoch": 2.399440274774202, + "ewc_loss": 0.04842875152826309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.421437602606602e-05, + "grad_norm": 30.66545295715332, + "learning_rate": 1e-06, + "loss": 0.4143, + 
"mean_token_accuracy": 0.8719969987869263, + "num_tokens": 719764308.0, + "step": 18862 + }, + { + "epoch": 2.3995674850527924, + "ewc_loss": 0.04854045808315277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.42702299146913e-05, + "grad_norm": 30.77462387084961, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8607914447784424, + "num_tokens": 719801872.0, + "step": 18863 + }, + { + "epoch": 2.399694695331383, + "ewc_loss": 0.04855677857995033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4278389901155606e-05, + "grad_norm": 30.817724227905273, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8733196258544922, + "num_tokens": 719842504.0, + "step": 18864 + }, + { + "epoch": 2.3998219056099734, + "ewc_loss": 0.04836701229214668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4183505956898443e-05, + "grad_norm": 30.668275833129883, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8661728501319885, + "num_tokens": 719878885.0, + "step": 18865 + }, + { + "epoch": 2.399949115888564, + "ewc_loss": 0.04851607233285904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4258035409729928e-05, + "grad_norm": 30.770639419555664, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8550746440887451, + "num_tokens": 719922891.0, + "step": 18866 + }, + { + "epoch": 2.4000763261671545, + "ewc_loss": 0.04851914197206497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425957063678652e-05, + "grad_norm": 30.768489837646484, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8696773052215576, + "num_tokens": 719960113.0, + "step": 18867 + }, + { + "epoch": 2.400203536445745, + "ewc_loss": 0.04842206463217735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4211032723542303e-05, + "grad_norm": 30.674354553222656, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8715252876281738, + "num_tokens": 720006079.0, + "step": 18868 + }, + { + "epoch": 
2.400330746724335, + "ewc_loss": 0.048590246587991714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4295122784678824e-05, + "grad_norm": 30.89700698852539, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8595796823501587, + "num_tokens": 720042219.0, + "step": 18869 + }, + { + "epoch": 2.400457957002926, + "ewc_loss": 0.048525989055633545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4262993974843994e-05, + "grad_norm": 30.695316314697266, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8730484247207642, + "num_tokens": 720082574.0, + "step": 18870 + }, + { + "epoch": 2.400585167281516, + "ewc_loss": 0.048479363322257996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423968180664815e-05, + "grad_norm": 30.86360740661621, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8700627088546753, + "num_tokens": 720115583.0, + "step": 18871 + }, + { + "epoch": 2.4007123775601067, + "ewc_loss": 0.048576705157756805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4288352506118827e-05, + "grad_norm": 30.827444076538086, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8627161979675293, + "num_tokens": 720151350.0, + "step": 18872 + }, + { + "epoch": 2.400839587838697, + "ewc_loss": 0.048466674983501434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4233337171608582e-05, + "grad_norm": 30.863698959350586, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8628513813018799, + "num_tokens": 720187072.0, + "step": 18873 + }, + { + "epoch": 2.4009667981172877, + "ewc_loss": 0.0484599694609642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4229984774137847e-05, + "grad_norm": 30.734195709228516, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8746596574783325, + "num_tokens": 720222249.0, + "step": 18874 + }, + { + "epoch": 2.4010940083958783, + "ewc_loss": 0.0484829843044281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4241491701104678e-05, + "grad_norm": 30.743499755859375, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8699172139167786, + "num_tokens": 720264682.0, + "step": 18875 + }, + { + "epoch": 2.401221218674469, + "ewc_loss": 0.04848610237240791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4243050575023517e-05, + "grad_norm": 30.673364639282227, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8681179881095886, + "num_tokens": 720309935.0, + "step": 18876 + }, + { + "epoch": 2.4013484289530593, + "ewc_loss": 0.04846775904297829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423387923045084e-05, + "grad_norm": 30.68250274658203, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8652969598770142, + "num_tokens": 720343992.0, + "step": 18877 + }, + { + "epoch": 2.40147563923165, + "ewc_loss": 0.04864460974931717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4322305762325414e-05, + "grad_norm": 30.964534759521484, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8702129125595093, + "num_tokens": 720379707.0, + "step": 18878 + }, + { + "epoch": 2.4016028495102404, + "ewc_loss": 0.048658013343811035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4329006919288076e-05, + "grad_norm": 30.719358444213867, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8678141832351685, + "num_tokens": 720419796.0, + "step": 18879 + }, + { + "epoch": 2.401730059788831, + "ewc_loss": 0.04855510964989662, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427755498501938e-05, + "grad_norm": 30.669403076171875, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.873079776763916, + "num_tokens": 720460919.0, + "step": 18880 + }, + { + "epoch": 2.4018572700674214, + "ewc_loss": 0.048647817224264145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4323908291989937e-05, + "grad_norm": 30.806758880615234, + "learning_rate": 1e-06, + "loss": 0.4229, + 
"mean_token_accuracy": 0.8674260377883911, + "num_tokens": 720498403.0, + "step": 18881 + }, + { + "epoch": 2.401984480346012, + "ewc_loss": 0.04858434945344925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4292174202855676e-05, + "grad_norm": 30.62677001953125, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8718421459197998, + "num_tokens": 720534931.0, + "step": 18882 + }, + { + "epoch": 2.4021116906246025, + "ewc_loss": 0.04863681644201279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4318407668033615e-05, + "grad_norm": 30.943588256835938, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8754284381866455, + "num_tokens": 720574062.0, + "step": 18883 + }, + { + "epoch": 2.402238900903193, + "ewc_loss": 0.048634402453899384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4317201678059064e-05, + "grad_norm": 30.650663375854492, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8623135089874268, + "num_tokens": 720611552.0, + "step": 18884 + }, + { + "epoch": 2.4023661111817836, + "ewc_loss": 0.048556894063949585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4278446289827116e-05, + "grad_norm": 30.797025680541992, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8716835379600525, + "num_tokens": 720647886.0, + "step": 18885 + }, + { + "epoch": 2.402493321460374, + "ewc_loss": 0.04861370846629143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430685344734229e-05, + "grad_norm": 30.650617599487305, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8770348429679871, + "num_tokens": 720688034.0, + "step": 18886 + }, + { + "epoch": 2.4026205317389646, + "ewc_loss": 0.04862063005566597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431031498417724e-05, + "grad_norm": 30.79114532470703, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8779710531234741, + "num_tokens": 720727805.0, + "step": 18887 + }, + { + "epoch": 
2.402747742017555, + "ewc_loss": 0.048645053058862686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4322525860043243e-05, + "grad_norm": 30.802560806274414, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8734173774719238, + "num_tokens": 720759717.0, + "step": 18888 + }, + { + "epoch": 2.4028749522961457, + "ewc_loss": 0.04863861948251724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4319309886777773e-05, + "grad_norm": 30.83674430847168, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8648437857627869, + "num_tokens": 720790435.0, + "step": 18889 + }, + { + "epoch": 2.403002162574736, + "ewc_loss": 0.048597317188978195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4298658900079317e-05, + "grad_norm": 30.802106857299805, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.867242157459259, + "num_tokens": 720831715.0, + "step": 18890 + }, + { + "epoch": 2.4031293728533267, + "ewc_loss": 0.048661842942237854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.433092231513001e-05, + "grad_norm": 30.781030654907227, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8710647821426392, + "num_tokens": 720869598.0, + "step": 18891 + }, + { + "epoch": 2.403256583131917, + "ewc_loss": 0.048615798354148865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430789936624933e-05, + "grad_norm": 30.842561721801758, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8665685653686523, + "num_tokens": 720911122.0, + "step": 18892 + }, + { + "epoch": 2.403383793410508, + "ewc_loss": 0.04860591143369675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4302955353050493e-05, + "grad_norm": 30.729549407958984, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8755990862846375, + "num_tokens": 720952265.0, + "step": 18893 + }, + { + "epoch": 2.403511003689098, + "ewc_loss": 0.0486169196665287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4308459614985622e-05, + "grad_norm": 30.892314910888672, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8760037422180176, + "num_tokens": 720991191.0, + "step": 18894 + }, + { + "epoch": 2.4036382139676884, + "ewc_loss": 0.048571206629276276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428560401313007e-05, + "grad_norm": 30.632017135620117, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8809247016906738, + "num_tokens": 721032524.0, + "step": 18895 + }, + { + "epoch": 2.403765424246279, + "ewc_loss": 0.048589352518320084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4294677132274956e-05, + "grad_norm": 30.907289505004883, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.870847225189209, + "num_tokens": 721072699.0, + "step": 18896 + }, + { + "epoch": 2.4038926345248695, + "ewc_loss": 0.048634354025125504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4317176212207414e-05, + "grad_norm": 30.63791847229004, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8841143250465393, + "num_tokens": 721109467.0, + "step": 18897 + }, + { + "epoch": 2.40401984480346, + "ewc_loss": 0.048497848212718964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4248924091807567e-05, + "grad_norm": 30.91282844543457, + "learning_rate": 1e-06, + "loss": 0.5056, + "mean_token_accuracy": 0.8459917902946472, + "num_tokens": 721142108.0, + "step": 18898 + }, + { + "epoch": 2.4041470550820505, + "ewc_loss": 0.04872284457087517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4361423129448667e-05, + "grad_norm": 30.683330535888672, + "learning_rate": 1e-06, + "loss": 0.4468, + "mean_token_accuracy": 0.8646024465560913, + "num_tokens": 721179875.0, + "step": 18899 + }, + { + "epoch": 2.404274265360641, + "ewc_loss": 0.0485445037484169, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4272252630908042e-05, + "grad_norm": 30.894123077392578, + "learning_rate": 1e-06, + "loss": 0.43, + 
"mean_token_accuracy": 0.8680232763290405, + "num_tokens": 721217141.0, + "step": 18900 + }, + { + "epoch": 2.4044014756392316, + "ewc_loss": 0.0486188642680645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430943277431652e-05, + "grad_norm": 30.726863861083984, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8644137382507324, + "num_tokens": 721256357.0, + "step": 18901 + }, + { + "epoch": 2.404528685917822, + "ewc_loss": 0.0485815703868866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4290784494951367e-05, + "grad_norm": 30.892045974731445, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8709613680839539, + "num_tokens": 721292610.0, + "step": 18902 + }, + { + "epoch": 2.4046558961964126, + "ewc_loss": 0.04871825873851776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4359129383810796e-05, + "grad_norm": 30.876802444458008, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8775469064712524, + "num_tokens": 721329048.0, + "step": 18903 + }, + { + "epoch": 2.404783106475003, + "ewc_loss": 0.048552073538303375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4276036128867418e-05, + "grad_norm": 30.75758934020996, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8855372071266174, + "num_tokens": 721364249.0, + "step": 18904 + }, + { + "epoch": 2.4049103167535937, + "ewc_loss": 0.04863588139414787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4317940187756903e-05, + "grad_norm": 30.767534255981445, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.879744291305542, + "num_tokens": 721391154.0, + "step": 18905 + }, + { + "epoch": 2.405037527032184, + "ewc_loss": 0.04863153398036957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4315766495419666e-05, + "grad_norm": 30.7020263671875, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8813319206237793, + "num_tokens": 721428443.0, + "step": 18906 + }, + { + "epoch": 
2.4051647373107747, + "ewc_loss": 0.04856175184249878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4280876459670253e-05, + "grad_norm": 30.796375274658203, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8613815307617188, + "num_tokens": 721474545.0, + "step": 18907 + }, + { + "epoch": 2.4052919475893653, + "ewc_loss": 0.048627227544784546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431361463095527e-05, + "grad_norm": 30.692543029785156, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8713064193725586, + "num_tokens": 721513153.0, + "step": 18908 + }, + { + "epoch": 2.405419157867956, + "ewc_loss": 0.04860573261976242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430286622256972e-05, + "grad_norm": 30.761146545410156, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.867417573928833, + "num_tokens": 721556287.0, + "step": 18909 + }, + { + "epoch": 2.4055463681465463, + "ewc_loss": 0.04862536862492561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431268512737006e-05, + "grad_norm": 30.722761154174805, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8773835301399231, + "num_tokens": 721590219.0, + "step": 18910 + }, + { + "epoch": 2.405673578425137, + "ewc_loss": 0.048667144030332565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4333572582690977e-05, + "grad_norm": 30.732316970825195, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8698631525039673, + "num_tokens": 721627308.0, + "step": 18911 + }, + { + "epoch": 2.4058007887037274, + "ewc_loss": 0.048712387681007385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4356193534913473e-05, + "grad_norm": 30.894216537475586, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8662115335464478, + "num_tokens": 721664218.0, + "step": 18912 + }, + { + "epoch": 2.405927998982318, + "ewc_loss": 0.04864522069692612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4322609533555806e-05, + "grad_norm": 30.616838455200195, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8810811042785645, + "num_tokens": 721703511.0, + "step": 18913 + }, + { + "epoch": 2.4060552092609084, + "ewc_loss": 0.04869633540511131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434816815366503e-05, + "grad_norm": 30.85148048400879, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.883041262626648, + "num_tokens": 721741512.0, + "step": 18914 + }, + { + "epoch": 2.406182419539499, + "ewc_loss": 0.04866959899663925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4334798581548966e-05, + "grad_norm": 30.662046432495117, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8752448558807373, + "num_tokens": 721778443.0, + "step": 18915 + }, + { + "epoch": 2.4063096298180895, + "ewc_loss": 0.04864594340324402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432297151244711e-05, + "grad_norm": 30.846345901489258, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8682661056518555, + "num_tokens": 721817226.0, + "step": 18916 + }, + { + "epoch": 2.4064368400966796, + "ewc_loss": 0.04865264892578125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4326323909917846e-05, + "grad_norm": 30.692523956298828, + "learning_rate": 1e-06, + "loss": 0.4712, + "mean_token_accuracy": 0.8569430112838745, + "num_tokens": 721859419.0, + "step": 18917 + }, + { + "epoch": 2.4065640503752705, + "ewc_loss": 0.04865675047039986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4328375729965046e-05, + "grad_norm": 30.826644897460938, + "learning_rate": 1e-06, + "loss": 0.3729, + "mean_token_accuracy": 0.8844475746154785, + "num_tokens": 721895183.0, + "step": 18918 + }, + { + "epoch": 2.4066912606538606, + "ewc_loss": 0.0485740527510643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428702646284364e-05, + "grad_norm": 30.72102165222168, + "learning_rate": 1e-06, + "loss": 0.4567, + 
"mean_token_accuracy": 0.8570104241371155, + "num_tokens": 721933566.0, + "step": 18919 + }, + { + "epoch": 2.406818470932451, + "ewc_loss": 0.048571497201919556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428574771329295e-05, + "grad_norm": 30.888437271118164, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8745934963226318, + "num_tokens": 721968827.0, + "step": 18920 + }, + { + "epoch": 2.4069456812110417, + "ewc_loss": 0.04869014769792557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4345074052689597e-05, + "grad_norm": 30.86030387878418, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8742112517356873, + "num_tokens": 722004989.0, + "step": 18921 + }, + { + "epoch": 2.407072891489632, + "ewc_loss": 0.04859713464975357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.429856795060914e-05, + "grad_norm": 30.8553409576416, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8744732141494751, + "num_tokens": 722043681.0, + "step": 18922 + }, + { + "epoch": 2.4072001017682227, + "ewc_loss": 0.048539210110902786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426960418233648e-05, + "grad_norm": 30.682485580444336, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8712299466133118, + "num_tokens": 722086541.0, + "step": 18923 + }, + { + "epoch": 2.4073273120468133, + "ewc_loss": 0.04851597547531128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4257988116005436e-05, + "grad_norm": 30.766645431518555, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8713712692260742, + "num_tokens": 722125519.0, + "step": 18924 + }, + { + "epoch": 2.407454522325404, + "ewc_loss": 0.048525385558605194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4262692022603005e-05, + "grad_norm": 30.768659591674805, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8663442134857178, + "num_tokens": 722157665.0, + "step": 18925 + }, + { + "epoch": 
2.4075817326039943, + "ewc_loss": 0.048548389226198196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427419531159103e-05, + "grad_norm": 30.904296875, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8833012580871582, + "num_tokens": 722191819.0, + "step": 18926 + }, + { + "epoch": 2.407708942882585, + "ewc_loss": 0.04852285981178284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4261429643956944e-05, + "grad_norm": 30.80971336364746, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8604812622070312, + "num_tokens": 722229755.0, + "step": 18927 + }, + { + "epoch": 2.4078361531611754, + "ewc_loss": 0.04850827157497406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4254135496448725e-05, + "grad_norm": 30.869478225708008, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8612765073776245, + "num_tokens": 722267440.0, + "step": 18928 + }, + { + "epoch": 2.407963363439766, + "ewc_loss": 0.04855737090110779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427868457743898e-05, + "grad_norm": 30.82538414001465, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8672063946723938, + "num_tokens": 722310102.0, + "step": 18929 + }, + { + "epoch": 2.4080905737183564, + "ewc_loss": 0.048500221222639084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425011007289868e-05, + "grad_norm": 30.858417510986328, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8646094799041748, + "num_tokens": 722346713.0, + "step": 18930 + }, + { + "epoch": 2.408217783996947, + "ewc_loss": 0.04855172336101532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4275861505884677e-05, + "grad_norm": 30.753711700439453, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.866047739982605, + "num_tokens": 722387803.0, + "step": 18931 + }, + { + "epoch": 2.4083449942755375, + "ewc_loss": 0.048547033220529556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4273516828543507e-05, + "grad_norm": 30.938312530517578, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8625639081001282, + "num_tokens": 722428943.0, + "step": 18932 + }, + { + "epoch": 2.408472204554128, + "ewc_loss": 0.0485631562769413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4281578589580022e-05, + "grad_norm": 30.88389015197754, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8645325899124146, + "num_tokens": 722467860.0, + "step": 18933 + }, + { + "epoch": 2.4085994148327186, + "ewc_loss": 0.04845856502652168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.422928264422808e-05, + "grad_norm": 30.78015899658203, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.867353618144989, + "num_tokens": 722503067.0, + "step": 18934 + }, + { + "epoch": 2.408726625111309, + "ewc_loss": 0.048507947474718094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425397360639181e-05, + "grad_norm": 30.79283332824707, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8737985491752625, + "num_tokens": 722537842.0, + "step": 18935 + }, + { + "epoch": 2.4088538353898996, + "ewc_loss": 0.04849084094166756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4245420718216337e-05, + "grad_norm": 30.660934448242188, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8777547478675842, + "num_tokens": 722574036.0, + "step": 18936 + }, + { + "epoch": 2.40898104566849, + "ewc_loss": 0.048527855426073074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4263927116408013e-05, + "grad_norm": 30.794281005859375, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8718265295028687, + "num_tokens": 722606087.0, + "step": 18937 + }, + { + "epoch": 2.4091082559470807, + "ewc_loss": 0.04857965186238289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.42898258875357e-05, + "grad_norm": 30.673749923706055, + "learning_rate": 1e-06, + "loss": 0.4257, + 
"mean_token_accuracy": 0.8706283569335938, + "num_tokens": 722648745.0, + "step": 18938 + }, + { + "epoch": 2.409235466225671, + "ewc_loss": 0.048478659242391586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4239328922703862e-05, + "grad_norm": 30.685277938842773, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8618292212486267, + "num_tokens": 722688589.0, + "step": 18939 + }, + { + "epoch": 2.4093626765042617, + "ewc_loss": 0.04860273376107216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4301367375301197e-05, + "grad_norm": 30.6627197265625, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.888907253742218, + "num_tokens": 722723941.0, + "step": 18940 + }, + { + "epoch": 2.4094898867828523, + "ewc_loss": 0.048640038818120956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4320019292645156e-05, + "grad_norm": 30.775625228881836, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8540311455726624, + "num_tokens": 722768242.0, + "step": 18941 + }, + { + "epoch": 2.4096170970614423, + "ewc_loss": 0.0486675500869751, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.433377449051477e-05, + "grad_norm": 30.77347183227539, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8714366555213928, + "num_tokens": 722800970.0, + "step": 18942 + }, + { + "epoch": 2.4097443073400333, + "ewc_loss": 0.048635341227054596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431767097732518e-05, + "grad_norm": 30.678998947143555, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8618245720863342, + "num_tokens": 722839168.0, + "step": 18943 + }, + { + "epoch": 2.4098715176186234, + "ewc_loss": 0.048669472336769104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4334736735909246e-05, + "grad_norm": 30.6668643951416, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.881949782371521, + "num_tokens": 722883451.0, + "step": 18944 + }, + { + "epoch": 
2.409998727897214, + "ewc_loss": 0.04870813339948654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4354067136300728e-05, + "grad_norm": 30.810352325439453, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8726958632469177, + "num_tokens": 722928581.0, + "step": 18945 + }, + { + "epoch": 2.4101259381758044, + "ewc_loss": 0.04861339554190636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4306697014253587e-05, + "grad_norm": 30.77361297607422, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8818073868751526, + "num_tokens": 722967141.0, + "step": 18946 + }, + { + "epoch": 2.410253148454395, + "ewc_loss": 0.04869496822357178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4347484213649295e-05, + "grad_norm": 30.807003021240234, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.86808842420578, + "num_tokens": 723006263.0, + "step": 18947 + }, + { + "epoch": 2.4103803587329855, + "ewc_loss": 0.048627402633428574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431370194244664e-05, + "grad_norm": 30.779401779174805, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8743915557861328, + "num_tokens": 723050843.0, + "step": 18948 + }, + { + "epoch": 2.410507569011576, + "ewc_loss": 0.048600971698760986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4300485165440477e-05, + "grad_norm": 30.86887550354004, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8717601299285889, + "num_tokens": 723092124.0, + "step": 18949 + }, + { + "epoch": 2.4106347792901666, + "ewc_loss": 0.048579175025224686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4289587599923834e-05, + "grad_norm": 30.822166442871094, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8819595575332642, + "num_tokens": 723122478.0, + "step": 18950 + }, + { + "epoch": 2.410761989568757, + "ewc_loss": 0.04863809421658516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4319047952303663e-05, + "grad_norm": 30.84349822998047, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8650766015052795, + "num_tokens": 723163054.0, + "step": 18951 + }, + { + "epoch": 2.4108891998473476, + "ewc_loss": 0.04846695065498352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423347541480325e-05, + "grad_norm": 30.82683563232422, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8715885877609253, + "num_tokens": 723201518.0, + "step": 18952 + }, + { + "epoch": 2.411016410125938, + "ewc_loss": 0.048582203686237335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4291100999107584e-05, + "grad_norm": 30.83987808227539, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8724421858787537, + "num_tokens": 723235685.0, + "step": 18953 + }, + { + "epoch": 2.4111436204045287, + "ewc_loss": 0.04850533604621887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4252667572000064e-05, + "grad_norm": 30.931894302368164, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8716023564338684, + "num_tokens": 723266510.0, + "step": 18954 + }, + { + "epoch": 2.411270830683119, + "ewc_loss": 0.04852898418903351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4264492822112516e-05, + "grad_norm": 30.810203552246094, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8735094666481018, + "num_tokens": 723302800.0, + "step": 18955 + }, + { + "epoch": 2.4113980409617097, + "ewc_loss": 0.04851687327027321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425843740638811e-05, + "grad_norm": 30.814687728881836, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8734080791473389, + "num_tokens": 723340669.0, + "step": 18956 + }, + { + "epoch": 2.4115252512403003, + "ewc_loss": 0.048527806997299194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4263903469545767e-05, + "grad_norm": 30.808792114257812, + "learning_rate": 1e-06, + "loss": 0.4027, + 
"mean_token_accuracy": 0.8736612796783447, + "num_tokens": 723377556.0, + "step": 18957 + }, + { + "epoch": 2.411652461518891, + "ewc_loss": 0.04853645712137222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.42682290263474e-05, + "grad_norm": 30.754932403564453, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8755906820297241, + "num_tokens": 723410942.0, + "step": 18958 + }, + { + "epoch": 2.4117796717974813, + "ewc_loss": 0.04848862439393997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4244312953669578e-05, + "grad_norm": 30.721757888793945, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8844951391220093, + "num_tokens": 723451131.0, + "step": 18959 + }, + { + "epoch": 2.411906882076072, + "ewc_loss": 0.04871826991438866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4359134840779006e-05, + "grad_norm": 30.858970642089844, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.875020444393158, + "num_tokens": 723485876.0, + "step": 18960 + }, + { + "epoch": 2.4120340923546624, + "ewc_loss": 0.0485847145318985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4292357920785435e-05, + "grad_norm": 30.748462677001953, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8789814114570618, + "num_tokens": 723521252.0, + "step": 18961 + }, + { + "epoch": 2.412161302633253, + "ewc_loss": 0.048543356359004974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4271677830256522e-05, + "grad_norm": 30.793987274169922, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8557881116867065, + "num_tokens": 723561094.0, + "step": 18962 + }, + { + "epoch": 2.4122885129118434, + "ewc_loss": 0.048715099692344666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4357550501008518e-05, + "grad_norm": 30.894371032714844, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8705347776412964, + "num_tokens": 723597281.0, + "step": 18963 + }, + { + "epoch": 
2.412415723190434, + "ewc_loss": 0.04857644438743591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428822153888177e-05, + "grad_norm": 30.788970947265625, + "learning_rate": 1e-06, + "loss": 0.3454, + "mean_token_accuracy": 0.8951244950294495, + "num_tokens": 723637690.0, + "step": 18964 + }, + { + "epoch": 2.4125429334690245, + "ewc_loss": 0.0486663319170475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4333165129064582e-05, + "grad_norm": 30.810840606689453, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.871884286403656, + "num_tokens": 723676250.0, + "step": 18965 + }, + { + "epoch": 2.412670143747615, + "ewc_loss": 0.0486593060195446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4329652660526335e-05, + "grad_norm": 30.85516929626465, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8803560137748718, + "num_tokens": 723713157.0, + "step": 18966 + }, + { + "epoch": 2.412797354026205, + "ewc_loss": 0.048674359917640686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4337179638678208e-05, + "grad_norm": 30.855712890625, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8761932253837585, + "num_tokens": 723745877.0, + "step": 18967 + }, + { + "epoch": 2.412924564304796, + "ewc_loss": 0.048609938472509384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4304968974320218e-05, + "grad_norm": 30.752092361450195, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8732419013977051, + "num_tokens": 723784285.0, + "step": 18968 + }, + { + "epoch": 2.413051774583386, + "ewc_loss": 0.04854239150881767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4271195798064582e-05, + "grad_norm": 30.84461784362793, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8598792552947998, + "num_tokens": 723817085.0, + "step": 18969 + }, + { + "epoch": 2.4131789848619767, + "ewc_loss": 0.04864507541060448, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4322538592969067e-05, + "grad_norm": 30.793630599975586, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8670377731323242, + "num_tokens": 723858466.0, + "step": 18970 + }, + { + "epoch": 2.413306195140567, + "ewc_loss": 0.04860341176390648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4301705707330257e-05, + "grad_norm": 30.73590850830078, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.871938943862915, + "num_tokens": 723896627.0, + "step": 18971 + }, + { + "epoch": 2.4134334054191577, + "ewc_loss": 0.04861491918563843, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4307459170813672e-05, + "grad_norm": 30.779062271118164, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8663953542709351, + "num_tokens": 723930727.0, + "step": 18972 + }, + { + "epoch": 2.4135606156977483, + "ewc_loss": 0.04868901148438454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434450652799569e-05, + "grad_norm": 30.900653839111328, + "learning_rate": 1e-06, + "loss": 0.4569, + "mean_token_accuracy": 0.8604596853256226, + "num_tokens": 723965996.0, + "step": 18973 + }, + { + "epoch": 2.413687825976339, + "ewc_loss": 0.04872732236981392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.436366048641503e-05, + "grad_norm": 30.806161880493164, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8671875, + "num_tokens": 724002441.0, + "step": 18974 + }, + { + "epoch": 2.4138150362549293, + "ewc_loss": 0.04867497831583023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.433748886687681e-05, + "grad_norm": 30.752851486206055, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8763330578804016, + "num_tokens": 724039238.0, + "step": 18975 + }, + { + "epoch": 2.41394224653352, + "ewc_loss": 0.04873361811041832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4366809157072566e-05, + "grad_norm": 30.89132308959961, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 
0.8772827386856079, + "num_tokens": 724077430.0, + "step": 18976 + }, + { + "epoch": 2.4140694568121104, + "ewc_loss": 0.048697732388973236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434886664559599e-05, + "grad_norm": 30.65973472595215, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8690793514251709, + "num_tokens": 724113175.0, + "step": 18977 + }, + { + "epoch": 2.414196667090701, + "ewc_loss": 0.04867999255657196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4339995434274897e-05, + "grad_norm": 30.846206665039062, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8660224080085754, + "num_tokens": 724156409.0, + "step": 18978 + }, + { + "epoch": 2.4143238773692914, + "ewc_loss": 0.0487707257270813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4385362848988734e-05, + "grad_norm": 30.75334358215332, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8717821836471558, + "num_tokens": 724199879.0, + "step": 18979 + }, + { + "epoch": 2.414451087647882, + "ewc_loss": 0.04871159791946411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4355798814212903e-05, + "grad_norm": 30.828569412231445, + "learning_rate": 1e-06, + "loss": 0.3714, + "mean_token_accuracy": 0.8849239349365234, + "num_tokens": 724238065.0, + "step": 18980 + }, + { + "epoch": 2.4145782979264725, + "ewc_loss": 0.04880700632929802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4403503630310297e-05, + "grad_norm": 30.903928756713867, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8638676404953003, + "num_tokens": 724272336.0, + "step": 18981 + }, + { + "epoch": 2.414705508205063, + "ewc_loss": 0.048737287521362305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4368644517380744e-05, + "grad_norm": 30.707027435302734, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8620177507400513, + "num_tokens": 724305200.0, + "step": 18982 + }, + { + "epoch": 2.4148327184836536, + 
"ewc_loss": 0.04872550815343857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4362754629692063e-05, + "grad_norm": 30.86265754699707, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.876366913318634, + "num_tokens": 724341295.0, + "step": 18983 + }, + { + "epoch": 2.414959928762244, + "ewc_loss": 0.0487753227353096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4387662051594816e-05, + "grad_norm": 30.776927947998047, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8739917278289795, + "num_tokens": 724384336.0, + "step": 18984 + }, + { + "epoch": 2.4150871390408346, + "ewc_loss": 0.04877037927508354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4385190044995397e-05, + "grad_norm": 30.85445213317871, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8670852780342102, + "num_tokens": 724417279.0, + "step": 18985 + }, + { + "epoch": 2.415214349319425, + "ewc_loss": 0.048643168061971664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4321583623532206e-05, + "grad_norm": 30.831707000732422, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8708249926567078, + "num_tokens": 724456509.0, + "step": 18986 + }, + { + "epoch": 2.4153415595980157, + "ewc_loss": 0.048788152635097504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.439407580823172e-05, + "grad_norm": 30.818693161010742, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8826507329940796, + "num_tokens": 724498812.0, + "step": 18987 + }, + { + "epoch": 2.415468769876606, + "ewc_loss": 0.0487196259200573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.435981332382653e-05, + "grad_norm": 30.908790588378906, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8602229356765747, + "num_tokens": 724538174.0, + "step": 18988 + }, + { + "epoch": 2.4155959801551967, + "ewc_loss": 0.0486765094101429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4338254661415704e-05, + "grad_norm": 
30.72702980041504, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8768213987350464, + "num_tokens": 724577171.0, + "step": 18989 + }, + { + "epoch": 2.415723190433787, + "ewc_loss": 0.04875259846448898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4376298824790865e-05, + "grad_norm": 30.937162399291992, + "learning_rate": 1e-06, + "loss": 0.3528, + "mean_token_accuracy": 0.8929005861282349, + "num_tokens": 724610626.0, + "step": 18990 + }, + { + "epoch": 2.4158504007123778, + "ewc_loss": 0.04873128980398178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4365645003854297e-05, + "grad_norm": 30.828981399536133, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8840643167495728, + "num_tokens": 724647613.0, + "step": 18991 + }, + { + "epoch": 2.415977610990968, + "ewc_loss": 0.04864559695124626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4322798708453774e-05, + "grad_norm": 30.85624122619629, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8738171458244324, + "num_tokens": 724684898.0, + "step": 18992 + }, + { + "epoch": 2.4161048212695584, + "ewc_loss": 0.048707086592912674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4353543267352507e-05, + "grad_norm": 30.871845245361328, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8671079874038696, + "num_tokens": 724725566.0, + "step": 18993 + }, + { + "epoch": 2.416232031548149, + "ewc_loss": 0.048689693212509155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4344846679014154e-05, + "grad_norm": 30.990379333496094, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8625860214233398, + "num_tokens": 724763701.0, + "step": 18994 + }, + { + "epoch": 2.4163592418267394, + "ewc_loss": 0.048627354204654694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4313676476594992e-05, + "grad_norm": 30.769493103027344, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8713589310646057, + 
"num_tokens": 724798108.0, + "step": 18995 + }, + { + "epoch": 2.41648645210533, + "ewc_loss": 0.04861089959740639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4305449187522754e-05, + "grad_norm": 30.919954299926758, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8504430651664734, + "num_tokens": 724842430.0, + "step": 18996 + }, + { + "epoch": 2.4166136623839205, + "ewc_loss": 0.048691749572753906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4345874408027157e-05, + "grad_norm": 30.73942756652832, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8671119213104248, + "num_tokens": 724884585.0, + "step": 18997 + }, + { + "epoch": 2.416740872662511, + "ewc_loss": 0.048549458384513855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427473009447567e-05, + "grad_norm": 30.976160049438477, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8824275135993958, + "num_tokens": 724919338.0, + "step": 18998 + }, + { + "epoch": 2.4168680829411016, + "ewc_loss": 0.048768337815999985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4384169591940008e-05, + "grad_norm": 30.8908748626709, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8928791284561157, + "num_tokens": 724956007.0, + "step": 18999 + }, + { + "epoch": 2.416995293219692, + "ewc_loss": 0.04847926273941994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423963087494485e-05, + "grad_norm": 30.866092681884766, + "learning_rate": 1e-06, + "loss": 0.3689, + "mean_token_accuracy": 0.8868503570556641, + "num_tokens": 724996994.0, + "step": 19000 + }, + { + "epoch": 2.4171225034982826, + "ewc_loss": 0.048540279269218445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4270138965221122e-05, + "grad_norm": 30.806718826293945, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8655734062194824, + "num_tokens": 725035607.0, + "step": 19001 + }, + { + "epoch": 2.417249713776873, + "ewc_loss": 
0.04854229837656021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427114850434009e-05, + "grad_norm": 30.905820846557617, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8743689060211182, + "num_tokens": 725075370.0, + "step": 19002 + }, + { + "epoch": 2.4173769240554637, + "ewc_loss": 0.048622503876686096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4311251763720065e-05, + "grad_norm": 30.865135192871094, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8701053857803345, + "num_tokens": 725119536.0, + "step": 19003 + }, + { + "epoch": 2.417504134334054, + "ewc_loss": 0.04850868135690689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4254341042251326e-05, + "grad_norm": 30.860549926757812, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8715777397155762, + "num_tokens": 725158048.0, + "step": 19004 + }, + { + "epoch": 2.4176313446126447, + "ewc_loss": 0.04850735515356064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4253677111119032e-05, + "grad_norm": 30.755977630615234, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8809341192245483, + "num_tokens": 725196343.0, + "step": 19005 + }, + { + "epoch": 2.4177585548912353, + "ewc_loss": 0.048498064279556274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4249031412182376e-05, + "grad_norm": 30.89188575744629, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8734164237976074, + "num_tokens": 725234613.0, + "step": 19006 + }, + { + "epoch": 2.417885765169826, + "ewc_loss": 0.048558540642261505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4279270292026922e-05, + "grad_norm": 30.9469051361084, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8703933358192444, + "num_tokens": 725272129.0, + "step": 19007 + }, + { + "epoch": 2.4180129754484163, + "ewc_loss": 0.04850943014025688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4254715754068457e-05, + "grad_norm": 
30.80449676513672, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8620021343231201, + "num_tokens": 725305740.0, + "step": 19008 + }, + { + "epoch": 2.418140185727007, + "ewc_loss": 0.04856548458337784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428274274279829e-05, + "grad_norm": 30.958587646484375, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.868360698223114, + "num_tokens": 725340579.0, + "step": 19009 + }, + { + "epoch": 2.4182673960055974, + "ewc_loss": 0.04852316901087761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4261584258056246e-05, + "grad_norm": 30.799604415893555, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8747580051422119, + "num_tokens": 725380492.0, + "step": 19010 + }, + { + "epoch": 2.418394606284188, + "ewc_loss": 0.04853075370192528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426537685096264e-05, + "grad_norm": 31.04643440246582, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.87395179271698, + "num_tokens": 725418043.0, + "step": 19011 + }, + { + "epoch": 2.4185218165627784, + "ewc_loss": 0.04854247719049454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4271239453810267e-05, + "grad_norm": 30.845033645629883, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8636684417724609, + "num_tokens": 725456260.0, + "step": 19012 + }, + { + "epoch": 2.418649026841369, + "ewc_loss": 0.04842348024249077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4211740310420282e-05, + "grad_norm": 30.848974227905273, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8698872923851013, + "num_tokens": 725496605.0, + "step": 19013 + }, + { + "epoch": 2.4187762371199595, + "ewc_loss": 0.04851950332522392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4259752535726875e-05, + "grad_norm": 30.79584503173828, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8744850158691406, + 
"num_tokens": 725532395.0, + "step": 19014 + }, + { + "epoch": 2.4189034473985496, + "ewc_loss": 0.04846622794866562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4233113435911946e-05, + "grad_norm": 30.918102264404297, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8667031526565552, + "num_tokens": 725575335.0, + "step": 19015 + }, + { + "epoch": 2.4190306576771405, + "ewc_loss": 0.04857320711016655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428660445730202e-05, + "grad_norm": 30.905698776245117, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.878553032875061, + "num_tokens": 725612345.0, + "step": 19016 + }, + { + "epoch": 2.4191578679557306, + "ewc_loss": 0.04848112538456917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4240562197519466e-05, + "grad_norm": 30.836475372314453, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8750919699668884, + "num_tokens": 725645566.0, + "step": 19017 + }, + { + "epoch": 2.419285078234321, + "ewc_loss": 0.04852187633514404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4260938516817987e-05, + "grad_norm": 30.92157745361328, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8660368919372559, + "num_tokens": 725684218.0, + "step": 19018 + }, + { + "epoch": 2.4194122885129117, + "ewc_loss": 0.048590101301670074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4295050025102682e-05, + "grad_norm": 30.895015716552734, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8750841021537781, + "num_tokens": 725718478.0, + "step": 19019 + }, + { + "epoch": 2.419539498791502, + "ewc_loss": 0.04848761856555939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4243809093604796e-05, + "grad_norm": 30.973068237304688, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.864159107208252, + "num_tokens": 725756208.0, + "step": 19020 + }, + { + "epoch": 2.4196667090700927, + "ewc_loss": 
0.04846905916929245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4234530428657308e-05, + "grad_norm": 30.83357810974121, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8658447265625, + "num_tokens": 725791884.0, + "step": 19021 + }, + { + "epoch": 2.4197939193486833, + "ewc_loss": 0.04852169379591942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426084756734781e-05, + "grad_norm": 30.97183609008789, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8754839897155762, + "num_tokens": 725829138.0, + "step": 19022 + }, + { + "epoch": 2.419921129627274, + "ewc_loss": 0.04853197559714317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426598803140223e-05, + "grad_norm": 30.916759490966797, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8725507259368896, + "num_tokens": 725868301.0, + "step": 19023 + }, + { + "epoch": 2.4200483399058643, + "ewc_loss": 0.04846895858645439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.423447949695401e-05, + "grad_norm": 30.864856719970703, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8615082502365112, + "num_tokens": 725906186.0, + "step": 19024 + }, + { + "epoch": 2.420175550184455, + "ewc_loss": 0.04849226400256157, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4246131943073124e-05, + "grad_norm": 30.844438552856445, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8597707748413086, + "num_tokens": 725947006.0, + "step": 19025 + }, + { + "epoch": 2.4203027604630454, + "ewc_loss": 0.04858054965734482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4290275177918375e-05, + "grad_norm": 30.899465560913086, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8758865594863892, + "num_tokens": 725987363.0, + "step": 19026 + }, + { + "epoch": 2.420429970741636, + "ewc_loss": 0.048532839864492416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426641913189087e-05, + "grad_norm": 
30.855255126953125, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8629666566848755, + "num_tokens": 726026176.0, + "step": 19027 + }, + { + "epoch": 2.4205571810202264, + "ewc_loss": 0.04850677400827408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4253386072814465e-05, + "grad_norm": 30.789173126220703, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8704668283462524, + "num_tokens": 726068124.0, + "step": 19028 + }, + { + "epoch": 2.420684391298817, + "ewc_loss": 0.048563431948423386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.428171683277469e-05, + "grad_norm": 30.86716651916504, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8627450466156006, + "num_tokens": 726110270.0, + "step": 19029 + }, + { + "epoch": 2.4208116015774075, + "ewc_loss": 0.04866814240813255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4334070985787548e-05, + "grad_norm": 30.942564010620117, + "learning_rate": 1e-06, + "loss": 0.366, + "mean_token_accuracy": 0.8872637152671814, + "num_tokens": 726143908.0, + "step": 19030 + }, + { + "epoch": 2.420938811855998, + "ewc_loss": 0.0485408678650856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4270433641504496e-05, + "grad_norm": 30.797080993652344, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8699851632118225, + "num_tokens": 726179082.0, + "step": 19031 + }, + { + "epoch": 2.4210660221345885, + "ewc_loss": 0.04858670383691788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.429335108899977e-05, + "grad_norm": 30.842702865600586, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8645440340042114, + "num_tokens": 726216387.0, + "step": 19032 + }, + { + "epoch": 2.421193232413179, + "ewc_loss": 0.048616133630275726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4308066713274457e-05, + "grad_norm": 30.815149307250977, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8806027173995972, + 
"num_tokens": 726258831.0, + "step": 19033 + }, + { + "epoch": 2.4213204426917696, + "ewc_loss": 0.048529189079999924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4264594685519114e-05, + "grad_norm": 30.751779556274414, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8734922409057617, + "num_tokens": 726301929.0, + "step": 19034 + }, + { + "epoch": 2.42144765297036, + "ewc_loss": 0.04861496388912201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430748281767592e-05, + "grad_norm": 30.87394905090332, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8748243451118469, + "num_tokens": 726339938.0, + "step": 19035 + }, + { + "epoch": 2.4215748632489507, + "ewc_loss": 0.048561178147792816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4280589059344493e-05, + "grad_norm": 30.876462936401367, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8735079169273376, + "num_tokens": 726377197.0, + "step": 19036 + }, + { + "epoch": 2.421702073527541, + "ewc_loss": 0.048554301261901855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427715116937179e-05, + "grad_norm": 30.89168357849121, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8749647736549377, + "num_tokens": 726417591.0, + "step": 19037 + }, + { + "epoch": 2.4218292838061317, + "ewc_loss": 0.04860766604542732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4303832105943002e-05, + "grad_norm": 30.92872428894043, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8617116808891296, + "num_tokens": 726456094.0, + "step": 19038 + }, + { + "epoch": 2.4219564940847222, + "ewc_loss": 0.04855583608150482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4277918782900088e-05, + "grad_norm": 30.85225486755371, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8803648948669434, + "num_tokens": 726491158.0, + "step": 19039 + }, + { + "epoch": 2.4220837043633123, + "ewc_loss": 
0.04850555583834648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425277853035368e-05, + "grad_norm": 30.812772750854492, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8579526543617249, + "num_tokens": 726531334.0, + "step": 19040 + }, + { + "epoch": 2.4222109146419033, + "ewc_loss": 0.04858921840786934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4294609829667024e-05, + "grad_norm": 30.83779525756836, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8611059188842773, + "num_tokens": 726571999.0, + "step": 19041 + }, + { + "epoch": 2.4223381249204934, + "ewc_loss": 0.048648346215486526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4324173864442855e-05, + "grad_norm": 30.99078941345215, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8629409074783325, + "num_tokens": 726604945.0, + "step": 19042 + }, + { + "epoch": 2.422465335199084, + "ewc_loss": 0.04866187646985054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4330938686034642e-05, + "grad_norm": 30.809728622436523, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8632224798202515, + "num_tokens": 726645023.0, + "step": 19043 + }, + { + "epoch": 2.4225925454776744, + "ewc_loss": 0.04858243092894554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4291215595440008e-05, + "grad_norm": 30.998674392700195, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8708091974258423, + "num_tokens": 726684297.0, + "step": 19044 + }, + { + "epoch": 2.422719755756265, + "ewc_loss": 0.048619262874126434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4309631044161506e-05, + "grad_norm": 30.82819938659668, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8598495125770569, + "num_tokens": 726726950.0, + "step": 19045 + }, + { + "epoch": 2.4228469660348555, + "ewc_loss": 0.04858517646789551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4292588932439685e-05, + "grad_norm": 
30.97407341003418, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8525097370147705, + "num_tokens": 726761944.0, + "step": 19046 + }, + { + "epoch": 2.422974176313446, + "ewc_loss": 0.0486331433057785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4316572307725437e-05, + "grad_norm": 30.78898811340332, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8805437684059143, + "num_tokens": 726799041.0, + "step": 19047 + }, + { + "epoch": 2.4231013865920366, + "ewc_loss": 0.04858876392245293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.429438245599158e-05, + "grad_norm": 30.80955696105957, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8646475672721863, + "num_tokens": 726838160.0, + "step": 19048 + }, + { + "epoch": 2.423228596870627, + "ewc_loss": 0.0486857071518898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4342853066627868e-05, + "grad_norm": 30.853288650512695, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8815313577651978, + "num_tokens": 726873150.0, + "step": 19049 + }, + { + "epoch": 2.4233558071492176, + "ewc_loss": 0.048646848648786545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4323424440808594e-05, + "grad_norm": 30.735790252685547, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8761024475097656, + "num_tokens": 726905253.0, + "step": 19050 + }, + { + "epoch": 2.423483017427808, + "ewc_loss": 0.04862089082598686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4310445951414295e-05, + "grad_norm": 30.884262084960938, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8821535110473633, + "num_tokens": 726942842.0, + "step": 19051 + }, + { + "epoch": 2.4236102277063987, + "ewc_loss": 0.04881985858082771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4409930119873025e-05, + "grad_norm": 30.891666412353516, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8752154111862183, + 
"num_tokens": 726987974.0, + "step": 19052 + }, + { + "epoch": 2.423737437984989, + "ewc_loss": 0.048627860844135284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431393113511149e-05, + "grad_norm": 30.729888916015625, + "learning_rate": 1e-06, + "loss": 0.364, + "mean_token_accuracy": 0.8899080753326416, + "num_tokens": 727021614.0, + "step": 19053 + }, + { + "epoch": 2.4238646482635797, + "ewc_loss": 0.04859975352883339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4299877622979693e-05, + "grad_norm": 30.89887046813965, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8692483305931091, + "num_tokens": 727061773.0, + "step": 19054 + }, + { + "epoch": 2.4239918585421703, + "ewc_loss": 0.048745229840278625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4372615371248685e-05, + "grad_norm": 30.928844451904297, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8662943840026855, + "num_tokens": 727095877.0, + "step": 19055 + }, + { + "epoch": 2.424119068820761, + "ewc_loss": 0.048613931983709335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430696622468531e-05, + "grad_norm": 30.839956283569336, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8733012080192566, + "num_tokens": 727128676.0, + "step": 19056 + }, + { + "epoch": 2.4242462790993513, + "ewc_loss": 0.04873969033360481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4369845050387084e-05, + "grad_norm": 31.015962600708008, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8802708387374878, + "num_tokens": 727164528.0, + "step": 19057 + }, + { + "epoch": 2.424373489377942, + "ewc_loss": 0.04862048476934433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4310242224601097e-05, + "grad_norm": 30.838207244873047, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8779241442680359, + "num_tokens": 727202403.0, + "step": 19058 + }, + { + "epoch": 2.4245006996565324, + "ewc_loss": 
0.04860522225499153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4302611564053223e-05, + "grad_norm": 30.90870475769043, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8743000030517578, + "num_tokens": 727242231.0, + "step": 19059 + }, + { + "epoch": 2.424627909935123, + "ewc_loss": 0.04865927994251251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432963992760051e-05, + "grad_norm": 30.901182174682617, + "learning_rate": 1e-06, + "loss": 0.4603, + "mean_token_accuracy": 0.8577749133110046, + "num_tokens": 727276371.0, + "step": 19060 + }, + { + "epoch": 2.4247551202137134, + "ewc_loss": 0.04854106903076172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.427053368592169e-05, + "grad_norm": 30.818490982055664, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8752025365829468, + "num_tokens": 727320101.0, + "step": 19061 + }, + { + "epoch": 2.424882330492304, + "ewc_loss": 0.04862610250711441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4313050744240172e-05, + "grad_norm": 30.884521484375, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8630434274673462, + "num_tokens": 727363687.0, + "step": 19062 + }, + { + "epoch": 2.4250095407708945, + "ewc_loss": 0.04866807907819748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4334040062967688e-05, + "grad_norm": 30.84803009033203, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8770244717597961, + "num_tokens": 727399772.0, + "step": 19063 + }, + { + "epoch": 2.425136751049485, + "ewc_loss": 0.04862123727798462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431061875540763e-05, + "grad_norm": 30.925512313842773, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8802309632301331, + "num_tokens": 727442944.0, + "step": 19064 + }, + { + "epoch": 2.425263961328075, + "ewc_loss": 0.04872790351510048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4363951524719596e-05, + "grad_norm": 
30.927871704101562, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8800131678581238, + "num_tokens": 727483283.0, + "step": 19065 + }, + { + "epoch": 2.425391171606666, + "ewc_loss": 0.04858879745006561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4294398826896213e-05, + "grad_norm": 30.891826629638672, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8702480792999268, + "num_tokens": 727526316.0, + "step": 19066 + }, + { + "epoch": 2.425518381885256, + "ewc_loss": 0.048615097999572754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4307548301294446e-05, + "grad_norm": 30.724754333496094, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8785213828086853, + "num_tokens": 727563218.0, + "step": 19067 + }, + { + "epoch": 2.4256455921638467, + "ewc_loss": 0.048605743795633316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430287167953793e-05, + "grad_norm": 30.833715438842773, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8841674327850342, + "num_tokens": 727594877.0, + "step": 19068 + }, + { + "epoch": 2.425772802442437, + "ewc_loss": 0.048731692135334015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.436584691167809e-05, + "grad_norm": 30.805002212524414, + "learning_rate": 1e-06, + "loss": 0.3755, + "mean_token_accuracy": 0.8847677111625671, + "num_tokens": 727627561.0, + "step": 19069 + }, + { + "epoch": 2.4259000127210277, + "ewc_loss": 0.04865489900112152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432744986435864e-05, + "grad_norm": 30.894119262695312, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8642427921295166, + "num_tokens": 727664384.0, + "step": 19070 + }, + { + "epoch": 2.4260272229996183, + "ewc_loss": 0.0486377589404583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4318878786289133e-05, + "grad_norm": 30.8137149810791, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8681317567825317, + 
"num_tokens": 727700866.0, + "step": 19071 + }, + { + "epoch": 2.426154433278209, + "ewc_loss": 0.04862234368920326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431117172818631e-05, + "grad_norm": 30.923559188842773, + "learning_rate": 1e-06, + "loss": 0.37, + "mean_token_accuracy": 0.8894956111907959, + "num_tokens": 727741539.0, + "step": 19072 + }, + { + "epoch": 2.4262816435567993, + "ewc_loss": 0.04868748411536217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.43437425524462e-05, + "grad_norm": 30.891918182373047, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8716549873352051, + "num_tokens": 727777828.0, + "step": 19073 + }, + { + "epoch": 2.42640885383539, + "ewc_loss": 0.04860350489616394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430175300105475e-05, + "grad_norm": 30.8848819732666, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8797705769538879, + "num_tokens": 727818979.0, + "step": 19074 + }, + { + "epoch": 2.4265360641139804, + "ewc_loss": 0.04874112457036972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.437056173221208e-05, + "grad_norm": 30.815507888793945, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8714947700500488, + "num_tokens": 727856440.0, + "step": 19075 + }, + { + "epoch": 2.426663274392571, + "ewc_loss": 0.048594266176223755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4297132767969742e-05, + "grad_norm": 30.78297233581543, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8806763887405396, + "num_tokens": 727895036.0, + "step": 19076 + }, + { + "epoch": 2.4267904846711614, + "ewc_loss": 0.048742882907390594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4371442123083398e-05, + "grad_norm": 30.844493865966797, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8614113330841064, + "num_tokens": 727932292.0, + "step": 19077 + }, + { + "epoch": 2.426917694949752, + "ewc_loss": 0.04875428229570389, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4377141016884707e-05, + "grad_norm": 30.902481079101562, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8723008632659912, + "num_tokens": 727967005.0, + "step": 19078 + }, + { + "epoch": 2.4270449052283425, + "ewc_loss": 0.048697199672460556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434859925415367e-05, + "grad_norm": 30.815616607666016, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8625824451446533, + "num_tokens": 728004262.0, + "step": 19079 + }, + { + "epoch": 2.427172115506933, + "ewc_loss": 0.04869496449828148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4347482394659892e-05, + "grad_norm": 30.725074768066406, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.881711483001709, + "num_tokens": 728041451.0, + "step": 19080 + }, + { + "epoch": 2.4272993257855235, + "ewc_loss": 0.048740774393081665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.437038710922934e-05, + "grad_norm": 30.954021453857422, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8707391023635864, + "num_tokens": 728079217.0, + "step": 19081 + }, + { + "epoch": 2.427426536064114, + "ewc_loss": 0.04871512949466705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4357565052923746e-05, + "grad_norm": 30.757343292236328, + "learning_rate": 1e-06, + "loss": 0.3562, + "mean_token_accuracy": 0.8876655101776123, + "num_tokens": 728116221.0, + "step": 19082 + }, + { + "epoch": 2.4275537463427046, + "ewc_loss": 0.048702191561460495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4351094907615334e-05, + "grad_norm": 30.954805374145508, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8645517826080322, + "num_tokens": 728154861.0, + "step": 19083 + }, + { + "epoch": 2.427680956621295, + "ewc_loss": 0.04870675876736641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4353379558306187e-05, + "grad_norm": 30.74759292602539, + 
"learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8610563278198242, + "num_tokens": 728188276.0, + "step": 19084 + }, + { + "epoch": 2.4278081668998857, + "ewc_loss": 0.048747047781944275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4373523046961054e-05, + "grad_norm": 30.930938720703125, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.88271164894104, + "num_tokens": 728229313.0, + "step": 19085 + }, + { + "epoch": 2.427935377178476, + "ewc_loss": 0.04885975643992424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4429878976661712e-05, + "grad_norm": 30.991966247558594, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8603599071502686, + "num_tokens": 728268706.0, + "step": 19086 + }, + { + "epoch": 2.4280625874570667, + "ewc_loss": 0.04875248670578003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4376242436119355e-05, + "grad_norm": 31.06311798095703, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8854855298995972, + "num_tokens": 728297490.0, + "step": 19087 + }, + { + "epoch": 2.428189797735657, + "ewc_loss": 0.04869387671351433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434693851682823e-05, + "grad_norm": 30.89249038696289, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.864607572555542, + "num_tokens": 728336041.0, + "step": 19088 + }, + { + "epoch": 2.4283170080142478, + "ewc_loss": 0.04863940551877022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431970278848894e-05, + "grad_norm": 30.93092918395996, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.87890625, + "num_tokens": 728372701.0, + "step": 19089 + }, + { + "epoch": 2.428444218292838, + "ewc_loss": 0.04868444427847862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4342221877304837e-05, + "grad_norm": 30.915904998779297, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8747638463973999, + "num_tokens": 728408138.0, + "step": 
19090 + }, + { + "epoch": 2.4285714285714284, + "ewc_loss": 0.048618167638778687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430908352835104e-05, + "grad_norm": 30.84551239013672, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8793050050735474, + "num_tokens": 728452649.0, + "step": 19091 + }, + { + "epoch": 2.428698638850019, + "ewc_loss": 0.048747468739748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4373734049731866e-05, + "grad_norm": 30.910615921020508, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8631911873817444, + "num_tokens": 728490551.0, + "step": 19092 + }, + { + "epoch": 2.4288258491286094, + "ewc_loss": 0.048676393926143646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.433819645375479e-05, + "grad_norm": 30.964675903320312, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8781615495681763, + "num_tokens": 728529558.0, + "step": 19093 + }, + { + "epoch": 2.4289530594072, + "ewc_loss": 0.04866698011755943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4333490728167817e-05, + "grad_norm": 30.90569496154785, + "learning_rate": 1e-06, + "loss": 0.4932, + "mean_token_accuracy": 0.8482305407524109, + "num_tokens": 728573495.0, + "step": 19094 + }, + { + "epoch": 2.4290802696857905, + "ewc_loss": 0.048681531101465225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4340764866792597e-05, + "grad_norm": 31.005653381347656, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8727280497550964, + "num_tokens": 728609166.0, + "step": 19095 + }, + { + "epoch": 2.429207479964381, + "ewc_loss": 0.04867366701364517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4336834030691534e-05, + "grad_norm": 30.731048583984375, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8787637948989868, + "num_tokens": 728653070.0, + "step": 19096 + }, + { + "epoch": 2.4293346902429716, + "ewc_loss": 0.04864365980029106, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.4321829187101685e-05, + "grad_norm": 31.01878547668457, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8746696710586548, + "num_tokens": 728689142.0, + "step": 19097 + }, + { + "epoch": 2.429461900521562, + "ewc_loss": 0.048689477145671844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4344739358639345e-05, + "grad_norm": 30.772750854492188, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8760691285133362, + "num_tokens": 728725615.0, + "step": 19098 + }, + { + "epoch": 2.4295891108001526, + "ewc_loss": 0.04860147833824158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4300739823956974e-05, + "grad_norm": 30.79717445373535, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.862374484539032, + "num_tokens": 728762366.0, + "step": 19099 + }, + { + "epoch": 2.429716321078743, + "ewc_loss": 0.04871299862861633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.435649912513327e-05, + "grad_norm": 30.76555824279785, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8598122000694275, + "num_tokens": 728801951.0, + "step": 19100 + }, + { + "epoch": 2.4298435313573337, + "ewc_loss": 0.04870583117008209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4352915716008283e-05, + "grad_norm": 30.851816177368164, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.880840539932251, + "num_tokens": 728836403.0, + "step": 19101 + }, + { + "epoch": 2.429970741635924, + "ewc_loss": 0.048756830394268036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4378416128456593e-05, + "grad_norm": 30.90274429321289, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8792334794998169, + "num_tokens": 728878485.0, + "step": 19102 + }, + { + "epoch": 2.4300979519145147, + "ewc_loss": 0.0486699678003788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4334984118468128e-05, + "grad_norm": 30.742692947387695, + "learning_rate": 1e-06, + "loss": 
0.3784, + "mean_token_accuracy": 0.8817031979560852, + "num_tokens": 728915935.0, + "step": 19103 + }, + { + "epoch": 2.4302251621931052, + "ewc_loss": 0.04872068017721176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4360340830753557e-05, + "grad_norm": 30.92117691040039, + "learning_rate": 1e-06, + "loss": 0.4751, + "mean_token_accuracy": 0.8548469543457031, + "num_tokens": 728956847.0, + "step": 19104 + }, + { + "epoch": 2.4303523724716958, + "ewc_loss": 0.04876057058572769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4380286049563438e-05, + "grad_norm": 30.84770965576172, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8773619532585144, + "num_tokens": 728994865.0, + "step": 19105 + }, + { + "epoch": 2.4304795827502863, + "ewc_loss": 0.04868495464324951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4342476535821334e-05, + "grad_norm": 30.88703155517578, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.865143895149231, + "num_tokens": 729033030.0, + "step": 19106 + }, + { + "epoch": 2.430606793028877, + "ewc_loss": 0.04875590652227402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4377954105148092e-05, + "grad_norm": 31.05079460144043, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8819576501846313, + "num_tokens": 729070430.0, + "step": 19107 + }, + { + "epoch": 2.4307340033074674, + "ewc_loss": 0.048663169145584106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.43315844272729e-05, + "grad_norm": 30.89723777770996, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8811732530593872, + "num_tokens": 729102127.0, + "step": 19108 + }, + { + "epoch": 2.430861213586058, + "ewc_loss": 0.04860367998480797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430184031254612e-05, + "grad_norm": 30.886127471923828, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8758882284164429, + "num_tokens": 729138128.0, + "step": 19109 + }, + { + "epoch": 
2.4309884238646484, + "ewc_loss": 0.04870959743857384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4354798370040953e-05, + "grad_norm": 30.905057907104492, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8689620494842529, + "num_tokens": 729179756.0, + "step": 19110 + }, + { + "epoch": 2.431115634143239, + "ewc_loss": 0.04865996912121773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432998371659778e-05, + "grad_norm": 30.857650756835938, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.860610842704773, + "num_tokens": 729221356.0, + "step": 19111 + }, + { + "epoch": 2.4312428444218295, + "ewc_loss": 0.04863697290420532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4318485884577967e-05, + "grad_norm": 30.856691360473633, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8754847645759583, + "num_tokens": 729260456.0, + "step": 19112 + }, + { + "epoch": 2.4313700547004196, + "ewc_loss": 0.0487838014960289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4391900296905078e-05, + "grad_norm": 30.965396881103516, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8601473569869995, + "num_tokens": 729299654.0, + "step": 19113 + }, + { + "epoch": 2.4314972649790105, + "ewc_loss": 0.048658180981874466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432909059280064e-05, + "grad_norm": 30.779743194580078, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8750607967376709, + "num_tokens": 729341296.0, + "step": 19114 + }, + { + "epoch": 2.4316244752576006, + "ewc_loss": 0.04874897375702858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4374487111344934e-05, + "grad_norm": 30.93160057067871, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.876109778881073, + "num_tokens": 729379632.0, + "step": 19115 + }, + { + "epoch": 2.431751685536191, + "ewc_loss": 0.04875481501221657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4377406589337625e-05, + "grad_norm": 30.940458297729492, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.873711109161377, + "num_tokens": 729423202.0, + "step": 19116 + }, + { + "epoch": 2.4318788958147817, + "ewc_loss": 0.04865425452589989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4327127903234214e-05, + "grad_norm": 30.82421875, + "learning_rate": 1e-06, + "loss": 0.3527, + "mean_token_accuracy": 0.8899973034858704, + "num_tokens": 729457108.0, + "step": 19117 + }, + { + "epoch": 2.432006106093372, + "ewc_loss": 0.0487581230700016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.437906186969485e-05, + "grad_norm": 31.00986671447754, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8679807782173157, + "num_tokens": 729494065.0, + "step": 19118 + }, + { + "epoch": 2.4321333163719627, + "ewc_loss": 0.04871965944766998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.435982969473116e-05, + "grad_norm": 30.76520347595215, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.875696063041687, + "num_tokens": 729536621.0, + "step": 19119 + }, + { + "epoch": 2.4322605266505533, + "ewc_loss": 0.048738475888967514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.43692375079263e-05, + "grad_norm": 30.931137084960938, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.872984766960144, + "num_tokens": 729574423.0, + "step": 19120 + }, + { + "epoch": 2.432387736929144, + "ewc_loss": 0.04872751235961914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4363756892853417e-05, + "grad_norm": 30.909626007080078, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8666070699691772, + "num_tokens": 729614673.0, + "step": 19121 + }, + { + "epoch": 2.4325149472077343, + "ewc_loss": 0.048692841082811356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434642010484822e-05, + "grad_norm": 30.795045852661133, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 
0.8735511302947998, + "num_tokens": 729653049.0, + "step": 19122 + }, + { + "epoch": 2.432642157486325, + "ewc_loss": 0.04871232435107231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4356162612093613e-05, + "grad_norm": 31.005434036254883, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8731074929237366, + "num_tokens": 729689667.0, + "step": 19123 + }, + { + "epoch": 2.4327693677649154, + "ewc_loss": 0.04871537908911705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4357688744203188e-05, + "grad_norm": 30.866716384887695, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8772444725036621, + "num_tokens": 729734747.0, + "step": 19124 + }, + { + "epoch": 2.432896578043506, + "ewc_loss": 0.048626165837049484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4313083486049436e-05, + "grad_norm": 30.880216598510742, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8674780130386353, + "num_tokens": 729770135.0, + "step": 19125 + }, + { + "epoch": 2.4330237883220964, + "ewc_loss": 0.04866838827729225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.433419467706699e-05, + "grad_norm": 30.828109741210938, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8709475994110107, + "num_tokens": 729809203.0, + "step": 19126 + }, + { + "epoch": 2.433150998600687, + "ewc_loss": 0.04875112697482109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4375563953071833e-05, + "grad_norm": 30.981088638305664, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8780361413955688, + "num_tokens": 729850164.0, + "step": 19127 + }, + { + "epoch": 2.4332782088792775, + "ewc_loss": 0.0486236996948719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4311850211233832e-05, + "grad_norm": 30.856983184814453, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8616450428962708, + "num_tokens": 729884337.0, + "step": 19128 + }, + { + "epoch": 2.433405419157868, + 
"ewc_loss": 0.04864580184221268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4322900571860373e-05, + "grad_norm": 30.86866569519043, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8845627307891846, + "num_tokens": 729924098.0, + "step": 19129 + }, + { + "epoch": 2.4335326294364585, + "ewc_loss": 0.04871170222759247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4355851564905606e-05, + "grad_norm": 31.053497314453125, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8900557160377502, + "num_tokens": 729960671.0, + "step": 19130 + }, + { + "epoch": 2.433659839715049, + "ewc_loss": 0.04864712059497833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432356086501386e-05, + "grad_norm": 30.912355422973633, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8732818365097046, + "num_tokens": 730000780.0, + "step": 19131 + }, + { + "epoch": 2.4337870499936396, + "ewc_loss": 0.04859791696071625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4298959033330902e-05, + "grad_norm": 30.908493041992188, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8699517846107483, + "num_tokens": 730045198.0, + "step": 19132 + }, + { + "epoch": 2.43391426027223, + "ewc_loss": 0.048636432737112045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4318216674146242e-05, + "grad_norm": 30.94481658935547, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8652602434158325, + "num_tokens": 730085010.0, + "step": 19133 + }, + { + "epoch": 2.4340414705508207, + "ewc_loss": 0.04865257814526558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.432628934911918e-05, + "grad_norm": 30.94416046142578, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8664200305938721, + "num_tokens": 730121512.0, + "step": 19134 + }, + { + "epoch": 2.434168680829411, + "ewc_loss": 0.048638343811035156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4319171643583104e-05, + 
"grad_norm": 30.972763061523438, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8705657720565796, + "num_tokens": 730164859.0, + "step": 19135 + }, + { + "epoch": 2.4342958911080017, + "ewc_loss": 0.048615019768476486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430751010251697e-05, + "grad_norm": 31.01443862915039, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8727548122406006, + "num_tokens": 730202273.0, + "step": 19136 + }, + { + "epoch": 2.4344231013865922, + "ewc_loss": 0.048603836447000504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430191852909047e-05, + "grad_norm": 30.894811630249023, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8649736642837524, + "num_tokens": 730248208.0, + "step": 19137 + }, + { + "epoch": 2.4345503116651823, + "ewc_loss": 0.04851692542433739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.425846287223976e-05, + "grad_norm": 30.850839614868164, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8812637329101562, + "num_tokens": 730291084.0, + "step": 19138 + }, + { + "epoch": 2.4346775219437733, + "ewc_loss": 0.04863162711262703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.431581378914416e-05, + "grad_norm": 30.91383934020996, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8773365020751953, + "num_tokens": 730334504.0, + "step": 19139 + }, + { + "epoch": 2.4348047322223634, + "ewc_loss": 0.048600345849990845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4300172299263068e-05, + "grad_norm": 30.882030487060547, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8755509853363037, + "num_tokens": 730373356.0, + "step": 19140 + }, + { + "epoch": 2.434931942500954, + "ewc_loss": 0.04858574643731117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4292872694786638e-05, + "grad_norm": 30.867206573486328, + "learning_rate": 1e-06, + "loss": 0.462, + "mean_token_accuracy": 
0.8589121103286743, + "num_tokens": 730416229.0, + "step": 19141 + }, + { + "epoch": 2.4350591527795444, + "ewc_loss": 0.04863177239894867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.43158865487203e-05, + "grad_norm": 31.03079605102539, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8839893341064453, + "num_tokens": 730458922.0, + "step": 19142 + }, + { + "epoch": 2.435186363058135, + "ewc_loss": 0.04857165366411209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.42858259298373e-05, + "grad_norm": 30.96773910522461, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8698340654373169, + "num_tokens": 730494583.0, + "step": 19143 + }, + { + "epoch": 2.4353135733367255, + "ewc_loss": 0.04858715832233429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4293578462675214e-05, + "grad_norm": 31.04337501525879, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8743778467178345, + "num_tokens": 730529526.0, + "step": 19144 + }, + { + "epoch": 2.435440783615316, + "ewc_loss": 0.04855078458786011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4275392206618562e-05, + "grad_norm": 30.901451110839844, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8756980895996094, + "num_tokens": 730558584.0, + "step": 19145 + }, + { + "epoch": 2.4355679938939065, + "ewc_loss": 0.0485386960208416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.426934770483058e-05, + "grad_norm": 31.033002853393555, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8734878301620483, + "num_tokens": 730598677.0, + "step": 19146 + }, + { + "epoch": 2.435695204172497, + "ewc_loss": 0.048620909452438354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4310455046361312e-05, + "grad_norm": 30.88327407836914, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8815135955810547, + "num_tokens": 730636985.0, + "step": 19147 + }, + { + "epoch": 2.4358224144510876, + "ewc_loss": 
0.04852106422185898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4260532882180996e-05, + "grad_norm": 30.957834243774414, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8542947173118591, + "num_tokens": 730675409.0, + "step": 19148 + }, + { + "epoch": 2.435949624729678, + "ewc_loss": 0.04857772961258888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4288865461130626e-05, + "grad_norm": 30.83074951171875, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8615192174911499, + "num_tokens": 730714739.0, + "step": 19149 + }, + { + "epoch": 2.4360768350082687, + "ewc_loss": 0.04851552098989487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4257760742329992e-05, + "grad_norm": 30.94568634033203, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8710027933120728, + "num_tokens": 730757059.0, + "step": 19150 + }, + { + "epoch": 2.436204045286859, + "ewc_loss": 0.04873540624976158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4367702280869707e-05, + "grad_norm": 30.95637321472168, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8713548183441162, + "num_tokens": 730793469.0, + "step": 19151 + }, + { + "epoch": 2.4363312555654497, + "ewc_loss": 0.04853193089365959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4265966203529388e-05, + "grad_norm": 30.89981460571289, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8773082494735718, + "num_tokens": 730829336.0, + "step": 19152 + }, + { + "epoch": 2.4364584658440402, + "ewc_loss": 0.048708200454711914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.435409987810999e-05, + "grad_norm": 30.929607391357422, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8701428771018982, + "num_tokens": 730869304.0, + "step": 19153 + }, + { + "epoch": 2.4365856761226308, + "ewc_loss": 0.04860657826066017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4303290047100745e-05, + "grad_norm": 
30.862598419189453, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8605127334594727, + "num_tokens": 730905315.0, + "step": 19154 + }, + { + "epoch": 2.4367128864012213, + "ewc_loss": 0.04869670793414116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434835369058419e-05, + "grad_norm": 31.120126724243164, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8633988499641418, + "num_tokens": 730947346.0, + "step": 19155 + }, + { + "epoch": 2.436840096679812, + "ewc_loss": 0.048751287162303925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.437564398860559e-05, + "grad_norm": 30.88957405090332, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8765885829925537, + "num_tokens": 730976907.0, + "step": 19156 + }, + { + "epoch": 2.4369673069584024, + "ewc_loss": 0.04856911301612854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4284556275233626e-05, + "grad_norm": 31.01890754699707, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8677771091461182, + "num_tokens": 731015940.0, + "step": 19157 + }, + { + "epoch": 2.437094517236993, + "ewc_loss": 0.048619892448186874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.430994572932832e-05, + "grad_norm": 30.81462287902832, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8888092637062073, + "num_tokens": 731051345.0, + "step": 19158 + }, + { + "epoch": 2.4372217275155834, + "ewc_loss": 0.048594266176223755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4297132767969742e-05, + "grad_norm": 30.932397842407227, + "learning_rate": 1e-06, + "loss": 0.5021, + "mean_token_accuracy": 0.8402730822563171, + "num_tokens": 731089642.0, + "step": 19159 + }, + { + "epoch": 2.437348937794174, + "ewc_loss": 0.04878632724285126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4393164494540542e-05, + "grad_norm": 30.982158660888672, + "learning_rate": 1e-06, + "loss": 0.3673, + "mean_token_accuracy": 0.8891127109527588, + 
"num_tokens": 731133196.0, + "step": 19160 + }, + { + "epoch": 2.4374761480727645, + "ewc_loss": 0.04865628853440285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4328144718310796e-05, + "grad_norm": 30.8214111328125, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8609004020690918, + "num_tokens": 731175510.0, + "step": 19161 + }, + { + "epoch": 2.437603358351355, + "ewc_loss": 0.04863333702087402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4316668714163825e-05, + "grad_norm": 30.84798240661621, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8796151876449585, + "num_tokens": 731217526.0, + "step": 19162 + }, + { + "epoch": 2.437730568629945, + "ewc_loss": 0.04878552258014679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4392760678892955e-05, + "grad_norm": 30.813732147216797, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8596538305282593, + "num_tokens": 731259525.0, + "step": 19163 + }, + { + "epoch": 2.437857778908536, + "ewc_loss": 0.04868466034531593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434233101666905e-05, + "grad_norm": 30.835676193237305, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8755184412002563, + "num_tokens": 731301480.0, + "step": 19164 + }, + { + "epoch": 2.437984989187126, + "ewc_loss": 0.0487944632768631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.439723175484687e-05, + "grad_norm": 30.847068786621094, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8809129595756531, + "num_tokens": 731347385.0, + "step": 19165 + }, + { + "epoch": 2.4381121994657167, + "ewc_loss": 0.04872201755642891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.436100839986466e-05, + "grad_norm": 30.805505752563477, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8762288093566895, + "num_tokens": 731383666.0, + "step": 19166 + }, + { + "epoch": 2.438239409744307, + "ewc_loss": 0.04875456914305687, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4377284717047587e-05, + "grad_norm": 30.877729415893555, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8717449903488159, + "num_tokens": 731420297.0, + "step": 19167 + }, + { + "epoch": 2.4383666200228977, + "ewc_loss": 0.04873388633131981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.436694376228843e-05, + "grad_norm": 30.953454971313477, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8679245710372925, + "num_tokens": 731455083.0, + "step": 19168 + }, + { + "epoch": 2.4384938303014883, + "ewc_loss": 0.0487312413752079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.436562135699205e-05, + "grad_norm": 30.80020523071289, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8805205225944519, + "num_tokens": 731482555.0, + "step": 19169 + }, + { + "epoch": 2.438621040580079, + "ewc_loss": 0.04870394244790077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4351971660507843e-05, + "grad_norm": 30.89516258239746, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8837695717811584, + "num_tokens": 731519459.0, + "step": 19170 + }, + { + "epoch": 2.4387482508586693, + "ewc_loss": 0.04874502867460251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4372513507842086e-05, + "grad_norm": 30.939361572265625, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8764071464538574, + "num_tokens": 731554577.0, + "step": 19171 + }, + { + "epoch": 2.43887546113726, + "ewc_loss": 0.048727184534072876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4363591364817694e-05, + "grad_norm": 30.790502548217773, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8629356026649475, + "num_tokens": 731594993.0, + "step": 19172 + }, + { + "epoch": 2.4390026714158504, + "ewc_loss": 0.048680949956178665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4340475647477433e-05, + "grad_norm": 30.919998168945312, + 
"learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8769198656082153, + "num_tokens": 731629182.0, + "step": 19173 + }, + { + "epoch": 2.439129881694441, + "ewc_loss": 0.04878689721226692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4393448256887496e-05, + "grad_norm": 30.845294952392578, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8710461854934692, + "num_tokens": 731667661.0, + "step": 19174 + }, + { + "epoch": 2.4392570919730314, + "ewc_loss": 0.04871707782149315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4358538212254643e-05, + "grad_norm": 31.0535945892334, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8725399971008301, + "num_tokens": 731704911.0, + "step": 19175 + }, + { + "epoch": 2.439384302251622, + "ewc_loss": 0.04880492389202118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4402461349382065e-05, + "grad_norm": 30.85439682006836, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.85612952709198, + "num_tokens": 731744990.0, + "step": 19176 + }, + { + "epoch": 2.4395115125302125, + "ewc_loss": 0.048724815249443054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4362407202715985e-05, + "grad_norm": 30.962629318237305, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8732625842094421, + "num_tokens": 731783335.0, + "step": 19177 + }, + { + "epoch": 2.439638722808803, + "ewc_loss": 0.04883727803826332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4418639441137202e-05, + "grad_norm": 31.008092880249023, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8753488659858704, + "num_tokens": 731819206.0, + "step": 19178 + }, + { + "epoch": 2.4397659330873935, + "ewc_loss": 0.04869919642806053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4349597879336216e-05, + "grad_norm": 30.963199615478516, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8684163093566895, + "num_tokens": 731855291.0, 
+ "step": 19179 + }, + { + "epoch": 2.439893143365984, + "ewc_loss": 0.048672303557395935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4336151909665205e-05, + "grad_norm": 30.910316467285156, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8760697841644287, + "num_tokens": 731899392.0, + "step": 19180 + }, + { + "epoch": 2.4400203536445746, + "ewc_loss": 0.048649828881025314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4324914193130098e-05, + "grad_norm": 30.86359214782715, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8706387281417847, + "num_tokens": 731937502.0, + "step": 19181 + }, + { + "epoch": 2.440147563923165, + "ewc_loss": 0.04876785725355148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.438392948533874e-05, + "grad_norm": 30.966293334960938, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8743041753768921, + "num_tokens": 731974946.0, + "step": 19182 + }, + { + "epoch": 2.4402747742017556, + "ewc_loss": 0.04879087954759598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4395440050284378e-05, + "grad_norm": 30.775390625, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8592642545700073, + "num_tokens": 732018732.0, + "step": 19183 + }, + { + "epoch": 2.440401984480346, + "ewc_loss": 0.0487021766602993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.435108763165772e-05, + "grad_norm": 30.91401481628418, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8711032867431641, + "num_tokens": 732052036.0, + "step": 19184 + }, + { + "epoch": 2.4405291947589367, + "ewc_loss": 0.048832882195711136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4416440282948315e-05, + "grad_norm": 30.87847137451172, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8673320412635803, + "num_tokens": 732090944.0, + "step": 19185 + }, + { + "epoch": 2.440656405037527, + "ewc_loss": 0.04873892664909363, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.436946306261234e-05, + "grad_norm": 30.871423721313477, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.874601423740387, + "num_tokens": 732131406.0, + "step": 19186 + }, + { + "epoch": 2.4407836153161178, + "ewc_loss": 0.048723816871643066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4361908799619414e-05, + "grad_norm": 30.7225284576416, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.87051922082901, + "num_tokens": 732171701.0, + "step": 19187 + }, + { + "epoch": 2.440910825594708, + "ewc_loss": 0.04875536262989044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.437768125673756e-05, + "grad_norm": 30.867393493652344, + "learning_rate": 1e-06, + "loss": 0.3841, + "mean_token_accuracy": 0.8801479339599609, + "num_tokens": 732209658.0, + "step": 19188 + }, + { + "epoch": 2.4410380358732984, + "ewc_loss": 0.048896677792072296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4448338081128895e-05, + "grad_norm": 30.813962936401367, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8603094220161438, + "num_tokens": 732251395.0, + "step": 19189 + }, + { + "epoch": 2.441165246151889, + "ewc_loss": 0.04872573912143707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4362869226024486e-05, + "grad_norm": 30.808168411254883, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8619791269302368, + "num_tokens": 732289306.0, + "step": 19190 + }, + { + "epoch": 2.4412924564304794, + "ewc_loss": 0.048894040286540985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4447019313811325e-05, + "grad_norm": 30.79579734802246, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8575670123100281, + "num_tokens": 732337916.0, + "step": 19191 + }, + { + "epoch": 2.44141966670907, + "ewc_loss": 0.048809751868247986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.440487514832057e-05, + "grad_norm": 30.83674430847168, + "learning_rate": 1e-06, + "loss": 
0.4691, + "mean_token_accuracy": 0.8585419654846191, + "num_tokens": 732380358.0, + "step": 19192 + }, + { + "epoch": 2.4415468769876605, + "ewc_loss": 0.04876922070980072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4384609787375666e-05, + "grad_norm": 30.758529663085938, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8824512958526611, + "num_tokens": 732416534.0, + "step": 19193 + }, + { + "epoch": 2.441674087266251, + "ewc_loss": 0.04883170500397682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.441585274937097e-05, + "grad_norm": 30.880020141601562, + "learning_rate": 1e-06, + "loss": 0.4769, + "mean_token_accuracy": 0.8513250350952148, + "num_tokens": 732458927.0, + "step": 19194 + }, + { + "epoch": 2.4418012975448415, + "ewc_loss": 0.04892738163471222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4463690351694822e-05, + "grad_norm": 30.93149757385254, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.8633747696876526, + "num_tokens": 732501261.0, + "step": 19195 + }, + { + "epoch": 2.441928507823432, + "ewc_loss": 0.048807211220264435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4403605493716896e-05, + "grad_norm": 30.741989135742188, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8804451823234558, + "num_tokens": 732541162.0, + "step": 19196 + }, + { + "epoch": 2.4420557181020226, + "ewc_loss": 0.04881119728088379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4405599106103182e-05, + "grad_norm": 30.9173526763916, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8698064684867859, + "num_tokens": 732578237.0, + "step": 19197 + }, + { + "epoch": 2.442182928380613, + "ewc_loss": 0.048909034579992294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445451718813274e-05, + "grad_norm": 30.87213134765625, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8609390258789062, + "num_tokens": 732618624.0, + "step": 19198 + }, + { + "epoch": 
2.4423101386592037, + "ewc_loss": 0.04879974573850632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.439987292746082e-05, + "grad_norm": 30.802167892456055, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.867861270904541, + "num_tokens": 732661747.0, + "step": 19199 + }, + { + "epoch": 2.442437348937794, + "ewc_loss": 0.04885486885905266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4427434254903346e-05, + "grad_norm": 30.873342514038086, + "learning_rate": 1e-06, + "loss": 0.482, + "mean_token_accuracy": 0.850846529006958, + "num_tokens": 732703511.0, + "step": 19200 + }, + { + "epoch": 2.4425645592163847, + "ewc_loss": 0.04882650077342987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4413249775534496e-05, + "grad_norm": 30.917802810668945, + "learning_rate": 1e-06, + "loss": 0.4803, + "mean_token_accuracy": 0.8514705896377563, + "num_tokens": 732746749.0, + "step": 19201 + }, + { + "epoch": 2.4426917694949752, + "ewc_loss": 0.04883122816681862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4415614461759105e-05, + "grad_norm": 30.790231704711914, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.871832549571991, + "num_tokens": 732783184.0, + "step": 19202 + }, + { + "epoch": 2.4428189797735658, + "ewc_loss": 0.048748668283224106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4374334316235036e-05, + "grad_norm": 30.907968521118164, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8671784996986389, + "num_tokens": 732818664.0, + "step": 19203 + }, + { + "epoch": 2.4429461900521563, + "ewc_loss": 0.04880501329898834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4402506824117154e-05, + "grad_norm": 30.97083282470703, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8616034984588623, + "num_tokens": 732854286.0, + "step": 19204 + }, + { + "epoch": 2.443073400330747, + "ewc_loss": 0.04870688170194626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4353441403945908e-05, + "grad_norm": 30.805898666381836, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8785711526870728, + "num_tokens": 732893032.0, + "step": 19205 + }, + { + "epoch": 2.4432006106093374, + "ewc_loss": 0.04879964143037796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4399820176768117e-05, + "grad_norm": 30.996536254882812, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8706489205360413, + "num_tokens": 732929942.0, + "step": 19206 + }, + { + "epoch": 2.443327820887928, + "ewc_loss": 0.048799823969602585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4399911126238294e-05, + "grad_norm": 30.81329345703125, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8666510581970215, + "num_tokens": 732968218.0, + "step": 19207 + }, + { + "epoch": 2.4434550311665184, + "ewc_loss": 0.048837557435035706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4418779503321275e-05, + "grad_norm": 31.0585994720459, + "learning_rate": 1e-06, + "loss": 0.3442, + "mean_token_accuracy": 0.8956813812255859, + "num_tokens": 733001464.0, + "step": 19208 + }, + { + "epoch": 2.443582241445109, + "ewc_loss": 0.04887406900525093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.443703488097526e-05, + "grad_norm": 30.843013763427734, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8632948994636536, + "num_tokens": 733036311.0, + "step": 19209 + }, + { + "epoch": 2.4437094517236995, + "ewc_loss": 0.04880472272634506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.440236130496487e-05, + "grad_norm": 30.9658260345459, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8781076073646545, + "num_tokens": 733070487.0, + "step": 19210 + }, + { + "epoch": 2.4438366620022896, + "ewc_loss": 0.048845190554857254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4422595743089914e-05, + "grad_norm": 30.77423667907715, + "learning_rate": 1e-06, + "loss": 0.4321, + 
"mean_token_accuracy": 0.869943380355835, + "num_tokens": 733110357.0, + "step": 19211 + }, + { + "epoch": 2.4439638722808805, + "ewc_loss": 0.04882132634520531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4410663172602654e-05, + "grad_norm": 30.94354820251465, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8642042279243469, + "num_tokens": 733149070.0, + "step": 19212 + }, + { + "epoch": 2.4440910825594706, + "ewc_loss": 0.048914410173892975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445720565447118e-05, + "grad_norm": 30.880979537963867, + "learning_rate": 1e-06, + "loss": 0.3726, + "mean_token_accuracy": 0.8861404061317444, + "num_tokens": 733180848.0, + "step": 19213 + }, + { + "epoch": 2.444218292838061, + "ewc_loss": 0.048774462193250656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4387230951106176e-05, + "grad_norm": 30.835004806518555, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8738359808921814, + "num_tokens": 733219204.0, + "step": 19214 + }, + { + "epoch": 2.4443455031166517, + "ewc_loss": 0.04886072501540184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4430362827843055e-05, + "grad_norm": 30.932233810424805, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8616240620613098, + "num_tokens": 733255237.0, + "step": 19215 + }, + { + "epoch": 2.444472713395242, + "ewc_loss": 0.048904433846473694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4452216166537255e-05, + "grad_norm": 30.85693359375, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8827944993972778, + "num_tokens": 733293243.0, + "step": 19216 + }, + { + "epoch": 2.4445999236738327, + "ewc_loss": 0.048894450068473816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4447224859613925e-05, + "grad_norm": 30.881999969482422, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8742634654045105, + "num_tokens": 733333530.0, + "step": 19217 + }, + { + "epoch": 
2.4447271339524232, + "ewc_loss": 0.048941388726234436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4470695279887877e-05, + "grad_norm": 30.908966064453125, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8536006212234497, + "num_tokens": 733373790.0, + "step": 19218 + }, + { + "epoch": 2.4448543442310138, + "ewc_loss": 0.04890672490000725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445336212986149e-05, + "grad_norm": 30.852375030517578, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8741931915283203, + "num_tokens": 733416037.0, + "step": 19219 + }, + { + "epoch": 2.4449815545096043, + "ewc_loss": 0.048887982964515686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.444399069645442e-05, + "grad_norm": 30.881954193115234, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8684427738189697, + "num_tokens": 733453014.0, + "step": 19220 + }, + { + "epoch": 2.445108764788195, + "ewc_loss": 0.04884918034076691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.44245893554762e-05, + "grad_norm": 30.88536262512207, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8569482564926147, + "num_tokens": 733491197.0, + "step": 19221 + }, + { + "epoch": 2.4452359750667854, + "ewc_loss": 0.04890940710902214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4454702725051902e-05, + "grad_norm": 30.976797103881836, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8840146064758301, + "num_tokens": 733531142.0, + "step": 19222 + }, + { + "epoch": 2.445363185345376, + "ewc_loss": 0.04890725761651993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445362952130381e-05, + "grad_norm": 30.7525691986084, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8651580810546875, + "num_tokens": 733573916.0, + "step": 19223 + }, + { + "epoch": 2.4454903956239664, + "ewc_loss": 0.04879859462380409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.43992981268093e-05, + "grad_norm": 30.867046356201172, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8662081956863403, + "num_tokens": 733613204.0, + "step": 19224 + }, + { + "epoch": 2.445617605902557, + "ewc_loss": 0.04901872202754021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.450936153763905e-05, + "grad_norm": 30.99369239807129, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.866300106048584, + "num_tokens": 733653618.0, + "step": 19225 + }, + { + "epoch": 2.4457448161811475, + "ewc_loss": 0.04881717637181282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4408587705693208e-05, + "grad_norm": 30.833620071411133, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8869455456733704, + "num_tokens": 733687795.0, + "step": 19226 + }, + { + "epoch": 2.445872026459738, + "ewc_loss": 0.04894612729549408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4473063604091294e-05, + "grad_norm": 31.035926818847656, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8714651465415955, + "num_tokens": 733723574.0, + "step": 19227 + }, + { + "epoch": 2.4459992367383285, + "ewc_loss": 0.04889579862356186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.444789970468264e-05, + "grad_norm": 30.98438262939453, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8776324987411499, + "num_tokens": 733762051.0, + "step": 19228 + }, + { + "epoch": 2.446126447016919, + "ewc_loss": 0.0488414391875267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.442072036501486e-05, + "grad_norm": 30.834510803222656, + "learning_rate": 1e-06, + "loss": 0.365, + "mean_token_accuracy": 0.8882026672363281, + "num_tokens": 733800640.0, + "step": 19229 + }, + { + "epoch": 2.4462536572955096, + "ewc_loss": 0.048873573541641235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.443678749841638e-05, + "grad_norm": 31.12141990661621, + "learning_rate": 1e-06, + "loss": 0.4495, + 
"mean_token_accuracy": 0.8610734939575195, + "num_tokens": 733841321.0, + "step": 19230 + }, + { + "epoch": 2.4463808675741, + "ewc_loss": 0.04889734461903572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4448672775179148e-05, + "grad_norm": 30.913427352905273, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8747802376747131, + "num_tokens": 733874104.0, + "step": 19231 + }, + { + "epoch": 2.4465080778526906, + "ewc_loss": 0.04876810684800148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.438405317661818e-05, + "grad_norm": 30.976408004760742, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8752450346946716, + "num_tokens": 733913127.0, + "step": 19232 + }, + { + "epoch": 2.446635288131281, + "ewc_loss": 0.04890110343694687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445055179123301e-05, + "grad_norm": 30.961143493652344, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8878301978111267, + "num_tokens": 733951900.0, + "step": 19233 + }, + { + "epoch": 2.4467624984098717, + "ewc_loss": 0.04881315305829048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4406575903412886e-05, + "grad_norm": 31.066682815551758, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8733672499656677, + "num_tokens": 733992866.0, + "step": 19234 + }, + { + "epoch": 2.4468897086884622, + "ewc_loss": 0.04883980005979538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.441990000079386e-05, + "grad_norm": 30.899681091308594, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8667691946029663, + "num_tokens": 734029895.0, + "step": 19235 + }, + { + "epoch": 2.4470169189670523, + "ewc_loss": 0.04867108166217804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4335540729225613e-05, + "grad_norm": 30.90930938720703, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8882324695587158, + "num_tokens": 734066954.0, + "step": 19236 + }, + { + "epoch": 
2.4471441292456433, + "ewc_loss": 0.048782676458358765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4391338229179382e-05, + "grad_norm": 31.019203186035156, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.8931013345718384, + "num_tokens": 734102412.0, + "step": 19237 + }, + { + "epoch": 2.4472713395242334, + "ewc_loss": 0.04885678365826607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.442839104332961e-05, + "grad_norm": 30.95686149597168, + "learning_rate": 1e-06, + "loss": 0.3645, + "mean_token_accuracy": 0.8880126476287842, + "num_tokens": 734141400.0, + "step": 19238 + }, + { + "epoch": 2.447398549802824, + "ewc_loss": 0.048690345138311386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.434517227811739e-05, + "grad_norm": 30.93376350402832, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8518943786621094, + "num_tokens": 734176628.0, + "step": 19239 + }, + { + "epoch": 2.4475257600814144, + "ewc_loss": 0.048768505454063416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.438425326545257e-05, + "grad_norm": 30.94716453552246, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.877170205116272, + "num_tokens": 734219756.0, + "step": 19240 + }, + { + "epoch": 2.447652970360005, + "ewc_loss": 0.048750195652246475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4375098291784525e-05, + "grad_norm": 31.00664710998535, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8713803291320801, + "num_tokens": 734250509.0, + "step": 19241 + }, + { + "epoch": 2.4477801806385955, + "ewc_loss": 0.04880145192146301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4400726033491082e-05, + "grad_norm": 30.93208885192871, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8785340785980225, + "num_tokens": 734287887.0, + "step": 19242 + }, + { + "epoch": 2.447907390917186, + "ewc_loss": 0.04875698685646057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4378494345000945e-05, + "grad_norm": 31.001432418823242, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8771283030509949, + "num_tokens": 734330446.0, + "step": 19243 + }, + { + "epoch": 2.4480346011957765, + "ewc_loss": 0.048735152930021286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4367576770600863e-05, + "grad_norm": 30.78300666809082, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8778592348098755, + "num_tokens": 734368427.0, + "step": 19244 + }, + { + "epoch": 2.448161811474367, + "ewc_loss": 0.04873093217611313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.436546674289275e-05, + "grad_norm": 31.004167556762695, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.8904762268066406, + "num_tokens": 734405103.0, + "step": 19245 + }, + { + "epoch": 2.4482890217529576, + "ewc_loss": 0.04878699406981468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.439349736960139e-05, + "grad_norm": 30.880266189575195, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8682379722595215, + "num_tokens": 734448069.0, + "step": 19246 + }, + { + "epoch": 2.448416232031548, + "ewc_loss": 0.048787206411361694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4393602870986797e-05, + "grad_norm": 30.92477035522461, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8587847948074341, + "num_tokens": 734486016.0, + "step": 19247 + }, + { + "epoch": 2.4485434423101387, + "ewc_loss": 0.048851218074560165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.442560980853159e-05, + "grad_norm": 30.997392654418945, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.8605811595916748, + "num_tokens": 734526943.0, + "step": 19248 + }, + { + "epoch": 2.448670652588729, + "ewc_loss": 0.04882347583770752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.441173819534015e-05, + "grad_norm": 30.981651306152344, + "learning_rate": 1e-06, + "loss": 0.3809, + 
"mean_token_accuracy": 0.8846808671951294, + "num_tokens": 734566137.0, + "step": 19249 + }, + { + "epoch": 2.4487978628673197, + "ewc_loss": 0.048786845058202744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4393422791035846e-05, + "grad_norm": 30.989946365356445, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8693784475326538, + "num_tokens": 734602366.0, + "step": 19250 + }, + { + "epoch": 2.4489250731459102, + "ewc_loss": 0.04878024011850357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4390119506279007e-05, + "grad_norm": 30.858896255493164, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.8805840015411377, + "num_tokens": 734644251.0, + "step": 19251 + }, + { + "epoch": 2.4490522834245008, + "ewc_loss": 0.04875889793038368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4379449314437807e-05, + "grad_norm": 30.92278480529785, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8685767650604248, + "num_tokens": 734682938.0, + "step": 19252 + }, + { + "epoch": 2.4491794937030913, + "ewc_loss": 0.04886883869767189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4434419174212962e-05, + "grad_norm": 30.942276000976562, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8734825849533081, + "num_tokens": 734722975.0, + "step": 19253 + }, + { + "epoch": 2.449306703981682, + "ewc_loss": 0.04880935698747635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4404678697464988e-05, + "grad_norm": 30.978845596313477, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.875397801399231, + "num_tokens": 734761361.0, + "step": 19254 + }, + { + "epoch": 2.4494339142602723, + "ewc_loss": 0.048824332654476166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4412165657849982e-05, + "grad_norm": 30.96004295349121, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8700870871543884, + "num_tokens": 734800597.0, + "step": 19255 + }, + { + "epoch": 
2.449561124538863, + "ewc_loss": 0.048811983317136765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4405992007814348e-05, + "grad_norm": 30.982276916503906, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8608930706977844, + "num_tokens": 734840575.0, + "step": 19256 + }, + { + "epoch": 2.4496883348174534, + "ewc_loss": 0.04880279302597046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.440139724058099e-05, + "grad_norm": 30.885730743408203, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8626811504364014, + "num_tokens": 734876943.0, + "step": 19257 + }, + { + "epoch": 2.449815545096044, + "ewc_loss": 0.04879771173000336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4398856112384237e-05, + "grad_norm": 31.071165084838867, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8878369331359863, + "num_tokens": 734916436.0, + "step": 19258 + }, + { + "epoch": 2.4499427553746345, + "ewc_loss": 0.04888693243265152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.44434668275062e-05, + "grad_norm": 30.92266845703125, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8717805743217468, + "num_tokens": 734956336.0, + "step": 19259 + }, + { + "epoch": 2.450069965653225, + "ewc_loss": 0.048718325793743134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.435916212562006e-05, + "grad_norm": 30.993091583251953, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.870261013507843, + "num_tokens": 734997132.0, + "step": 19260 + }, + { + "epoch": 2.450197175931815, + "ewc_loss": 0.04888489842414856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4442450012429617e-05, + "grad_norm": 30.909687042236328, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.860305666923523, + "num_tokens": 735036192.0, + "step": 19261 + }, + { + "epoch": 2.450324386210406, + "ewc_loss": 0.048735205084085464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4367602236452512e-05, + "grad_norm": 30.9779109954834, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.885269284248352, + "num_tokens": 735079368.0, + "step": 19262 + }, + { + "epoch": 2.450451596488996, + "ewc_loss": 0.048951443284749985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4475721147609875e-05, + "grad_norm": 31.041854858398438, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8778813481330872, + "num_tokens": 735109714.0, + "step": 19263 + }, + { + "epoch": 2.4505788067675867, + "ewc_loss": 0.04870849847793579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4354249035241082e-05, + "grad_norm": 30.855621337890625, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8687490224838257, + "num_tokens": 735152712.0, + "step": 19264 + }, + { + "epoch": 2.450706017046177, + "ewc_loss": 0.04873409867286682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4367049263673835e-05, + "grad_norm": 30.841026306152344, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8670984506607056, + "num_tokens": 735193313.0, + "step": 19265 + }, + { + "epoch": 2.4508332273247677, + "ewc_loss": 0.04886539652943611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4432698410237208e-05, + "grad_norm": 30.915555953979492, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8866325616836548, + "num_tokens": 735235413.0, + "step": 19266 + }, + { + "epoch": 2.4509604376033582, + "ewc_loss": 0.04880331829190254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.44016591750551e-05, + "grad_norm": 30.891820907592773, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8726165294647217, + "num_tokens": 735270809.0, + "step": 19267 + }, + { + "epoch": 2.4510876478819488, + "ewc_loss": 0.0488026961684227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4401348127867095e-05, + "grad_norm": 30.8364200592041, + "learning_rate": 1e-06, + "loss": 0.4453, + 
"mean_token_accuracy": 0.865423858165741, + "num_tokens": 735309790.0, + "step": 19268 + }, + { + "epoch": 2.4512148581605393, + "ewc_loss": 0.048826709389686584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.44133552769199e-05, + "grad_norm": 30.964149475097656, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8731452226638794, + "num_tokens": 735341084.0, + "step": 19269 + }, + { + "epoch": 2.45134206843913, + "ewc_loss": 0.04886537045240402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4432685677311383e-05, + "grad_norm": 30.838096618652344, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8889212608337402, + "num_tokens": 735376964.0, + "step": 19270 + }, + { + "epoch": 2.4514692787177204, + "ewc_loss": 0.04884963482618332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4424816729151644e-05, + "grad_norm": 31.026121139526367, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8815377950668335, + "num_tokens": 735413296.0, + "step": 19271 + }, + { + "epoch": 2.451596488996311, + "ewc_loss": 0.048887938261032104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4443968868581578e-05, + "grad_norm": 30.821378707885742, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8771872520446777, + "num_tokens": 735454992.0, + "step": 19272 + }, + { + "epoch": 2.4517236992749014, + "ewc_loss": 0.048796940594911575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4398470486630686e-05, + "grad_norm": 30.94106101989746, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8737339377403259, + "num_tokens": 735493315.0, + "step": 19273 + }, + { + "epoch": 2.451850909553492, + "ewc_loss": 0.048960331827402115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.448016675771214e-05, + "grad_norm": 30.88563346862793, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.881550669670105, + "num_tokens": 735532666.0, + "step": 19274 + }, + { + "epoch": 
2.4519781198320825, + "ewc_loss": 0.04876534640789032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.438267256366089e-05, + "grad_norm": 30.865968704223633, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8701940178871155, + "num_tokens": 735571987.0, + "step": 19275 + }, + { + "epoch": 2.452105330110673, + "ewc_loss": 0.04890204966068268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4451024728477933e-05, + "grad_norm": 30.89495086669922, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8700287342071533, + "num_tokens": 735609409.0, + "step": 19276 + }, + { + "epoch": 2.4522325403892635, + "ewc_loss": 0.048757150769233704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4378576199524105e-05, + "grad_norm": 30.784685134887695, + "learning_rate": 1e-06, + "loss": 0.502, + "mean_token_accuracy": 0.8458828926086426, + "num_tokens": 735647299.0, + "step": 19277 + }, + { + "epoch": 2.452359750667854, + "ewc_loss": 0.048911500722169876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4455750462948345e-05, + "grad_norm": 30.871658325195312, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8798699378967285, + "num_tokens": 735677757.0, + "step": 19278 + }, + { + "epoch": 2.4524869609464446, + "ewc_loss": 0.04896634817123413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4483173547196202e-05, + "grad_norm": 30.88528823852539, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8778838515281677, + "num_tokens": 735723924.0, + "step": 19279 + }, + { + "epoch": 2.452614171225035, + "ewc_loss": 0.048913560807704926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445678001095075e-05, + "grad_norm": 30.84341049194336, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8574711084365845, + "num_tokens": 735758105.0, + "step": 19280 + }, + { + "epoch": 2.4527413815036256, + "ewc_loss": 0.048977408558130264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4488705093972385e-05, + "grad_norm": 30.86765480041504, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8687626719474792, + "num_tokens": 735797603.0, + "step": 19281 + }, + { + "epoch": 2.452868591782216, + "ewc_loss": 0.04896111786365509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4480559659423307e-05, + "grad_norm": 30.898067474365234, + "learning_rate": 1e-06, + "loss": 0.5016, + "mean_token_accuracy": 0.8458236455917358, + "num_tokens": 735838958.0, + "step": 19282 + }, + { + "epoch": 2.4529958020608067, + "ewc_loss": 0.048942603170871735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4471301003359258e-05, + "grad_norm": 30.839570999145508, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.864112138748169, + "num_tokens": 735878894.0, + "step": 19283 + }, + { + "epoch": 2.453123012339397, + "ewc_loss": 0.049016647040843964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4508322894689627e-05, + "grad_norm": 31.033050537109375, + "learning_rate": 1e-06, + "loss": 0.3272, + "mean_token_accuracy": 0.9015854001045227, + "num_tokens": 735913222.0, + "step": 19284 + }, + { + "epoch": 2.4532502226179878, + "ewc_loss": 0.048952359706163406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4476179532939568e-05, + "grad_norm": 30.93728256225586, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8791484832763672, + "num_tokens": 735951884.0, + "step": 19285 + }, + { + "epoch": 2.453377432896578, + "ewc_loss": 0.048850055783987045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4425027731922455e-05, + "grad_norm": 30.83450698852539, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8678975701332092, + "num_tokens": 735998955.0, + "step": 19286 + }, + { + "epoch": 2.4535046431751684, + "ewc_loss": 0.04888259992003441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4441300411126576e-05, + "grad_norm": 30.95157814025879, + "learning_rate": 1e-06, + "loss": 0.4152, + 
"mean_token_accuracy": 0.8699169158935547, + "num_tokens": 736039631.0, + "step": 19287 + }, + { + "epoch": 2.453631853453759, + "ewc_loss": 0.048944052308797836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4472026780131273e-05, + "grad_norm": 30.928667068481445, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8683178424835205, + "num_tokens": 736085100.0, + "step": 19288 + }, + { + "epoch": 2.4537590637323494, + "ewc_loss": 0.04895316809415817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4476583348587155e-05, + "grad_norm": 31.008880615234375, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 0.8866239786148071, + "num_tokens": 736119680.0, + "step": 19289 + }, + { + "epoch": 2.45388627401094, + "ewc_loss": 0.04885469749569893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.442734876240138e-05, + "grad_norm": 30.906606674194336, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8832768201828003, + "num_tokens": 736159769.0, + "step": 19290 + }, + { + "epoch": 2.4540134842895305, + "ewc_loss": 0.048833176493644714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4416587621090002e-05, + "grad_norm": 30.973873138427734, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8779861927032471, + "num_tokens": 736199566.0, + "step": 19291 + }, + { + "epoch": 2.454140694568121, + "ewc_loss": 0.048774950206279755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.438747469568625e-05, + "grad_norm": 30.884140014648438, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8766192197799683, + "num_tokens": 736239238.0, + "step": 19292 + }, + { + "epoch": 2.4542679048467115, + "ewc_loss": 0.04879302531480789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.439651325403247e-05, + "grad_norm": 31.05162239074707, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8768654465675354, + "num_tokens": 736274576.0, + "step": 19293 + }, + { + "epoch": 
2.454395115125302, + "ewc_loss": 0.048855870962142944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4427936295978725e-05, + "grad_norm": 30.969493865966797, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8658947944641113, + "num_tokens": 736307509.0, + "step": 19294 + }, + { + "epoch": 2.4545223254038926, + "ewc_loss": 0.0488022081553936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.440110438328702e-05, + "grad_norm": 31.020170211791992, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8683652281761169, + "num_tokens": 736348313.0, + "step": 19295 + }, + { + "epoch": 2.454649535682483, + "ewc_loss": 0.04881708696484566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4408544049947523e-05, + "grad_norm": 30.92022705078125, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8681240081787109, + "num_tokens": 736380716.0, + "step": 19296 + }, + { + "epoch": 2.4547767459610736, + "ewc_loss": 0.048768579959869385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4384289645240642e-05, + "grad_norm": 30.957942962646484, + "learning_rate": 1e-06, + "loss": 0.4855, + "mean_token_accuracy": 0.850305438041687, + "num_tokens": 736422669.0, + "step": 19297 + }, + { + "epoch": 2.454903956239664, + "ewc_loss": 0.0488952174782753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4447608666378073e-05, + "grad_norm": 31.045604705810547, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8796718716621399, + "num_tokens": 736452929.0, + "step": 19298 + }, + { + "epoch": 2.4550311665182547, + "ewc_loss": 0.04879286512732506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4396433218498714e-05, + "grad_norm": 30.889009475708008, + "learning_rate": 1e-06, + "loss": 0.4072, + "mean_token_accuracy": 0.8741750717163086, + "num_tokens": 736486755.0, + "step": 19299 + }, + { + "epoch": 2.4551583767968452, + "ewc_loss": 0.04879346489906311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.43967333517503e-05, + "grad_norm": 30.925962448120117, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8812283277511597, + "num_tokens": 736521542.0, + "step": 19300 + }, + { + "epoch": 2.4552855870754358, + "ewc_loss": 0.04894215986132622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4471079086652026e-05, + "grad_norm": 30.883325576782227, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8734423518180847, + "num_tokens": 736557312.0, + "step": 19301 + }, + { + "epoch": 2.4554127973540263, + "ewc_loss": 0.04889177903532982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4445889721391723e-05, + "grad_norm": 30.94443702697754, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8675944805145264, + "num_tokens": 736596469.0, + "step": 19302 + }, + { + "epoch": 2.455540007632617, + "ewc_loss": 0.048931460827589035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.44657312578056e-05, + "grad_norm": 30.839384078979492, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8573029041290283, + "num_tokens": 736636078.0, + "step": 19303 + }, + { + "epoch": 2.4556672179112073, + "ewc_loss": 0.048928774893283844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.446438702463638e-05, + "grad_norm": 30.906898498535156, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8839174509048462, + "num_tokens": 736674117.0, + "step": 19304 + }, + { + "epoch": 2.455794428189798, + "ewc_loss": 0.04899723082780838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4498614948242903e-05, + "grad_norm": 30.888147354125977, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8695799708366394, + "num_tokens": 736709116.0, + "step": 19305 + }, + { + "epoch": 2.4559216384683884, + "ewc_loss": 0.049000196158885956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4500097424606793e-05, + "grad_norm": 31.00870132446289, + "learning_rate": 1e-06, + "loss": 0.4495, + 
"mean_token_accuracy": 0.8638099431991577, + "num_tokens": 736750383.0, + "step": 19306 + }, + { + "epoch": 2.456048848746979, + "ewc_loss": 0.04900623485445976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.450311694701668e-05, + "grad_norm": 30.888771057128906, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8611160516738892, + "num_tokens": 736790836.0, + "step": 19307 + }, + { + "epoch": 2.4561760590255695, + "ewc_loss": 0.048930056393146515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4465029127895832e-05, + "grad_norm": 30.881542205810547, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8697233200073242, + "num_tokens": 736826085.0, + "step": 19308 + }, + { + "epoch": 2.4563032693041595, + "ewc_loss": 0.049079205840826035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4539602236473e-05, + "grad_norm": 30.95702362060547, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8715535402297974, + "num_tokens": 736860599.0, + "step": 19309 + }, + { + "epoch": 2.4564304795827505, + "ewc_loss": 0.048907551914453506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4453775040456094e-05, + "grad_norm": 30.763797760009766, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8886973857879639, + "num_tokens": 736893286.0, + "step": 19310 + }, + { + "epoch": 2.4565576898613406, + "ewc_loss": 0.04900575801730156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4502878659404814e-05, + "grad_norm": 31.031593322753906, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8818314671516418, + "num_tokens": 736931597.0, + "step": 19311 + }, + { + "epoch": 2.456684900139931, + "ewc_loss": 0.04905352741479874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4526763809262775e-05, + "grad_norm": 30.879623413085938, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8796809315681458, + "num_tokens": 736972183.0, + "step": 19312 + }, + { + "epoch": 
2.4568121104185217, + "ewc_loss": 0.048938050866127014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4469025447615422e-05, + "grad_norm": 30.937732696533203, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8792368173599243, + "num_tokens": 737004184.0, + "step": 19313 + }, + { + "epoch": 2.456939320697112, + "ewc_loss": 0.049079857766628265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.453992965456564e-05, + "grad_norm": 30.986066818237305, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8725305199623108, + "num_tokens": 737042644.0, + "step": 19314 + }, + { + "epoch": 2.4570665309757027, + "ewc_loss": 0.04898112267255783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4490560463164002e-05, + "grad_norm": 30.96185302734375, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8626309633255005, + "num_tokens": 737074830.0, + "step": 19315 + }, + { + "epoch": 2.4571937412542932, + "ewc_loss": 0.04896426945924759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4482134904246777e-05, + "grad_norm": 30.94769287109375, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8792852163314819, + "num_tokens": 737107756.0, + "step": 19316 + }, + { + "epoch": 2.4573209515328838, + "ewc_loss": 0.04899632930755615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4498163838870823e-05, + "grad_norm": 31.053260803222656, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8811241984367371, + "num_tokens": 737143457.0, + "step": 19317 + }, + { + "epoch": 2.4574481618114743, + "ewc_loss": 0.04905257746577263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452628905302845e-05, + "grad_norm": 30.94470977783203, + "learning_rate": 1e-06, + "loss": 0.4301, + "mean_token_accuracy": 0.8679126501083374, + "num_tokens": 737177005.0, + "step": 19318 + }, + { + "epoch": 2.457575372090065, + "ewc_loss": 0.048840176314115524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4420087356702425e-05, + "grad_norm": 30.896631240844727, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8638726472854614, + "num_tokens": 737218265.0, + "step": 19319 + }, + { + "epoch": 2.4577025823686554, + "ewc_loss": 0.04904463514685631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452231819916051e-05, + "grad_norm": 31.023386001586914, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8627384305000305, + "num_tokens": 737259495.0, + "step": 19320 + }, + { + "epoch": 2.457829792647246, + "ewc_loss": 0.04896486923098564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4482435037498362e-05, + "grad_norm": 30.950794219970703, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8644012212753296, + "num_tokens": 737300073.0, + "step": 19321 + }, + { + "epoch": 2.4579570029258364, + "ewc_loss": 0.048953406512737274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.447670340188779e-05, + "grad_norm": 30.94923973083496, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8733384013175964, + "num_tokens": 737334716.0, + "step": 19322 + }, + { + "epoch": 2.458084213204427, + "ewc_loss": 0.04901767522096634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.450883766869083e-05, + "grad_norm": 31.010160446166992, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8762270212173462, + "num_tokens": 737377086.0, + "step": 19323 + }, + { + "epoch": 2.4582114234830175, + "ewc_loss": 0.049047958105802536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4523978936485946e-05, + "grad_norm": 30.955249786376953, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8607630729675293, + "num_tokens": 737418259.0, + "step": 19324 + }, + { + "epoch": 2.458338633761608, + "ewc_loss": 0.04905523359775543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4527616915293038e-05, + "grad_norm": 31.12171173095703, + "learning_rate": 1e-06, + "loss": 0.4457, + 
"mean_token_accuracy": 0.8623838424682617, + "num_tokens": 737453150.0, + "step": 19325 + }, + { + "epoch": 2.4584658440401985, + "ewc_loss": 0.048974379897117615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.448718987579923e-05, + "grad_norm": 30.842750549316406, + "learning_rate": 1e-06, + "loss": 0.3719, + "mean_token_accuracy": 0.8870645761489868, + "num_tokens": 737495552.0, + "step": 19326 + }, + { + "epoch": 2.458593054318789, + "ewc_loss": 0.04897298291325569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.448649138386827e-05, + "grad_norm": 31.053407669067383, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8777816295623779, + "num_tokens": 737537518.0, + "step": 19327 + }, + { + "epoch": 2.4587202645973796, + "ewc_loss": 0.04904457554221153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452228727634065e-05, + "grad_norm": 30.958139419555664, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.867824912071228, + "num_tokens": 737577472.0, + "step": 19328 + }, + { + "epoch": 2.45884747487597, + "ewc_loss": 0.048915717750787735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4457858671667054e-05, + "grad_norm": 30.978239059448242, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8630695343017578, + "num_tokens": 737617250.0, + "step": 19329 + }, + { + "epoch": 2.4589746851545606, + "ewc_loss": 0.048997700214385986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.449884959787596e-05, + "grad_norm": 31.06356430053711, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8763254880905151, + "num_tokens": 737655544.0, + "step": 19330 + }, + { + "epoch": 2.459101895433151, + "ewc_loss": 0.04886027052998543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4430135454167612e-05, + "grad_norm": 30.980743408203125, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8754787445068359, + "num_tokens": 737688644.0, + "step": 19331 + }, + { + "epoch": 
2.4592291057117417, + "ewc_loss": 0.04890841245651245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4454206140944734e-05, + "grad_norm": 30.91773223876953, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8763749599456787, + "num_tokens": 737730264.0, + "step": 19332 + }, + { + "epoch": 2.459356315990332, + "ewc_loss": 0.04892616346478462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4463080990244634e-05, + "grad_norm": 31.030057907104492, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8665681481361389, + "num_tokens": 737763578.0, + "step": 19333 + }, + { + "epoch": 2.4594835262689223, + "ewc_loss": 0.048839833587408066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.441991637169849e-05, + "grad_norm": 30.85918617248535, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8733072876930237, + "num_tokens": 737804062.0, + "step": 19334 + }, + { + "epoch": 2.4596107365475133, + "ewc_loss": 0.04886794090270996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4433969883830287e-05, + "grad_norm": 30.940919876098633, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8866083025932312, + "num_tokens": 737845608.0, + "step": 19335 + }, + { + "epoch": 2.4597379468261034, + "ewc_loss": 0.04888833686709404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4444168957415968e-05, + "grad_norm": 30.966022491455078, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8539441227912903, + "num_tokens": 737881366.0, + "step": 19336 + }, + { + "epoch": 2.459865157104694, + "ewc_loss": 0.048848818987607956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.442440927552525e-05, + "grad_norm": 31.022991180419922, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8645836114883423, + "num_tokens": 737922595.0, + "step": 19337 + }, + { + "epoch": 2.4599923673832844, + "ewc_loss": 0.04893529415130615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4467646653647535e-05, + "grad_norm": 30.920866012573242, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8794633150100708, + "num_tokens": 737956368.0, + "step": 19338 + }, + { + "epoch": 2.460119577661875, + "ewc_loss": 0.048764750361442566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4382376068388112e-05, + "grad_norm": 30.99262237548828, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8762515783309937, + "num_tokens": 738000167.0, + "step": 19339 + }, + { + "epoch": 2.4602467879404655, + "ewc_loss": 0.049009840935468674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4504921384504996e-05, + "grad_norm": 31.032756805419922, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8650078773498535, + "num_tokens": 738040850.0, + "step": 19340 + }, + { + "epoch": 2.460373998219056, + "ewc_loss": 0.048771172761917114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.438558658468537e-05, + "grad_norm": 30.95969581604004, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8703835010528564, + "num_tokens": 738080370.0, + "step": 19341 + }, + { + "epoch": 2.4605012084976465, + "ewc_loss": 0.04886488616466522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.443244375172071e-05, + "grad_norm": 31.02595329284668, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.876640796661377, + "num_tokens": 738116751.0, + "step": 19342 + }, + { + "epoch": 2.460628418776237, + "ewc_loss": 0.04879431053996086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4397155357291922e-05, + "grad_norm": 30.95332145690918, + "learning_rate": 1e-06, + "loss": 0.3436, + "mean_token_accuracy": 0.8907378315925598, + "num_tokens": 738154597.0, + "step": 19343 + }, + { + "epoch": 2.4607556290548276, + "ewc_loss": 0.048802971839904785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4401486371061765e-05, + "grad_norm": 30.951444625854492, + "learning_rate": 1e-06, + "loss": 0.3953, + 
"mean_token_accuracy": 0.8793994784355164, + "num_tokens": 738186484.0, + "step": 19344 + }, + { + "epoch": 2.460882839333418, + "ewc_loss": 0.048870038241147995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4435019440716133e-05, + "grad_norm": 30.87937355041504, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8756183385848999, + "num_tokens": 738217722.0, + "step": 19345 + }, + { + "epoch": 2.4610100496120086, + "ewc_loss": 0.048866502940654755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4433251383015886e-05, + "grad_norm": 30.993730545043945, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8856420516967773, + "num_tokens": 738255304.0, + "step": 19346 + }, + { + "epoch": 2.461137259890599, + "ewc_loss": 0.048905059695243835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4452529032714665e-05, + "grad_norm": 30.97088623046875, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8678137063980103, + "num_tokens": 738292162.0, + "step": 19347 + }, + { + "epoch": 2.4612644701691897, + "ewc_loss": 0.048904672265052795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445233621983789e-05, + "grad_norm": 31.05111312866211, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.884758710861206, + "num_tokens": 738332851.0, + "step": 19348 + }, + { + "epoch": 2.4613916804477802, + "ewc_loss": 0.04884776473045349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.442388176859822e-05, + "grad_norm": 30.86386489868164, + "learning_rate": 1e-06, + "loss": 0.3833, + "mean_token_accuracy": 0.8817411661148071, + "num_tokens": 738375120.0, + "step": 19349 + }, + { + "epoch": 2.4615188907263708, + "ewc_loss": 0.048835691064596176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4417846361757256e-05, + "grad_norm": 30.998409271240234, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8637040853500366, + "num_tokens": 738413857.0, + "step": 19350 + }, + { + "epoch": 
2.4616461010049613, + "ewc_loss": 0.048899129033088684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4449564079986885e-05, + "grad_norm": 30.962949752807617, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.866080641746521, + "num_tokens": 738450097.0, + "step": 19351 + }, + { + "epoch": 2.461773311283552, + "ewc_loss": 0.04878779128193855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4393895728280768e-05, + "grad_norm": 31.004398345947266, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.8546104431152344, + "num_tokens": 738483784.0, + "step": 19352 + }, + { + "epoch": 2.4619005215621423, + "ewc_loss": 0.048877980560064316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4438990294584073e-05, + "grad_norm": 30.906753540039062, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8529985547065735, + "num_tokens": 738526820.0, + "step": 19353 + }, + { + "epoch": 2.462027731840733, + "ewc_loss": 0.04895087704062462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4475439204252325e-05, + "grad_norm": 31.1019229888916, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8587920665740967, + "num_tokens": 738569049.0, + "step": 19354 + }, + { + "epoch": 2.4621549421193234, + "ewc_loss": 0.04880316182971001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.440158095851075e-05, + "grad_norm": 30.825895309448242, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.868693470954895, + "num_tokens": 738608320.0, + "step": 19355 + }, + { + "epoch": 2.462282152397914, + "ewc_loss": 0.048820629715919495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4410313926637173e-05, + "grad_norm": 30.93262481689453, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.886549174785614, + "num_tokens": 738645566.0, + "step": 19356 + }, + { + "epoch": 2.4624093626765045, + "ewc_loss": 0.04892956465482712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.446478174533695e-05, + "grad_norm": 30.986175537109375, + "learning_rate": 1e-06, + "loss": 0.4912, + "mean_token_accuracy": 0.849075198173523, + "num_tokens": 738686184.0, + "step": 19357 + }, + { + "epoch": 2.462536572955095, + "ewc_loss": 0.048861294984817505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4430646590190008e-05, + "grad_norm": 30.899152755737305, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8581913709640503, + "num_tokens": 738728820.0, + "step": 19358 + }, + { + "epoch": 2.462663783233685, + "ewc_loss": 0.04891861975193024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4459310225211084e-05, + "grad_norm": 30.943649291992188, + "learning_rate": 1e-06, + "loss": 0.5088, + "mean_token_accuracy": 0.8427327871322632, + "num_tokens": 738760180.0, + "step": 19359 + }, + { + "epoch": 2.462790993512276, + "ewc_loss": 0.048944782465696335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4472390578011982e-05, + "grad_norm": 30.861671447753906, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8799725770950317, + "num_tokens": 738795501.0, + "step": 19360 + }, + { + "epoch": 2.462918203790866, + "ewc_loss": 0.04898328706622124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.449164276185911e-05, + "grad_norm": 31.102428436279297, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8620535135269165, + "num_tokens": 738831027.0, + "step": 19361 + }, + { + "epoch": 2.4630454140694567, + "ewc_loss": 0.04894580319523811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.447290171403438e-05, + "grad_norm": 30.9342041015625, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8550360798835754, + "num_tokens": 738870383.0, + "step": 19362 + }, + { + "epoch": 2.463172624348047, + "ewc_loss": 0.048883408308029175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4441704226774164e-05, + "grad_norm": 31.052370071411133, + "learning_rate": 1e-06, + "loss": 0.3933, + 
"mean_token_accuracy": 0.8764371275901794, + "num_tokens": 738908231.0, + "step": 19363 + }, + { + "epoch": 2.4632998346266377, + "ewc_loss": 0.04897446185350418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4487231712555513e-05, + "grad_norm": 31.080869674682617, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8626737594604492, + "num_tokens": 738950927.0, + "step": 19364 + }, + { + "epoch": 2.4634270449052282, + "ewc_loss": 0.048905570060014725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4452785510220565e-05, + "grad_norm": 30.93876838684082, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8839112520217896, + "num_tokens": 738989857.0, + "step": 19365 + }, + { + "epoch": 2.4635542551838188, + "ewc_loss": 0.048908691853284836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4454346203128807e-05, + "grad_norm": 30.983346939086914, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.8857045769691467, + "num_tokens": 739021281.0, + "step": 19366 + }, + { + "epoch": 2.4636814654624093, + "ewc_loss": 0.04891921579837799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4459608539473265e-05, + "grad_norm": 30.968122482299805, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8786892890930176, + "num_tokens": 739063475.0, + "step": 19367 + }, + { + "epoch": 2.463808675741, + "ewc_loss": 0.04900369048118591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45018454734236e-05, + "grad_norm": 31.027511596679688, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8678756952285767, + "num_tokens": 739107083.0, + "step": 19368 + }, + { + "epoch": 2.4639358860195903, + "ewc_loss": 0.048982907086610794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4491453586961143e-05, + "grad_norm": 31.044540405273438, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8508885502815247, + "num_tokens": 739152002.0, + "step": 19369 + }, + { + "epoch": 
2.464063096298181, + "ewc_loss": 0.048911355435848236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4455677703372203e-05, + "grad_norm": 30.920818328857422, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8691375255584717, + "num_tokens": 739191781.0, + "step": 19370 + }, + { + "epoch": 2.4641903065767714, + "ewc_loss": 0.04884898290038109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.442449113004841e-05, + "grad_norm": 31.070697784423828, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8917044401168823, + "num_tokens": 739230374.0, + "step": 19371 + }, + { + "epoch": 2.464317516855362, + "ewc_loss": 0.04895686358213425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4478431441821158e-05, + "grad_norm": 30.922348022460938, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8688552379608154, + "num_tokens": 739267310.0, + "step": 19372 + }, + { + "epoch": 2.4644447271339525, + "ewc_loss": 0.04883687198162079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4418435714324005e-05, + "grad_norm": 31.02326202392578, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8747850060462952, + "num_tokens": 739305074.0, + "step": 19373 + }, + { + "epoch": 2.464571937412543, + "ewc_loss": 0.04897948354482651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4489741917932406e-05, + "grad_norm": 31.03000831604004, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8631237745285034, + "num_tokens": 739343028.0, + "step": 19374 + }, + { + "epoch": 2.4646991476911335, + "ewc_loss": 0.048878252506256104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.443912671878934e-05, + "grad_norm": 31.037147521972656, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8704972863197327, + "num_tokens": 739384364.0, + "step": 19375 + }, + { + "epoch": 2.464826357969724, + "ewc_loss": 0.048899102956056595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.444955134706106e-05, + "grad_norm": 30.927858352661133, + "learning_rate": 1e-06, + "loss": 0.3684, + "mean_token_accuracy": 0.8881180286407471, + "num_tokens": 739423564.0, + "step": 19376 + }, + { + "epoch": 2.4649535682483146, + "ewc_loss": 0.048877205699682236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4438602849841118e-05, + "grad_norm": 31.020862579345703, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8804082274436951, + "num_tokens": 739457609.0, + "step": 19377 + }, + { + "epoch": 2.465080778526905, + "ewc_loss": 0.04900353401899338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4501767256879248e-05, + "grad_norm": 30.985136032104492, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.872612714767456, + "num_tokens": 739497204.0, + "step": 19378 + }, + { + "epoch": 2.4652079888054956, + "ewc_loss": 0.048805758357048035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.440287971694488e-05, + "grad_norm": 30.865766525268555, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.866222620010376, + "num_tokens": 739543688.0, + "step": 19379 + }, + { + "epoch": 2.465335199084086, + "ewc_loss": 0.04896470904350281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4482355001964606e-05, + "grad_norm": 31.094533920288086, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.884097695350647, + "num_tokens": 739578221.0, + "step": 19380 + }, + { + "epoch": 2.4654624093626767, + "ewc_loss": 0.04887630417943001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.443815174046904e-05, + "grad_norm": 30.971059799194336, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8771178126335144, + "num_tokens": 739615892.0, + "step": 19381 + }, + { + "epoch": 2.4655896196412668, + "ewc_loss": 0.04888434335589409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4442171707050875e-05, + "grad_norm": 31.00162124633789, + "learning_rate": 1e-06, + "loss": 0.4487, + 
"mean_token_accuracy": 0.857205331325531, + "num_tokens": 739657697.0, + "step": 19382 + }, + { + "epoch": 2.4657168299198577, + "ewc_loss": 0.04885499179363251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4427496100543067e-05, + "grad_norm": 30.91390037536621, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8804551362991333, + "num_tokens": 739694173.0, + "step": 19383 + }, + { + "epoch": 2.465844040198448, + "ewc_loss": 0.04899522662162781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.449761268508155e-05, + "grad_norm": 31.184165954589844, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8687008619308472, + "num_tokens": 739734111.0, + "step": 19384 + }, + { + "epoch": 2.4659712504770384, + "ewc_loss": 0.04889444261789322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4447221221635118e-05, + "grad_norm": 31.035545349121094, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.873908281326294, + "num_tokens": 739771409.0, + "step": 19385 + }, + { + "epoch": 2.466098460755629, + "ewc_loss": 0.048813097178936005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4406548618571833e-05, + "grad_norm": 30.987567901611328, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8591135740280151, + "num_tokens": 739807185.0, + "step": 19386 + }, + { + "epoch": 2.4662256710342194, + "ewc_loss": 0.04890696704387665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4453484002151527e-05, + "grad_norm": 31.057723999023438, + "learning_rate": 1e-06, + "loss": 0.3826, + "mean_token_accuracy": 0.8842225670814514, + "num_tokens": 739837750.0, + "step": 19387 + }, + { + "epoch": 2.46635288131281, + "ewc_loss": 0.04889548569917679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4447743271593936e-05, + "grad_norm": 31.045326232910156, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8708103895187378, + "num_tokens": 739879516.0, + "step": 19388 + }, + { + "epoch": 
2.4664800915914005, + "ewc_loss": 0.04880433902144432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4402170311077498e-05, + "grad_norm": 30.936717987060547, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8785874843597412, + "num_tokens": 739918688.0, + "step": 19389 + }, + { + "epoch": 2.466607301869991, + "ewc_loss": 0.04888718202710152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.444359051878564e-05, + "grad_norm": 31.08761978149414, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.875876784324646, + "num_tokens": 739958978.0, + "step": 19390 + }, + { + "epoch": 2.4667345121485815, + "ewc_loss": 0.04900212958455086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.450106512696948e-05, + "grad_norm": 30.8583927154541, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8768903017044067, + "num_tokens": 740002428.0, + "step": 19391 + }, + { + "epoch": 2.466861722427172, + "ewc_loss": 0.04886430874466896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4432154532405548e-05, + "grad_norm": 31.03643798828125, + "learning_rate": 1e-06, + "loss": 0.3614, + "mean_token_accuracy": 0.8919259309768677, + "num_tokens": 740045856.0, + "step": 19392 + }, + { + "epoch": 2.4669889327057626, + "ewc_loss": 0.04905584827065468, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4527924324502237e-05, + "grad_norm": 31.003345489501953, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8699730038642883, + "num_tokens": 740084512.0, + "step": 19393 + }, + { + "epoch": 2.467116142984353, + "ewc_loss": 0.048809174448251724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.440458774799481e-05, + "grad_norm": 30.923511505126953, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8830140829086304, + "num_tokens": 740129624.0, + "step": 19394 + }, + { + "epoch": 2.4672433532629436, + "ewc_loss": 0.04908185824751854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4540928279748186e-05, + "grad_norm": 31.10763168334961, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8548132181167603, + "num_tokens": 740168368.0, + "step": 19395 + }, + { + "epoch": 2.467370563541534, + "ewc_loss": 0.04897995665669441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4489978386554867e-05, + "grad_norm": 30.891881942749023, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8658220171928406, + "num_tokens": 740206670.0, + "step": 19396 + }, + { + "epoch": 2.4674977738201247, + "ewc_loss": 0.04887022823095322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4435114028165117e-05, + "grad_norm": 31.04150390625, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8505189418792725, + "num_tokens": 740246422.0, + "step": 19397 + }, + { + "epoch": 2.4676249840987152, + "ewc_loss": 0.049017421901226044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4508710339432582e-05, + "grad_norm": 30.933849334716797, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8685171604156494, + "num_tokens": 740284450.0, + "step": 19398 + }, + { + "epoch": 2.4677521943773058, + "ewc_loss": 0.04888395965099335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4441980713163503e-05, + "grad_norm": 30.950725555419922, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8495564460754395, + "num_tokens": 740319062.0, + "step": 19399 + }, + { + "epoch": 2.4678794046558963, + "ewc_loss": 0.04906933754682541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4534669137210585e-05, + "grad_norm": 31.022714614868164, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8725981712341309, + "num_tokens": 740353996.0, + "step": 19400 + }, + { + "epoch": 2.468006614934487, + "ewc_loss": 0.04886593669652939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4432967620668933e-05, + "grad_norm": 30.9676570892334, + "learning_rate": 1e-06, + "loss": 0.4171, + 
"mean_token_accuracy": 0.8711921572685242, + "num_tokens": 740394694.0, + "step": 19401 + }, + { + "epoch": 2.4681338252130773, + "ewc_loss": 0.049050070345401764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4525035769329406e-05, + "grad_norm": 30.92753028869629, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8929139375686646, + "num_tokens": 740436567.0, + "step": 19402 + }, + { + "epoch": 2.468261035491668, + "ewc_loss": 0.04896232485771179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.448116174491588e-05, + "grad_norm": 31.01568603515625, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8644304871559143, + "num_tokens": 740476458.0, + "step": 19403 + }, + { + "epoch": 2.4683882457702584, + "ewc_loss": 0.04898675158619881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.449337625876069e-05, + "grad_norm": 31.005756378173828, + "learning_rate": 1e-06, + "loss": 0.3817, + "mean_token_accuracy": 0.8809511065483093, + "num_tokens": 740514703.0, + "step": 19404 + }, + { + "epoch": 2.468515456048849, + "ewc_loss": 0.048946231603622437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4473116354783997e-05, + "grad_norm": 30.949533462524414, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.8821429014205933, + "num_tokens": 740552328.0, + "step": 19405 + }, + { + "epoch": 2.4686426663274394, + "ewc_loss": 0.04897095635533333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4485478206770495e-05, + "grad_norm": 31.034029006958008, + "learning_rate": 1e-06, + "loss": 0.4706, + "mean_token_accuracy": 0.8534461259841919, + "num_tokens": 740593141.0, + "step": 19406 + }, + { + "epoch": 2.4687698766060295, + "ewc_loss": 0.04901733994483948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4508670321665704e-05, + "grad_norm": 30.871978759765625, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.8815133571624756, + "num_tokens": 740634097.0, + "step": 19407 + }, + { + "epoch": 
2.4688970868846205, + "ewc_loss": 0.04888702183961868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4443510483251885e-05, + "grad_norm": 31.022403717041016, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8772691488265991, + "num_tokens": 740672917.0, + "step": 19408 + }, + { + "epoch": 2.4690242971632106, + "ewc_loss": 0.04899075999855995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4495380785083398e-05, + "grad_norm": 30.847427368164062, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8657156229019165, + "num_tokens": 740709351.0, + "step": 19409 + }, + { + "epoch": 2.469151507441801, + "ewc_loss": 0.04895377531647682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4476887119817548e-05, + "grad_norm": 31.05915069580078, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8628809452056885, + "num_tokens": 740744049.0, + "step": 19410 + }, + { + "epoch": 2.4692787177203916, + "ewc_loss": 0.049002956598997116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4501478037564084e-05, + "grad_norm": 30.881425857543945, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8749657869338989, + "num_tokens": 740784765.0, + "step": 19411 + }, + { + "epoch": 2.469405927998982, + "ewc_loss": 0.048943620175123215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.447181032039225e-05, + "grad_norm": 31.020795822143555, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8651360273361206, + "num_tokens": 740815859.0, + "step": 19412 + }, + { + "epoch": 2.4695331382775727, + "ewc_loss": 0.049124300479888916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4562150429119356e-05, + "grad_norm": 31.144641876220703, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.890301525592804, + "num_tokens": 740854300.0, + "step": 19413 + }, + { + "epoch": 2.4696603485561632, + "ewc_loss": 0.04895589500665665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4477947590639815e-05, + "grad_norm": 30.903051376342773, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8555471897125244, + "num_tokens": 740891836.0, + "step": 19414 + }, + { + "epoch": 2.4697875588347538, + "ewc_loss": 0.04902325198054314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4511626179446466e-05, + "grad_norm": 31.0324649810791, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8797222375869751, + "num_tokens": 740927434.0, + "step": 19415 + }, + { + "epoch": 2.4699147691133443, + "ewc_loss": 0.04904816672205925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4524082618881948e-05, + "grad_norm": 31.01297378540039, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8745149970054626, + "num_tokens": 740973707.0, + "step": 19416 + }, + { + "epoch": 2.470041979391935, + "ewc_loss": 0.048930902034044266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4465451133437455e-05, + "grad_norm": 30.979955673217773, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8785552978515625, + "num_tokens": 741015953.0, + "step": 19417 + }, + { + "epoch": 2.4701691896705253, + "ewc_loss": 0.04904891923069954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4524459149688482e-05, + "grad_norm": 31.18071746826172, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8792018890380859, + "num_tokens": 741049634.0, + "step": 19418 + }, + { + "epoch": 2.470296399949116, + "ewc_loss": 0.04899616912007332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4498083803337067e-05, + "grad_norm": 30.895265579223633, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8815629482269287, + "num_tokens": 741084267.0, + "step": 19419 + }, + { + "epoch": 2.4704236102277064, + "ewc_loss": 0.04893241077661514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4466206014039926e-05, + "grad_norm": 31.00351905822754, + "learning_rate": 1e-06, + "loss": 0.4061, + 
"mean_token_accuracy": 0.8785987496376038, + "num_tokens": 741122237.0, + "step": 19420 + }, + { + "epoch": 2.470550820506297, + "ewc_loss": 0.04911458119750023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.455729008943308e-05, + "grad_norm": 31.005184173583984, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.859534502029419, + "num_tokens": 741163593.0, + "step": 19421 + }, + { + "epoch": 2.4706780307848875, + "ewc_loss": 0.048955440521240234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.447772021696437e-05, + "grad_norm": 30.91927146911621, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8616143465042114, + "num_tokens": 741202328.0, + "step": 19422 + }, + { + "epoch": 2.470805241063478, + "ewc_loss": 0.049006879329681396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4503438908141106e-05, + "grad_norm": 30.940044403076172, + "learning_rate": 1e-06, + "loss": 0.3547, + "mean_token_accuracy": 0.8919051289558411, + "num_tokens": 741240095.0, + "step": 19423 + }, + { + "epoch": 2.4709324513420685, + "ewc_loss": 0.04905577003955841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452788430673536e-05, + "grad_norm": 30.967233657836914, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8735601902008057, + "num_tokens": 741278509.0, + "step": 19424 + }, + { + "epoch": 2.471059661620659, + "ewc_loss": 0.049157194793224335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4578597731306218e-05, + "grad_norm": 31.126798629760742, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.867378830909729, + "num_tokens": 741321446.0, + "step": 19425 + }, + { + "epoch": 2.4711868718992496, + "ewc_loss": 0.04898390918970108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4491953809047118e-05, + "grad_norm": 30.948898315429688, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8676718473434448, + "num_tokens": 741360469.0, + "step": 19426 + }, + { + "epoch": 
2.47131408217784, + "ewc_loss": 0.04886889457702637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4434446459054016e-05, + "grad_norm": 30.87362289428711, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8624287843704224, + "num_tokens": 741393833.0, + "step": 19427 + }, + { + "epoch": 2.4714412924564306, + "ewc_loss": 0.049081653356552124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4540826416341588e-05, + "grad_norm": 31.108070373535156, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8773418068885803, + "num_tokens": 741426956.0, + "step": 19428 + }, + { + "epoch": 2.471568502735021, + "ewc_loss": 0.04894368350505829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.447184124321211e-05, + "grad_norm": 30.887910842895508, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8714895248413086, + "num_tokens": 741464820.0, + "step": 19429 + }, + { + "epoch": 2.4716957130136117, + "ewc_loss": 0.049027081578969955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45135415752884e-05, + "grad_norm": 31.10785484313965, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8791158199310303, + "num_tokens": 741499519.0, + "step": 19430 + }, + { + "epoch": 2.471822923292202, + "ewc_loss": 0.049047768115997314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4523884349036962e-05, + "grad_norm": 30.905895233154297, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8782820701599121, + "num_tokens": 741533443.0, + "step": 19431 + }, + { + "epoch": 2.4719501335707923, + "ewc_loss": 0.04897238314151764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4486191250616685e-05, + "grad_norm": 31.012239456176758, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8690259456634521, + "num_tokens": 741576849.0, + "step": 19432 + }, + { + "epoch": 2.4720773438493833, + "ewc_loss": 0.04910049960017204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.455025060044136e-05, + "grad_norm": 31.07217025756836, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8765714168548584, + "num_tokens": 741614653.0, + "step": 19433 + }, + { + "epoch": 2.4722045541279734, + "ewc_loss": 0.049018558114767075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4509279683115892e-05, + "grad_norm": 30.90828514099121, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8677638173103333, + "num_tokens": 741649354.0, + "step": 19434 + }, + { + "epoch": 2.472331764406564, + "ewc_loss": 0.048997633159160614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4498816856066696e-05, + "grad_norm": 31.02199935913086, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8649791479110718, + "num_tokens": 741692257.0, + "step": 19435 + }, + { + "epoch": 2.4724589746851544, + "ewc_loss": 0.04909740015864372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4548700821469538e-05, + "grad_norm": 30.95893669128418, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8686135411262512, + "num_tokens": 741729739.0, + "step": 19436 + }, + { + "epoch": 2.472586184963745, + "ewc_loss": 0.04892762377858162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.446381222398486e-05, + "grad_norm": 30.953792572021484, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8628119230270386, + "num_tokens": 741763409.0, + "step": 19437 + }, + { + "epoch": 2.4727133952423355, + "ewc_loss": 0.04905480146408081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4527400455554016e-05, + "grad_norm": 30.86762237548828, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8690552711486816, + "num_tokens": 741798746.0, + "step": 19438 + }, + { + "epoch": 2.472840605520926, + "ewc_loss": 0.04909924790263176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4549623049097136e-05, + "grad_norm": 30.980072021484375, + "learning_rate": 1e-06, + "loss": 0.3875, + 
"mean_token_accuracy": 0.8804709911346436, + "num_tokens": 741840488.0, + "step": 19439 + }, + { + "epoch": 2.4729678157995165, + "ewc_loss": 0.04915110766887665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4575554562034085e-05, + "grad_norm": 31.047704696655273, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8886750936508179, + "num_tokens": 741880839.0, + "step": 19440 + }, + { + "epoch": 2.473095026078107, + "ewc_loss": 0.04907321184873581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.453660636092536e-05, + "grad_norm": 30.933712005615234, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.865338146686554, + "num_tokens": 741919317.0, + "step": 19441 + }, + { + "epoch": 2.4732222363566976, + "ewc_loss": 0.04907551780343056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.453775960020721e-05, + "grad_norm": 30.98940658569336, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8523505926132202, + "num_tokens": 741955782.0, + "step": 19442 + }, + { + "epoch": 2.473349446635288, + "ewc_loss": 0.049105022102594376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4552511604269966e-05, + "grad_norm": 30.894575119018555, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8660778403282166, + "num_tokens": 741990410.0, + "step": 19443 + }, + { + "epoch": 2.4734766569138786, + "ewc_loss": 0.04916960373520851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4584802304161713e-05, + "grad_norm": 31.036426544189453, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8683943748474121, + "num_tokens": 742031536.0, + "step": 19444 + }, + { + "epoch": 2.473603867192469, + "ewc_loss": 0.049090053886175156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4545026462874375e-05, + "grad_norm": 30.983232498168945, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8609404563903809, + "num_tokens": 742067222.0, + "step": 19445 + }, + { + "epoch": 
2.4737310774710597, + "ewc_loss": 0.04900455102324486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4502274754922837e-05, + "grad_norm": 30.909990310668945, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8833435773849487, + "num_tokens": 742114740.0, + "step": 19446 + }, + { + "epoch": 2.47385828774965, + "ewc_loss": 0.049155205488204956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4577602744102478e-05, + "grad_norm": 30.963655471801758, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8788203001022339, + "num_tokens": 742149344.0, + "step": 19447 + }, + { + "epoch": 2.4739854980282407, + "ewc_loss": 0.04910380393266678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.455190224281978e-05, + "grad_norm": 30.95978355407715, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.873539388179779, + "num_tokens": 742191089.0, + "step": 19448 + }, + { + "epoch": 2.4741127083068313, + "ewc_loss": 0.04913436248898506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4567181753809564e-05, + "grad_norm": 31.097623825073242, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8714333176612854, + "num_tokens": 742224944.0, + "step": 19449 + }, + { + "epoch": 2.474239918585422, + "ewc_loss": 0.049034543335437775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4517272322555073e-05, + "grad_norm": 30.96302032470703, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8617585897445679, + "num_tokens": 742263674.0, + "step": 19450 + }, + { + "epoch": 2.4743671288640123, + "ewc_loss": 0.04912418872117996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4562094040447846e-05, + "grad_norm": 30.93734359741211, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8673226833343506, + "num_tokens": 742300424.0, + "step": 19451 + }, + { + "epoch": 2.474494339142603, + "ewc_loss": 0.049134399741888046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.45671999437036e-05, + "grad_norm": 30.972124099731445, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8819803595542908, + "num_tokens": 742333854.0, + "step": 19452 + }, + { + "epoch": 2.4746215494211934, + "ewc_loss": 0.049064889550209045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.453244451317005e-05, + "grad_norm": 31.08333969116211, + "learning_rate": 1e-06, + "loss": 0.3904, + "mean_token_accuracy": 0.8798952102661133, + "num_tokens": 742377126.0, + "step": 19453 + }, + { + "epoch": 2.474748759699784, + "ewc_loss": 0.04910435900092125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4552178729209118e-05, + "grad_norm": 30.96942710876465, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8698446750640869, + "num_tokens": 742421284.0, + "step": 19454 + }, + { + "epoch": 2.4748759699783744, + "ewc_loss": 0.04894331842660904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4471659344271757e-05, + "grad_norm": 30.95435333251953, + "learning_rate": 1e-06, + "loss": 0.4805, + "mean_token_accuracy": 0.8555079698562622, + "num_tokens": 742459124.0, + "step": 19455 + }, + { + "epoch": 2.475003180256965, + "ewc_loss": 0.04902878403663635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.451439286232926e-05, + "grad_norm": 31.028793334960938, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8624990582466125, + "num_tokens": 742493137.0, + "step": 19456 + }, + { + "epoch": 2.475130390535555, + "ewc_loss": 0.0490640290081501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4532015231670812e-05, + "grad_norm": 30.959802627563477, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8745810985565186, + "num_tokens": 742529805.0, + "step": 19457 + }, + { + "epoch": 2.475257600814146, + "ewc_loss": 0.049000222235918045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.450011197652202e-05, + "grad_norm": 30.998958587646484, + "learning_rate": 1e-06, + "loss": 0.377, + 
"mean_token_accuracy": 0.8837077021598816, + "num_tokens": 742560859.0, + "step": 19458 + }, + { + "epoch": 2.475384811092736, + "ewc_loss": 0.04910073056817055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4550365196773782e-05, + "grad_norm": 31.01510238647461, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8834893703460693, + "num_tokens": 742601465.0, + "step": 19459 + }, + { + "epoch": 2.4755120213713266, + "ewc_loss": 0.04898509010672569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.449254498060327e-05, + "grad_norm": 30.962770462036133, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8661384582519531, + "num_tokens": 742639242.0, + "step": 19460 + }, + { + "epoch": 2.475639231649917, + "ewc_loss": 0.049122728407382965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4561364625697024e-05, + "grad_norm": 31.101247787475586, + "learning_rate": 1e-06, + "loss": 0.4257, + "mean_token_accuracy": 0.8719251155853271, + "num_tokens": 742676574.0, + "step": 19461 + }, + { + "epoch": 2.4757664419285077, + "ewc_loss": 0.04904581233859062, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4522905732737854e-05, + "grad_norm": 31.105607986450195, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8616420030593872, + "num_tokens": 742713326.0, + "step": 19462 + }, + { + "epoch": 2.4758936522070982, + "ewc_loss": 0.04890126734972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445063364575617e-05, + "grad_norm": 30.926889419555664, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8486858010292053, + "num_tokens": 742756883.0, + "step": 19463 + }, + { + "epoch": 2.4760208624856888, + "ewc_loss": 0.04906294867396355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.453147499181796e-05, + "grad_norm": 31.030534744262695, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8679229617118835, + "num_tokens": 742794301.0, + "step": 19464 + }, + { + "epoch": 
2.4761480727642793, + "ewc_loss": 0.048985887318849564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4492943339282647e-05, + "grad_norm": 30.875459671020508, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.874552845954895, + "num_tokens": 742832966.0, + "step": 19465 + }, + { + "epoch": 2.47627528304287, + "ewc_loss": 0.04900512099266052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4502560336259194e-05, + "grad_norm": 30.957473754882812, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8682824373245239, + "num_tokens": 742872926.0, + "step": 19466 + }, + { + "epoch": 2.4764024933214603, + "ewc_loss": 0.04910072311758995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4550361558794975e-05, + "grad_norm": 31.02023696899414, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8786795139312744, + "num_tokens": 742910838.0, + "step": 19467 + }, + { + "epoch": 2.476529703600051, + "ewc_loss": 0.049064356833696365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.453217894071713e-05, + "grad_norm": 30.9954891204834, + "learning_rate": 1e-06, + "loss": 0.3903, + "mean_token_accuracy": 0.8794407844543457, + "num_tokens": 742950098.0, + "step": 19468 + }, + { + "epoch": 2.4766569138786414, + "ewc_loss": 0.04905347526073456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4526738343411125e-05, + "grad_norm": 31.001184463500977, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8833964467048645, + "num_tokens": 742983872.0, + "step": 19469 + }, + { + "epoch": 2.476784124157232, + "ewc_loss": 0.04907212406396866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45360624830937e-05, + "grad_norm": 30.907670974731445, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8610680103302002, + "num_tokens": 743029816.0, + "step": 19470 + }, + { + "epoch": 2.4769113344358225, + "ewc_loss": 0.049069445580244064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4534721887903288e-05, + "grad_norm": 31.008773803710938, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8649562001228333, + "num_tokens": 743068565.0, + "step": 19471 + }, + { + "epoch": 2.477038544714413, + "ewc_loss": 0.049123525619506836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45617629843764e-05, + "grad_norm": 30.954010009765625, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8731270432472229, + "num_tokens": 743112137.0, + "step": 19472 + }, + { + "epoch": 2.4771657549930035, + "ewc_loss": 0.04901920258998871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.450960164424032e-05, + "grad_norm": 30.93117332458496, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8860976696014404, + "num_tokens": 743145067.0, + "step": 19473 + }, + { + "epoch": 2.477292965271594, + "ewc_loss": 0.0490303561091423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.451517866575159e-05, + "grad_norm": 31.01219940185547, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8769059181213379, + "num_tokens": 743177017.0, + "step": 19474 + }, + { + "epoch": 2.4774201755501846, + "ewc_loss": 0.049191318452358246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4595659851911478e-05, + "grad_norm": 31.0011043548584, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8715054392814636, + "num_tokens": 743218121.0, + "step": 19475 + }, + { + "epoch": 2.477547385828775, + "ewc_loss": 0.04902738705277443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4513694370398298e-05, + "grad_norm": 30.849882125854492, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8783853054046631, + "num_tokens": 743256635.0, + "step": 19476 + }, + { + "epoch": 2.4776745961073656, + "ewc_loss": 0.04910041391849518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4550206944695674e-05, + "grad_norm": 31.01117515563965, + "learning_rate": 1e-06, + "loss": 0.4304, + 
"mean_token_accuracy": 0.8698444366455078, + "num_tokens": 743293801.0, + "step": 19477 + }, + { + "epoch": 2.477801806385956, + "ewc_loss": 0.04907887056469917, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4539434889447875e-05, + "grad_norm": 30.92022705078125, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8851699829101562, + "num_tokens": 743331582.0, + "step": 19478 + }, + { + "epoch": 2.4779290166645467, + "ewc_loss": 0.04898503050208092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4492515876772813e-05, + "grad_norm": 30.919578552246094, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8741369247436523, + "num_tokens": 743369901.0, + "step": 19479 + }, + { + "epoch": 2.4780562269431368, + "ewc_loss": 0.04903507977724075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4517539713997394e-05, + "grad_norm": 30.908823013305664, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8683186173439026, + "num_tokens": 743405808.0, + "step": 19480 + }, + { + "epoch": 2.4781834372217277, + "ewc_loss": 0.04913591593503952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4567958462284878e-05, + "grad_norm": 31.027050018310547, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8736530542373657, + "num_tokens": 743447236.0, + "step": 19481 + }, + { + "epoch": 2.478310647500318, + "ewc_loss": 0.049124497920274734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4562248654547147e-05, + "grad_norm": 30.968795776367188, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8793066740036011, + "num_tokens": 743486494.0, + "step": 19482 + }, + { + "epoch": 2.4784378577789083, + "ewc_loss": 0.04909244552254677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.454622335790191e-05, + "grad_norm": 31.150991439819336, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8684456944465637, + "num_tokens": 743523941.0, + "step": 19483 + }, + { + "epoch": 
2.478565068057499, + "ewc_loss": 0.04916425794363022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45821283897385e-05, + "grad_norm": 30.99673080444336, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8760654330253601, + "num_tokens": 743554169.0, + "step": 19484 + }, + { + "epoch": 2.4786922783360894, + "ewc_loss": 0.049003299325704575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4501649022568017e-05, + "grad_norm": 30.93114471435547, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8563781976699829, + "num_tokens": 743594656.0, + "step": 19485 + }, + { + "epoch": 2.47881948861468, + "ewc_loss": 0.04911171644926071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.455585854477249e-05, + "grad_norm": 31.025197982788086, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8788992166519165, + "num_tokens": 743628638.0, + "step": 19486 + }, + { + "epoch": 2.4789466988932705, + "ewc_loss": 0.049048200249671936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4524100808775984e-05, + "grad_norm": 30.861553192138672, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8634830117225647, + "num_tokens": 743665382.0, + "step": 19487 + }, + { + "epoch": 2.479073909171861, + "ewc_loss": 0.04904387891292572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452193984936457e-05, + "grad_norm": 31.006610870361328, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8844033479690552, + "num_tokens": 743701712.0, + "step": 19488 + }, + { + "epoch": 2.4792011194504515, + "ewc_loss": 0.04908603057265282, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4543014660594054e-05, + "grad_norm": 30.888681411743164, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8721430897712708, + "num_tokens": 743740347.0, + "step": 19489 + }, + { + "epoch": 2.479328329729042, + "ewc_loss": 0.04903598129749298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4517990823369473e-05, + "grad_norm": 30.998348236083984, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8737334609031677, + "num_tokens": 743775434.0, + "step": 19490 + }, + { + "epoch": 2.4794555400076326, + "ewc_loss": 0.04921354353427887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.460677205817774e-05, + "grad_norm": 31.04532241821289, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8701068162918091, + "num_tokens": 743812475.0, + "step": 19491 + }, + { + "epoch": 2.479582750286223, + "ewc_loss": 0.04908230900764465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4541153834434226e-05, + "grad_norm": 30.992143630981445, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8627817630767822, + "num_tokens": 743851350.0, + "step": 19492 + }, + { + "epoch": 2.4797099605648136, + "ewc_loss": 0.04911438003182411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4557190045015886e-05, + "grad_norm": 30.998611450195312, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8694154620170593, + "num_tokens": 743885086.0, + "step": 19493 + }, + { + "epoch": 2.479837170843404, + "ewc_loss": 0.04908343032002449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4541715902159922e-05, + "grad_norm": 31.031736373901367, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8750302195549011, + "num_tokens": 743931747.0, + "step": 19494 + }, + { + "epoch": 2.4799643811219947, + "ewc_loss": 0.04908505454659462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4542527171433903e-05, + "grad_norm": 30.946643829345703, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8845047950744629, + "num_tokens": 743971806.0, + "step": 19495 + }, + { + "epoch": 2.480091591400585, + "ewc_loss": 0.04909340664744377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4546703571104445e-05, + "grad_norm": 31.061614990234375, + "learning_rate": 1e-06, + "loss": 0.386, + 
"mean_token_accuracy": 0.8825885057449341, + "num_tokens": 744005304.0, + "step": 19496 + }, + { + "epoch": 2.4802188016791757, + "ewc_loss": 0.049183815717697144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4591907276771963e-05, + "grad_norm": 31.0865535736084, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8629720211029053, + "num_tokens": 744051159.0, + "step": 19497 + }, + { + "epoch": 2.4803460119577663, + "ewc_loss": 0.0490521565079689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4526078050257638e-05, + "grad_norm": 31.013452529907227, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8547207117080688, + "num_tokens": 744089856.0, + "step": 19498 + }, + { + "epoch": 2.480473222236357, + "ewc_loss": 0.049085382372140884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4542690880480222e-05, + "grad_norm": 31.004819869995117, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8523788452148438, + "num_tokens": 744136355.0, + "step": 19499 + }, + { + "epoch": 2.4806004325149473, + "ewc_loss": 0.049134500324726105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45672508754069e-05, + "grad_norm": 31.08419418334961, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8611495494842529, + "num_tokens": 744177682.0, + "step": 19500 + }, + { + "epoch": 2.480727642793538, + "ewc_loss": 0.049038004130125046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4519002181477845e-05, + "grad_norm": 31.001798629760742, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8737294673919678, + "num_tokens": 744214209.0, + "step": 19501 + }, + { + "epoch": 2.4808548530721284, + "ewc_loss": 0.049068499356508255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4534248950658366e-05, + "grad_norm": 31.02215003967285, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.855114221572876, + "num_tokens": 744257923.0, + "step": 19502 + }, + { + "epoch": 
2.480982063350719, + "ewc_loss": 0.049167755991220474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458387825754471e-05, + "grad_norm": 31.13051414489746, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8782514333724976, + "num_tokens": 744293370.0, + "step": 19503 + }, + { + "epoch": 2.4811092736293094, + "ewc_loss": 0.04908820241689682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.454410059726797e-05, + "grad_norm": 30.997623443603516, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8775891661643982, + "num_tokens": 744328572.0, + "step": 19504 + }, + { + "epoch": 2.4812364839078995, + "ewc_loss": 0.04905245080590248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4526225388399325e-05, + "grad_norm": 31.01955795288086, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8557454347610474, + "num_tokens": 744364124.0, + "step": 19505 + }, + { + "epoch": 2.4813636941864905, + "ewc_loss": 0.049090225249528885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4545111955376342e-05, + "grad_norm": 30.96126937866211, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8753501772880554, + "num_tokens": 744403679.0, + "step": 19506 + }, + { + "epoch": 2.4814909044650806, + "ewc_loss": 0.04896104708313942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4480523279635236e-05, + "grad_norm": 31.007671356201172, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8717597723007202, + "num_tokens": 744440922.0, + "step": 19507 + }, + { + "epoch": 2.481618114743671, + "ewc_loss": 0.04912178963422775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456089532643091e-05, + "grad_norm": 31.110389709472656, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8687892556190491, + "num_tokens": 744479025.0, + "step": 19508 + }, + { + "epoch": 2.4817453250222616, + "ewc_loss": 0.04902190715074539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4510953153367154e-05, + "grad_norm": 31.019508361816406, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8732726573944092, + "num_tokens": 744514165.0, + "step": 19509 + }, + { + "epoch": 2.481872535300852, + "ewc_loss": 0.04902148246765137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.451074033160694e-05, + "grad_norm": 30.95355224609375, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8662076592445374, + "num_tokens": 744547210.0, + "step": 19510 + }, + { + "epoch": 2.4819997455794427, + "ewc_loss": 0.04905494302511215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4527471396140754e-05, + "grad_norm": 31.027738571166992, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.872481107711792, + "num_tokens": 744585132.0, + "step": 19511 + }, + { + "epoch": 2.4821269558580332, + "ewc_loss": 0.048996493220329285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4498247512383386e-05, + "grad_norm": 31.06985855102539, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8821132183074951, + "num_tokens": 744617921.0, + "step": 19512 + }, + { + "epoch": 2.4822541661366238, + "ewc_loss": 0.04905920848250389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4529605070711114e-05, + "grad_norm": 31.08812141418457, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.8672583103179932, + "num_tokens": 744653095.0, + "step": 19513 + }, + { + "epoch": 2.4823813764152143, + "ewc_loss": 0.048963554203510284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.448177656333428e-05, + "grad_norm": 30.992034912109375, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8780362606048584, + "num_tokens": 744688258.0, + "step": 19514 + }, + { + "epoch": 2.482508586693805, + "ewc_loss": 0.04906483367085457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4532417228328995e-05, + "grad_norm": 31.187734603881836, + "learning_rate": 1e-06, + "loss": 0.4193, + 
"mean_token_accuracy": 0.870449423789978, + "num_tokens": 744723500.0, + "step": 19515 + }, + { + "epoch": 2.4826357969723953, + "ewc_loss": 0.04911414161324501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4557069991715252e-05, + "grad_norm": 30.972177505493164, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8721902966499329, + "num_tokens": 744757133.0, + "step": 19516 + }, + { + "epoch": 2.482763007250986, + "ewc_loss": 0.048905279487371445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.445263999106828e-05, + "grad_norm": 30.94029426574707, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8845731019973755, + "num_tokens": 744793814.0, + "step": 19517 + }, + { + "epoch": 2.4828902175295764, + "ewc_loss": 0.04910597577691078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4552988179493695e-05, + "grad_norm": 31.111263275146484, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8759392499923706, + "num_tokens": 744828680.0, + "step": 19518 + }, + { + "epoch": 2.483017427808167, + "ewc_loss": 0.04915367811918259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.457683876855299e-05, + "grad_norm": 31.143596649169922, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8673436045646667, + "num_tokens": 744861642.0, + "step": 19519 + }, + { + "epoch": 2.4831446380867574, + "ewc_loss": 0.04896296560764313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4481483706040308e-05, + "grad_norm": 30.991283416748047, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8824037313461304, + "num_tokens": 744900345.0, + "step": 19520 + }, + { + "epoch": 2.483271848365348, + "ewc_loss": 0.049070265144109726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.453513297950849e-05, + "grad_norm": 31.018985748291016, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8782551884651184, + "num_tokens": 744938589.0, + "step": 19521 + }, + { + "epoch": 
2.4833990586439385, + "ewc_loss": 0.049132224172353745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456611218804028e-05, + "grad_norm": 31.158084869384766, + "learning_rate": 1e-06, + "loss": 0.4681, + "mean_token_accuracy": 0.8624703288078308, + "num_tokens": 744975385.0, + "step": 19522 + }, + { + "epoch": 2.483526268922529, + "ewc_loss": 0.04911048710346222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4555243726354092e-05, + "grad_norm": 31.087078094482422, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8616026639938354, + "num_tokens": 745010860.0, + "step": 19523 + }, + { + "epoch": 2.4836534792011196, + "ewc_loss": 0.04903067648410797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4515338736819103e-05, + "grad_norm": 30.975786209106445, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.86409592628479, + "num_tokens": 745047183.0, + "step": 19524 + }, + { + "epoch": 2.48378068947971, + "ewc_loss": 0.04908278211951256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4541390303056687e-05, + "grad_norm": 30.989097595214844, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8679971694946289, + "num_tokens": 745081906.0, + "step": 19525 + }, + { + "epoch": 2.4839078997583006, + "ewc_loss": 0.04910142719745636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4550714442739263e-05, + "grad_norm": 30.95281982421875, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8767701983451843, + "num_tokens": 745127285.0, + "step": 19526 + }, + { + "epoch": 2.484035110036891, + "ewc_loss": 0.04921448230743408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4607241357443854e-05, + "grad_norm": 31.113452911376953, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8868628740310669, + "num_tokens": 745171564.0, + "step": 19527 + }, + { + "epoch": 2.4841623203154817, + "ewc_loss": 0.049127545207738876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.456377296766732e-05, + "grad_norm": 30.92403221130371, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8819103837013245, + "num_tokens": 745214252.0, + "step": 19528 + }, + { + "epoch": 2.484289530594072, + "ewc_loss": 0.04900623485445976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.450311694701668e-05, + "grad_norm": 31.008760452270508, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8739714622497559, + "num_tokens": 745251983.0, + "step": 19529 + }, + { + "epoch": 2.4844167408726623, + "ewc_loss": 0.04917014762759209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458507333358284e-05, + "grad_norm": 30.998641967773438, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8737386465072632, + "num_tokens": 745287037.0, + "step": 19530 + }, + { + "epoch": 2.4845439511512533, + "ewc_loss": 0.049102816730737686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4551407477702014e-05, + "grad_norm": 30.93832015991211, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8694911003112793, + "num_tokens": 745327854.0, + "step": 19531 + }, + { + "epoch": 2.4846711614298433, + "ewc_loss": 0.04924971982836723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46248600888066e-05, + "grad_norm": 31.09604835510254, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8602992296218872, + "num_tokens": 745370227.0, + "step": 19532 + }, + { + "epoch": 2.484798371708434, + "ewc_loss": 0.04914550483226776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4572753318352625e-05, + "grad_norm": 31.0954532623291, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8693537712097168, + "num_tokens": 745409416.0, + "step": 19533 + }, + { + "epoch": 2.4849255819870244, + "ewc_loss": 0.049164608120918274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4582304831710644e-05, + "grad_norm": 31.10499382019043, + "learning_rate": 1e-06, + "loss": 0.4802, + 
"mean_token_accuracy": 0.8548958897590637, + "num_tokens": 745449908.0, + "step": 19534 + }, + { + "epoch": 2.485052792265615, + "ewc_loss": 0.04910251498222351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4551258320570923e-05, + "grad_norm": 31.035329818725586, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8602144718170166, + "num_tokens": 745486384.0, + "step": 19535 + }, + { + "epoch": 2.4851800025442055, + "ewc_loss": 0.04914849251508713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4574246708652936e-05, + "grad_norm": 31.084455490112305, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8597087860107422, + "num_tokens": 745523058.0, + "step": 19536 + }, + { + "epoch": 2.485307212822796, + "ewc_loss": 0.04907473549246788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4537368517485447e-05, + "grad_norm": 31.09874725341797, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8545240163803101, + "num_tokens": 745559276.0, + "step": 19537 + }, + { + "epoch": 2.4854344231013865, + "ewc_loss": 0.04903537780046463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4517688871128485e-05, + "grad_norm": 30.94035530090332, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8698993921279907, + "num_tokens": 745592726.0, + "step": 19538 + }, + { + "epoch": 2.485561633379977, + "ewc_loss": 0.04906065762042999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4530329028493725e-05, + "grad_norm": 30.921907424926758, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8732906579971313, + "num_tokens": 745630117.0, + "step": 19539 + }, + { + "epoch": 2.4856888436585676, + "ewc_loss": 0.04923951253294945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461975600454025e-05, + "grad_norm": 31.05974769592285, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8583139777183533, + "num_tokens": 745664968.0, + "step": 19540 + }, + { + "epoch": 
2.485816053937158, + "ewc_loss": 0.04917960241436958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4589800887042657e-05, + "grad_norm": 30.971160888671875, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8734208941459656, + "num_tokens": 745701893.0, + "step": 19541 + }, + { + "epoch": 2.4859432642157486, + "ewc_loss": 0.049178749322891235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4589375243522227e-05, + "grad_norm": 30.948823928833008, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8686904311180115, + "num_tokens": 745736884.0, + "step": 19542 + }, + { + "epoch": 2.486070474494339, + "ewc_loss": 0.049142803996801376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.457140180922579e-05, + "grad_norm": 30.990381240844727, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8838402032852173, + "num_tokens": 745782381.0, + "step": 19543 + }, + { + "epoch": 2.4861976847729297, + "ewc_loss": 0.04908763989806175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4543820472899824e-05, + "grad_norm": 30.87683868408203, + "learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.8870924115180969, + "num_tokens": 745827630.0, + "step": 19544 + }, + { + "epoch": 2.48632489505152, + "ewc_loss": 0.04913727194070816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4568635126342997e-05, + "grad_norm": 30.967512130737305, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8790154457092285, + "num_tokens": 745862223.0, + "step": 19545 + }, + { + "epoch": 2.4864521053301107, + "ewc_loss": 0.049175504595041275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4587752704974264e-05, + "grad_norm": 30.938251495361328, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8872342109680176, + "num_tokens": 745903077.0, + "step": 19546 + }, + { + "epoch": 2.4865793156087013, + "ewc_loss": 0.0492502897977829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4625145670142956e-05, + "grad_norm": 30.975669860839844, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8715811967849731, + "num_tokens": 745945534.0, + "step": 19547 + }, + { + "epoch": 2.486706525887292, + "ewc_loss": 0.049180034548044205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.459001734678168e-05, + "grad_norm": 31.105215072631836, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8552956581115723, + "num_tokens": 745986308.0, + "step": 19548 + }, + { + "epoch": 2.4868337361658823, + "ewc_loss": 0.04917869716882706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4589347958681174e-05, + "grad_norm": 30.886043548583984, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8633893728256226, + "num_tokens": 746022206.0, + "step": 19549 + }, + { + "epoch": 2.486960946444473, + "ewc_loss": 0.04905308037996292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452654007356614e-05, + "grad_norm": 31.039709091186523, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8783838748931885, + "num_tokens": 746059854.0, + "step": 19550 + }, + { + "epoch": 2.4870881567230634, + "ewc_loss": 0.049203142523765564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4601571567473002e-05, + "grad_norm": 31.009374618530273, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8794714212417603, + "num_tokens": 746103994.0, + "step": 19551 + }, + { + "epoch": 2.487215367001654, + "ewc_loss": 0.04909348115324974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4546739950892515e-05, + "grad_norm": 31.078350067138672, + "learning_rate": 1e-06, + "loss": 0.4633, + "mean_token_accuracy": 0.8610541820526123, + "num_tokens": 746138491.0, + "step": 19552 + }, + { + "epoch": 2.4873425772802444, + "ewc_loss": 0.0491650365293026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458251765347086e-05, + "grad_norm": 30.939807891845703, + "learning_rate": 1e-06, + "loss": 0.4264, + 
"mean_token_accuracy": 0.8693221211433411, + "num_tokens": 746172598.0, + "step": 19553 + }, + { + "epoch": 2.487469787558835, + "ewc_loss": 0.04907350242137909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4536751880077645e-05, + "grad_norm": 31.028217315673828, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8792027831077576, + "num_tokens": 746210385.0, + "step": 19554 + }, + { + "epoch": 2.487596997837425, + "ewc_loss": 0.04918532446026802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4592662157374434e-05, + "grad_norm": 30.949462890625, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8740208148956299, + "num_tokens": 746252563.0, + "step": 19555 + }, + { + "epoch": 2.487724208116016, + "ewc_loss": 0.04901750758290291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4508753995178267e-05, + "grad_norm": 30.957250595092773, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8777018189430237, + "num_tokens": 746292356.0, + "step": 19556 + }, + { + "epoch": 2.487851418394606, + "ewc_loss": 0.049138087779283524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4569044398958795e-05, + "grad_norm": 30.91550064086914, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8608065247535706, + "num_tokens": 746325467.0, + "step": 19557 + }, + { + "epoch": 2.4879786286731966, + "ewc_loss": 0.04915173351764679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4575867428211495e-05, + "grad_norm": 30.929401397705078, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8664323687553406, + "num_tokens": 746362285.0, + "step": 19558 + }, + { + "epoch": 2.488105838951787, + "ewc_loss": 0.049130167812108994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4565084459027275e-05, + "grad_norm": 30.960933685302734, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.87847900390625, + "num_tokens": 746401540.0, + "step": 19559 + }, + { + "epoch": 
2.4882330492303777, + "ewc_loss": 0.04918994382023811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4594972273916937e-05, + "grad_norm": 31.005233764648438, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8726624846458435, + "num_tokens": 746439282.0, + "step": 19560 + }, + { + "epoch": 2.488360259508968, + "ewc_loss": 0.04916353151202202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4581766410847194e-05, + "grad_norm": 30.992753982543945, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.882464587688446, + "num_tokens": 746480993.0, + "step": 19561 + }, + { + "epoch": 2.4884874697875587, + "ewc_loss": 0.04912998154759407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456498987157829e-05, + "grad_norm": 30.924455642700195, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8686561584472656, + "num_tokens": 746519318.0, + "step": 19562 + }, + { + "epoch": 2.4886146800661493, + "ewc_loss": 0.049125444144010544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456272159179207e-05, + "grad_norm": 31.03810691833496, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8578752875328064, + "num_tokens": 746559761.0, + "step": 19563 + }, + { + "epoch": 2.48874189034474, + "ewc_loss": 0.0492473766207695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4623688659630716e-05, + "grad_norm": 30.943939208984375, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.880127489566803, + "num_tokens": 746591622.0, + "step": 19564 + }, + { + "epoch": 2.4888691006233303, + "ewc_loss": 0.04917031526565552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4585157007095404e-05, + "grad_norm": 31.01896095275879, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8690783977508545, + "num_tokens": 746627709.0, + "step": 19565 + }, + { + "epoch": 2.488996310901921, + "ewc_loss": 0.04921095818281174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4605478756711818e-05, + "grad_norm": 30.996580123901367, + "learning_rate": 1e-06, + "loss": 0.4355, + "mean_token_accuracy": 0.8684513568878174, + "num_tokens": 746663445.0, + "step": 19566 + }, + { + "epoch": 2.4891235211805114, + "ewc_loss": 0.04919363558292389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4596818548161536e-05, + "grad_norm": 31.068180084228516, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8595006465911865, + "num_tokens": 746706869.0, + "step": 19567 + }, + { + "epoch": 2.489250731459102, + "ewc_loss": 0.04920351132750511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.460175528540276e-05, + "grad_norm": 30.943132400512695, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.883007287979126, + "num_tokens": 746749502.0, + "step": 19568 + }, + { + "epoch": 2.4893779417376924, + "ewc_loss": 0.04905299097299576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.452649459883105e-05, + "grad_norm": 30.931575775146484, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8745930194854736, + "num_tokens": 746786510.0, + "step": 19569 + }, + { + "epoch": 2.489505152016283, + "ewc_loss": 0.049222931265830994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4611465050838888e-05, + "grad_norm": 30.966766357421875, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8765121698379517, + "num_tokens": 746819430.0, + "step": 19570 + }, + { + "epoch": 2.4896323622948735, + "ewc_loss": 0.04925844445824623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4629222025396302e-05, + "grad_norm": 31.05019187927246, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8747053742408752, + "num_tokens": 746856018.0, + "step": 19571 + }, + { + "epoch": 2.489759572573464, + "ewc_loss": 0.049168672412633896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4584336642874405e-05, + "grad_norm": 30.99069595336914, + "learning_rate": 1e-06, + "loss": 0.4282, + 
"mean_token_accuracy": 0.865725040435791, + "num_tokens": 746897350.0, + "step": 19572 + }, + { + "epoch": 2.4898867828520546, + "ewc_loss": 0.049193620681762695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4596811272203922e-05, + "grad_norm": 30.986515045166016, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8654681444168091, + "num_tokens": 746937163.0, + "step": 19573 + }, + { + "epoch": 2.490013993130645, + "ewc_loss": 0.04920893535017967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4604467398603447e-05, + "grad_norm": 30.99383544921875, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8815001845359802, + "num_tokens": 746976140.0, + "step": 19574 + }, + { + "epoch": 2.4901412034092356, + "ewc_loss": 0.04924014210700989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4620070689707063e-05, + "grad_norm": 30.977046966552734, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8721063137054443, + "num_tokens": 747012576.0, + "step": 19575 + }, + { + "epoch": 2.490268413687826, + "ewc_loss": 0.04912970960140228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456485526636243e-05, + "grad_norm": 30.976940155029297, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8699349164962769, + "num_tokens": 747048179.0, + "step": 19576 + }, + { + "epoch": 2.4903956239664167, + "ewc_loss": 0.04924856126308441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4624279831186868e-05, + "grad_norm": 30.991607666015625, + "learning_rate": 1e-06, + "loss": 0.4695, + "mean_token_accuracy": 0.8536744117736816, + "num_tokens": 747085291.0, + "step": 19577 + }, + { + "epoch": 2.4905228342450068, + "ewc_loss": 0.049145929515361786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4572964321123436e-05, + "grad_norm": 30.872133255004883, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8782209157943726, + "num_tokens": 747119085.0, + "step": 19578 + }, + { + "epoch": 
2.4906500445235977, + "ewc_loss": 0.049202755093574524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4601376935606822e-05, + "grad_norm": 31.012157440185547, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8708298206329346, + "num_tokens": 747153463.0, + "step": 19579 + }, + { + "epoch": 2.490777254802188, + "ewc_loss": 0.0491974912583828, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.459874485793989e-05, + "grad_norm": 30.937517166137695, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 0.8461740016937256, + "num_tokens": 747194781.0, + "step": 19580 + }, + { + "epoch": 2.4909044650807783, + "ewc_loss": 0.049149807542562485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4574903363827616e-05, + "grad_norm": 31.001808166503906, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8730162382125854, + "num_tokens": 747235328.0, + "step": 19581 + }, + { + "epoch": 2.491031675359369, + "ewc_loss": 0.04928046092391014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4640230549266562e-05, + "grad_norm": 30.976703643798828, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8694571256637573, + "num_tokens": 747278481.0, + "step": 19582 + }, + { + "epoch": 2.4911588856379594, + "ewc_loss": 0.0491630882024765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4581544494139962e-05, + "grad_norm": 31.010236740112305, + "learning_rate": 1e-06, + "loss": 0.3707, + "mean_token_accuracy": 0.8855161666870117, + "num_tokens": 747313077.0, + "step": 19583 + }, + { + "epoch": 2.49128609591655, + "ewc_loss": 0.04929167777299881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4645838493597694e-05, + "grad_norm": 31.017683029174805, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8783510327339172, + "num_tokens": 747352693.0, + "step": 19584 + }, + { + "epoch": 2.4914133061951405, + "ewc_loss": 0.04917764291167259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.458882227074355e-05, + "grad_norm": 31.025747299194336, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8636865615844727, + "num_tokens": 747397717.0, + "step": 19585 + }, + { + "epoch": 2.491540516473731, + "ewc_loss": 0.049167878925800323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4583940103184432e-05, + "grad_norm": 30.943349838256836, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8748621940612793, + "num_tokens": 747435457.0, + "step": 19586 + }, + { + "epoch": 2.4916677267523215, + "ewc_loss": 0.04916750267148018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4583750928286463e-05, + "grad_norm": 31.170764923095703, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8757556080818176, + "num_tokens": 747479017.0, + "step": 19587 + }, + { + "epoch": 2.491794937030912, + "ewc_loss": 0.04921998828649521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.460999348841142e-05, + "grad_norm": 31.009807586669922, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8713641166687012, + "num_tokens": 747520247.0, + "step": 19588 + }, + { + "epoch": 2.4919221473095026, + "ewc_loss": 0.04913799464702606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4568997105234303e-05, + "grad_norm": 31.08930778503418, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8577324151992798, + "num_tokens": 747559983.0, + "step": 19589 + }, + { + "epoch": 2.492049357588093, + "ewc_loss": 0.049126338213682175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456316906318534e-05, + "grad_norm": 31.05612564086914, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8665193319320679, + "num_tokens": 747595746.0, + "step": 19590 + }, + { + "epoch": 2.4921765678666836, + "ewc_loss": 0.04911858215928078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4559290977776982e-05, + "grad_norm": 30.91872215270996, + "learning_rate": 1e-06, + "loss": 0.434, + 
"mean_token_accuracy": 0.8692426085472107, + "num_tokens": 747639261.0, + "step": 19591 + }, + { + "epoch": 2.492303778145274, + "ewc_loss": 0.04917542636394501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4587712687207386e-05, + "grad_norm": 31.03450584411621, + "learning_rate": 1e-06, + "loss": 0.4749, + "mean_token_accuracy": 0.8530038595199585, + "num_tokens": 747680665.0, + "step": 19592 + }, + { + "epoch": 2.4924309884238647, + "ewc_loss": 0.04925534874200821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4627674065413885e-05, + "grad_norm": 31.07752799987793, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.875383198261261, + "num_tokens": 747713320.0, + "step": 19593 + }, + { + "epoch": 2.492558198702455, + "ewc_loss": 0.04910201206803322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.455100548104383e-05, + "grad_norm": 30.92267608642578, + "learning_rate": 1e-06, + "loss": 0.5133, + "mean_token_accuracy": 0.844864010810852, + "num_tokens": 747752167.0, + "step": 19594 + }, + { + "epoch": 2.4926854089810457, + "ewc_loss": 0.04914048686623573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.457024311297573e-05, + "grad_norm": 31.013235092163086, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8766655921936035, + "num_tokens": 747787274.0, + "step": 19595 + }, + { + "epoch": 2.4928126192596363, + "ewc_loss": 0.049261532723903656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4630766347399913e-05, + "grad_norm": 31.049379348754883, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.865533709526062, + "num_tokens": 747826647.0, + "step": 19596 + }, + { + "epoch": 2.492939829538227, + "ewc_loss": 0.04915973171591759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.457986556692049e-05, + "grad_norm": 30.968002319335938, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8670633435249329, + "num_tokens": 747866797.0, + "step": 19597 + }, + { + "epoch": 
2.4930670398168173, + "ewc_loss": 0.04915999993681908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458000017213635e-05, + "grad_norm": 30.965620040893555, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8733142018318176, + "num_tokens": 747907589.0, + "step": 19598 + }, + { + "epoch": 2.493194250095408, + "ewc_loss": 0.04925563186407089, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.462781594658736e-05, + "grad_norm": 31.04297637939453, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8629832863807678, + "num_tokens": 747939865.0, + "step": 19599 + }, + { + "epoch": 2.4933214603739984, + "ewc_loss": 0.04926078021526337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463038981659338e-05, + "grad_norm": 31.149188995361328, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8686752915382385, + "num_tokens": 747976238.0, + "step": 19600 + }, + { + "epoch": 2.493448670652589, + "ewc_loss": 0.0491986908018589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.459934512444306e-05, + "grad_norm": 31.081106185913086, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8681125640869141, + "num_tokens": 748012076.0, + "step": 19601 + }, + { + "epoch": 2.4935758809311794, + "ewc_loss": 0.049280088394880295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46400450123474e-05, + "grad_norm": 31.139450073242188, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8654310703277588, + "num_tokens": 748042518.0, + "step": 19602 + }, + { + "epoch": 2.4937030912097695, + "ewc_loss": 0.049176380038261414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4588189262431115e-05, + "grad_norm": 31.159961700439453, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.8783862590789795, + "num_tokens": 748077509.0, + "step": 19603 + }, + { + "epoch": 2.4938303014883605, + "ewc_loss": 0.04921602085232735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4608010789961554e-05, + "grad_norm": 31.058549880981445, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8771588206291199, + "num_tokens": 748118164.0, + "step": 19604 + }, + { + "epoch": 2.4939575117669506, + "ewc_loss": 0.04916570335626602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458285234752111e-05, + "grad_norm": 31.04566192626953, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8644484281539917, + "num_tokens": 748153609.0, + "step": 19605 + }, + { + "epoch": 2.494084722045541, + "ewc_loss": 0.04912584275007248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456292168062646e-05, + "grad_norm": 30.9080753326416, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8692268133163452, + "num_tokens": 748189573.0, + "step": 19606 + }, + { + "epoch": 2.4942119323241316, + "ewc_loss": 0.04913555085659027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456777474435512e-05, + "grad_norm": 30.994874954223633, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8655492067337036, + "num_tokens": 748231121.0, + "step": 19607 + }, + { + "epoch": 2.494339142602722, + "ewc_loss": 0.04910126328468323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45506307692267e-05, + "grad_norm": 31.012182235717773, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8822910785675049, + "num_tokens": 748268744.0, + "step": 19608 + }, + { + "epoch": 2.4944663528813127, + "ewc_loss": 0.04925251752138138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4626258891657926e-05, + "grad_norm": 31.02956771850586, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8759522438049316, + "num_tokens": 748307441.0, + "step": 19609 + }, + { + "epoch": 2.494593563159903, + "ewc_loss": 0.04922880604863167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4614402718725614e-05, + "grad_norm": 31.07708740234375, + "learning_rate": 1e-06, + "loss": 0.4198, + 
"mean_token_accuracy": 0.8710484504699707, + "num_tokens": 748343506.0, + "step": 19610 + }, + { + "epoch": 2.4947207734384937, + "ewc_loss": 0.04927593097090721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4637965907459147e-05, + "grad_norm": 30.918684005737305, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8719255924224854, + "num_tokens": 748382209.0, + "step": 19611 + }, + { + "epoch": 2.4948479837170843, + "ewc_loss": 0.04917411878705025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4587059670011513e-05, + "grad_norm": 31.00787925720215, + "learning_rate": 1e-06, + "loss": 0.3524, + "mean_token_accuracy": 0.8896907567977905, + "num_tokens": 748413660.0, + "step": 19612 + }, + { + "epoch": 2.494975193995675, + "ewc_loss": 0.0492287240922451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4614362700958736e-05, + "grad_norm": 30.967592239379883, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8718507289886475, + "num_tokens": 748457037.0, + "step": 19613 + }, + { + "epoch": 2.4951024042742653, + "ewc_loss": 0.04926318675279617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4631593987578526e-05, + "grad_norm": 30.969575881958008, + "learning_rate": 1e-06, + "loss": 0.4909, + "mean_token_accuracy": 0.8504546880722046, + "num_tokens": 748497227.0, + "step": 19614 + }, + { + "epoch": 2.495229614552856, + "ewc_loss": 0.049292612820863724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4646305973874405e-05, + "grad_norm": 30.997995376586914, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8680760860443115, + "num_tokens": 748529901.0, + "step": 19615 + }, + { + "epoch": 2.4953568248314464, + "ewc_loss": 0.049369439482688904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4684719392098486e-05, + "grad_norm": 31.004688262939453, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8574036359786987, + "num_tokens": 748568238.0, + "step": 19616 + }, + { + "epoch": 
2.495484035110037, + "ewc_loss": 0.049290627241134644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464531280566007e-05, + "grad_norm": 31.011919021606445, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8606798648834229, + "num_tokens": 748608153.0, + "step": 19617 + }, + { + "epoch": 2.4956112453886274, + "ewc_loss": 0.0493515282869339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.467576450726483e-05, + "grad_norm": 31.010080337524414, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.867131233215332, + "num_tokens": 748646977.0, + "step": 19618 + }, + { + "epoch": 2.495738455667218, + "ewc_loss": 0.049284689128398895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4642344214953482e-05, + "grad_norm": 30.996051788330078, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8616678714752197, + "num_tokens": 748689586.0, + "step": 19619 + }, + { + "epoch": 2.4958656659458085, + "ewc_loss": 0.049314532428979874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.465726538503077e-05, + "grad_norm": 30.928632736206055, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8700603246688843, + "num_tokens": 748730915.0, + "step": 19620 + }, + { + "epoch": 2.495992876224399, + "ewc_loss": 0.04932105913758278, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466052865202073e-05, + "grad_norm": 31.094932556152344, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8668588399887085, + "num_tokens": 748766996.0, + "step": 19621 + }, + { + "epoch": 2.4961200865029896, + "ewc_loss": 0.0492272786796093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4613638743176125e-05, + "grad_norm": 30.91301155090332, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8636885285377502, + "num_tokens": 748801225.0, + "step": 19622 + }, + { + "epoch": 2.49624729678158, + "ewc_loss": 0.04929639771580696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4648199541843496e-05, + "grad_norm": 30.96324920654297, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8648169040679932, + "num_tokens": 748841355.0, + "step": 19623 + }, + { + "epoch": 2.4963745070601706, + "ewc_loss": 0.04933403059840202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4667015168233775e-05, + "grad_norm": 31.044281005859375, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8686583042144775, + "num_tokens": 748879636.0, + "step": 19624 + }, + { + "epoch": 2.496501717338761, + "ewc_loss": 0.04932261258363724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4661307179485448e-05, + "grad_norm": 30.997116088867188, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8671398162841797, + "num_tokens": 748917795.0, + "step": 19625 + }, + { + "epoch": 2.4966289276173517, + "ewc_loss": 0.049288518726825714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4644259610795416e-05, + "grad_norm": 31.067785263061523, + "learning_rate": 1e-06, + "loss": 0.5101, + "mean_token_accuracy": 0.8438214659690857, + "num_tokens": 748956512.0, + "step": 19626 + }, + { + "epoch": 2.496756137895942, + "ewc_loss": 0.04929535090923309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4647675672895275e-05, + "grad_norm": 31.074501037597656, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.875306248664856, + "num_tokens": 748995577.0, + "step": 19627 + }, + { + "epoch": 2.4968833481745323, + "ewc_loss": 0.04923855885863304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461927942931652e-05, + "grad_norm": 31.03515625, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8667886257171631, + "num_tokens": 749027672.0, + "step": 19628 + }, + { + "epoch": 2.4970105584531233, + "ewc_loss": 0.04928054288029671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464027056703344e-05, + "grad_norm": 30.979005813598633, + "learning_rate": 1e-06, + "loss": 0.3912, + 
"mean_token_accuracy": 0.8810281753540039, + "num_tokens": 749059101.0, + "step": 19629 + }, + { + "epoch": 2.4971377687317133, + "ewc_loss": 0.04929638281464577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4648192265885882e-05, + "grad_norm": 31.065988540649414, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8557704091072083, + "num_tokens": 749097034.0, + "step": 19630 + }, + { + "epoch": 2.497264979010304, + "ewc_loss": 0.049262985587120056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4631492124171928e-05, + "grad_norm": 30.99509620666504, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.867250382900238, + "num_tokens": 749137536.0, + "step": 19631 + }, + { + "epoch": 2.4973921892888944, + "ewc_loss": 0.049244474619627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4622237106086686e-05, + "grad_norm": 31.105987548828125, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8707648515701294, + "num_tokens": 749171487.0, + "step": 19632 + }, + { + "epoch": 2.497519399567485, + "ewc_loss": 0.049322597682476044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466129808453843e-05, + "grad_norm": 31.046918869018555, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8623846769332886, + "num_tokens": 749213234.0, + "step": 19633 + }, + { + "epoch": 2.4976466098460754, + "ewc_loss": 0.04920355603098869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4601777113275602e-05, + "grad_norm": 30.91083526611328, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8594486713409424, + "num_tokens": 749256184.0, + "step": 19634 + }, + { + "epoch": 2.497773820124666, + "ewc_loss": 0.04938584566116333, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4692923034308478e-05, + "grad_norm": 31.232973098754883, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8675529956817627, + "num_tokens": 749292909.0, + "step": 19635 + }, + { + "epoch": 
2.4979010304032565, + "ewc_loss": 0.04933565482497215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466782825649716e-05, + "grad_norm": 31.00881576538086, + "learning_rate": 1e-06, + "loss": 0.4701, + "mean_token_accuracy": 0.8578263521194458, + "num_tokens": 749327947.0, + "step": 19636 + }, + { + "epoch": 2.498028240681847, + "ewc_loss": 0.04919055104255676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.459527604514733e-05, + "grad_norm": 31.095062255859375, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8784355521202087, + "num_tokens": 749364107.0, + "step": 19637 + }, + { + "epoch": 2.4981554509604376, + "ewc_loss": 0.04935767874121666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4678838599356823e-05, + "grad_norm": 31.038089752197266, + "learning_rate": 1e-06, + "loss": 0.3722, + "mean_token_accuracy": 0.8856334090232849, + "num_tokens": 749398055.0, + "step": 19638 + }, + { + "epoch": 2.498282661239028, + "ewc_loss": 0.04927109181880951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463554665155243e-05, + "grad_norm": 31.06396484375, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8697521686553955, + "num_tokens": 749434626.0, + "step": 19639 + }, + { + "epoch": 2.4984098715176186, + "ewc_loss": 0.04930686205625534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4653430955368094e-05, + "grad_norm": 30.98151969909668, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8631840944290161, + "num_tokens": 749470616.0, + "step": 19640 + }, + { + "epoch": 2.498537081796209, + "ewc_loss": 0.04932500049471855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4662500436534174e-05, + "grad_norm": 31.041606903076172, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8726590871810913, + "num_tokens": 749513072.0, + "step": 19641 + }, + { + "epoch": 2.4986642920747997, + "ewc_loss": 0.04933113604784012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4665567252668552e-05, + "grad_norm": 31.16727066040039, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8847849369049072, + "num_tokens": 749546988.0, + "step": 19642 + }, + { + "epoch": 2.49879150235339, + "ewc_loss": 0.0492994524538517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464972567395307e-05, + "grad_norm": 30.908784866333008, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8643308877944946, + "num_tokens": 749585851.0, + "step": 19643 + }, + { + "epoch": 2.4989187126319807, + "ewc_loss": 0.04923397675156593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4616987502668053e-05, + "grad_norm": 30.987627029418945, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8672916889190674, + "num_tokens": 749619510.0, + "step": 19644 + }, + { + "epoch": 2.4990459229105713, + "ewc_loss": 0.0492960549890995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464802673785016e-05, + "grad_norm": 30.963138580322266, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8783853054046631, + "num_tokens": 749653660.0, + "step": 19645 + }, + { + "epoch": 2.499173133189162, + "ewc_loss": 0.04924928769469261, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4624643629067577e-05, + "grad_norm": 31.013620376586914, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8694972395896912, + "num_tokens": 749691321.0, + "step": 19646 + }, + { + "epoch": 2.4993003434677523, + "ewc_loss": 0.049422670155763626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.471133484505117e-05, + "grad_norm": 31.099258422851562, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8709152936935425, + "num_tokens": 749729305.0, + "step": 19647 + }, + { + "epoch": 2.499427553746343, + "ewc_loss": 0.04932788386940956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4663941076141782e-05, + "grad_norm": 31.08867835998535, + "learning_rate": 1e-06, + "loss": 0.4043, + 
"mean_token_accuracy": 0.8770711421966553, + "num_tokens": 749766134.0, + "step": 19648 + }, + { + "epoch": 2.4995547640249334, + "ewc_loss": 0.04934319481253624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4671597202541307e-05, + "grad_norm": 31.1063289642334, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8675852417945862, + "num_tokens": 749807715.0, + "step": 19649 + }, + { + "epoch": 2.499681974303524, + "ewc_loss": 0.04924868047237396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4624339857837185e-05, + "grad_norm": 31.013900756835938, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8711735010147095, + "num_tokens": 749838799.0, + "step": 19650 + }, + { + "epoch": 2.499809184582114, + "ewc_loss": 0.049204159528017044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.460207906551659e-05, + "grad_norm": 31.103662490844727, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8835344314575195, + "num_tokens": 749874283.0, + "step": 19651 + }, + { + "epoch": 2.499936394860705, + "ewc_loss": 0.04921479895710945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4607399609521963e-05, + "grad_norm": 31.040054321289062, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8682141900062561, + "num_tokens": 749915912.0, + "step": 19652 + }, + { + "epoch": 2.500063605139295, + "ewc_loss": 0.04939083755016327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4695418687770143e-05, + "grad_norm": 31.18653678894043, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8771313428878784, + "num_tokens": 749959133.0, + "step": 19653 + }, + { + "epoch": 2.500190815417886, + "ewc_loss": 0.04928009957075119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464005046931561e-05, + "grad_norm": 31.17029571533203, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8668853044509888, + "num_tokens": 749997846.0, + "step": 19654 + }, + { + "epoch": 
2.500318025696476, + "ewc_loss": 0.049177996814250946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4588998712715693e-05, + "grad_norm": 30.994029998779297, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.873540997505188, + "num_tokens": 750040973.0, + "step": 19655 + }, + { + "epoch": 2.5004452359750666, + "ewc_loss": 0.049127500504255295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4563751139794476e-05, + "grad_norm": 31.021310806274414, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8648893237113953, + "num_tokens": 750082171.0, + "step": 19656 + }, + { + "epoch": 2.500572446253657, + "ewc_loss": 0.04925481975078583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.462741031195037e-05, + "grad_norm": 31.127307891845703, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8824084997177124, + "num_tokens": 750123083.0, + "step": 19657 + }, + { + "epoch": 2.5006996565322477, + "ewc_loss": 0.049283452332019806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4641725758556277e-05, + "grad_norm": 31.182567596435547, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8555654883384705, + "num_tokens": 750162235.0, + "step": 19658 + }, + { + "epoch": 2.500826866810838, + "ewc_loss": 0.04908544942736626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.454272544127889e-05, + "grad_norm": 31.00931739807129, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8827242851257324, + "num_tokens": 750196501.0, + "step": 19659 + }, + { + "epoch": 2.5009540770894287, + "ewc_loss": 0.049196865409612656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.459843199176248e-05, + "grad_norm": 31.25658416748047, + "learning_rate": 1e-06, + "loss": 0.5217, + "mean_token_accuracy": 0.841199517250061, + "num_tokens": 750241178.0, + "step": 19660 + }, + { + "epoch": 2.5010812873680193, + "ewc_loss": 0.049232788383960724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4616394512122497e-05, + "grad_norm": 31.139297485351562, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8754101991653442, + "num_tokens": 750284104.0, + "step": 19661 + }, + { + "epoch": 2.50120849764661, + "ewc_loss": 0.049071501940488815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4535751435905695e-05, + "grad_norm": 31.06379508972168, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8757655620574951, + "num_tokens": 750318525.0, + "step": 19662 + }, + { + "epoch": 2.5013357079252003, + "ewc_loss": 0.04909338802099228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4546694476157427e-05, + "grad_norm": 31.04916000366211, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8785699605941772, + "num_tokens": 750358892.0, + "step": 19663 + }, + { + "epoch": 2.501462918203791, + "ewc_loss": 0.04919753223657608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4598766685812734e-05, + "grad_norm": 31.169248580932617, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8569462299346924, + "num_tokens": 750399325.0, + "step": 19664 + }, + { + "epoch": 2.5015901284823814, + "ewc_loss": 0.04923994839191437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4619974283268675e-05, + "grad_norm": 31.194652557373047, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8637576103210449, + "num_tokens": 750437493.0, + "step": 19665 + }, + { + "epoch": 2.501717338760972, + "ewc_loss": 0.04909393563866615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.454696732456796e-05, + "grad_norm": 30.805177688598633, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8782939910888672, + "num_tokens": 750472381.0, + "step": 19666 + }, + { + "epoch": 2.5018445490395624, + "ewc_loss": 0.049170102924108505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458505150571e-05, + "grad_norm": 31.164945602416992, + "learning_rate": 1e-06, + "loss": 0.4709, + 
"mean_token_accuracy": 0.85368812084198, + "num_tokens": 750511786.0, + "step": 19667 + }, + { + "epoch": 2.501971759318153, + "ewc_loss": 0.04916428029537201, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458213930367492e-05, + "grad_norm": 30.98563003540039, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8768792152404785, + "num_tokens": 750550596.0, + "step": 19668 + }, + { + "epoch": 2.5020989695967435, + "ewc_loss": 0.04914508014917374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.457254049659241e-05, + "grad_norm": 31.04719352722168, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8736839890480042, + "num_tokens": 750594790.0, + "step": 19669 + }, + { + "epoch": 2.502226179875334, + "ewc_loss": 0.049190670251846313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4595334252808243e-05, + "grad_norm": 30.982494354248047, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8817161917686462, + "num_tokens": 750636738.0, + "step": 19670 + }, + { + "epoch": 2.5023533901539246, + "ewc_loss": 0.04914453625679016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4572267648181878e-05, + "grad_norm": 31.132986068725586, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8819233179092407, + "num_tokens": 750672338.0, + "step": 19671 + }, + { + "epoch": 2.502480600432515, + "ewc_loss": 0.049296457320451736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4648228645673953e-05, + "grad_norm": 31.101842880249023, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8645284175872803, + "num_tokens": 750709149.0, + "step": 19672 + }, + { + "epoch": 2.5026078107111056, + "ewc_loss": 0.04916529729962349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4582648620707914e-05, + "grad_norm": 31.2059268951416, + "learning_rate": 1e-06, + "loss": 0.3733, + "mean_token_accuracy": 0.8831793069839478, + "num_tokens": 750741727.0, + "step": 19673 + }, + { + "epoch": 
2.5027350209896957, + "ewc_loss": 0.049186933785676956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45934661506908e-05, + "grad_norm": 31.099760055541992, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8582417368888855, + "num_tokens": 750775903.0, + "step": 19674 + }, + { + "epoch": 2.5028622312682867, + "ewc_loss": 0.049174726009368896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4587363441241905e-05, + "grad_norm": 31.06264877319336, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8660626411437988, + "num_tokens": 750810502.0, + "step": 19675 + }, + { + "epoch": 2.5029894415468767, + "ewc_loss": 0.04915224760770798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4576123905717395e-05, + "grad_norm": 31.086627960205078, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8786900639533997, + "num_tokens": 750847762.0, + "step": 19676 + }, + { + "epoch": 2.5031166518254677, + "ewc_loss": 0.049188677221536636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4594339265604503e-05, + "grad_norm": 31.141700744628906, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8717615604400635, + "num_tokens": 750887201.0, + "step": 19677 + }, + { + "epoch": 2.503243862104058, + "ewc_loss": 0.049170415848493576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4585207938798703e-05, + "grad_norm": 31.05177116394043, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8679681420326233, + "num_tokens": 750927323.0, + "step": 19678 + }, + { + "epoch": 2.5033710723826488, + "ewc_loss": 0.04913961514830589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4569808374508284e-05, + "grad_norm": 31.008079528808594, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8624509572982788, + "num_tokens": 750972501.0, + "step": 19679 + }, + { + "epoch": 2.503498282661239, + "ewc_loss": 0.04915745556354523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4578728698543273e-05, + "grad_norm": 31.084430694580078, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8811694383621216, + "num_tokens": 751010743.0, + "step": 19680 + }, + { + "epoch": 2.5036254929398294, + "ewc_loss": 0.04924267157912254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4621334887342528e-05, + "grad_norm": 31.208051681518555, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8665703535079956, + "num_tokens": 751050440.0, + "step": 19681 + }, + { + "epoch": 2.50375270321842, + "ewc_loss": 0.04913273826241493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456636866554618e-05, + "grad_norm": 31.00115203857422, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8742681741714478, + "num_tokens": 751088319.0, + "step": 19682 + }, + { + "epoch": 2.5038799134970104, + "ewc_loss": 0.04926750808954239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463375494698994e-05, + "grad_norm": 31.09384536743164, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8705387711524963, + "num_tokens": 751123766.0, + "step": 19683 + }, + { + "epoch": 2.504007123775601, + "ewc_loss": 0.049186378717422485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4593189664301462e-05, + "grad_norm": 31.001449584960938, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8866145610809326, + "num_tokens": 751163375.0, + "step": 19684 + }, + { + "epoch": 2.5041343340541915, + "ewc_loss": 0.04920978099107742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4604891223134473e-05, + "grad_norm": 31.10763168334961, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8668622970581055, + "num_tokens": 751204526.0, + "step": 19685 + }, + { + "epoch": 2.504261544332782, + "ewc_loss": 0.049210477620363235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.460523865011055e-05, + "grad_norm": 31.1522159576416, + "learning_rate": 1e-06, + "loss": 0.4001, + 
"mean_token_accuracy": 0.8748641014099121, + "num_tokens": 751245133.0, + "step": 19686 + }, + { + "epoch": 2.5043887546113726, + "ewc_loss": 0.04908967390656471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4544837287976407e-05, + "grad_norm": 30.95752716064453, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.8601469397544861, + "num_tokens": 751284578.0, + "step": 19687 + }, + { + "epoch": 2.504515964889963, + "ewc_loss": 0.049216583371162415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46082909143297e-05, + "grad_norm": 31.140718460083008, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8626390695571899, + "num_tokens": 751325794.0, + "step": 19688 + }, + { + "epoch": 2.5046431751685536, + "ewc_loss": 0.04921451210975647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4607255909359083e-05, + "grad_norm": 31.070158004760742, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8765772581100464, + "num_tokens": 751363777.0, + "step": 19689 + }, + { + "epoch": 2.504770385447144, + "ewc_loss": 0.04923387989401817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461694020894356e-05, + "grad_norm": 31.13700294494629, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8615949749946594, + "num_tokens": 751400499.0, + "step": 19690 + }, + { + "epoch": 2.5048975957257347, + "ewc_loss": 0.04912655055522919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4563274564570747e-05, + "grad_norm": 30.911409378051758, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8679571151733398, + "num_tokens": 751434895.0, + "step": 19691 + }, + { + "epoch": 2.505024806004325, + "ewc_loss": 0.04917676001787186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4588380256318487e-05, + "grad_norm": 31.127822875976562, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8812010288238525, + "num_tokens": 751473984.0, + "step": 19692 + }, + { + "epoch": 
2.5051520162829157, + "ewc_loss": 0.04916786029934883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4583931008237414e-05, + "grad_norm": 30.915790557861328, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8710682988166809, + "num_tokens": 751515030.0, + "step": 19693 + }, + { + "epoch": 2.5052792265615063, + "ewc_loss": 0.04922923445701599, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4614617359475233e-05, + "grad_norm": 31.089208602905273, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8683323264122009, + "num_tokens": 751550023.0, + "step": 19694 + }, + { + "epoch": 2.505406436840097, + "ewc_loss": 0.0492415614426136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4620780095574446e-05, + "grad_norm": 30.908231735229492, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8721716403961182, + "num_tokens": 751584793.0, + "step": 19695 + }, + { + "epoch": 2.5055336471186873, + "ewc_loss": 0.04921229928731918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4606149963801727e-05, + "grad_norm": 31.091659545898438, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.8531065583229065, + "num_tokens": 751619817.0, + "step": 19696 + }, + { + "epoch": 2.505660857397278, + "ewc_loss": 0.049320898950099945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4660448616486974e-05, + "grad_norm": 30.966373443603516, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8769248723983765, + "num_tokens": 751657097.0, + "step": 19697 + }, + { + "epoch": 2.5057880676758684, + "ewc_loss": 0.049239568412303925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4619783289381303e-05, + "grad_norm": 31.048749923706055, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8620169162750244, + "num_tokens": 751687042.0, + "step": 19698 + }, + { + "epoch": 2.5059152779544585, + "ewc_loss": 0.04943687841296196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.471843981766142e-05, + "grad_norm": 30.981679916381836, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.879223644733429, + "num_tokens": 751724281.0, + "step": 19699 + }, + { + "epoch": 2.5060424882330494, + "ewc_loss": 0.04930467531085014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4652337742736563e-05, + "grad_norm": 31.086090087890625, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8505837321281433, + "num_tokens": 751765144.0, + "step": 19700 + }, + { + "epoch": 2.5061696985116395, + "ewc_loss": 0.04936082288622856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.468041202519089e-05, + "grad_norm": 31.110960006713867, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.878614068031311, + "num_tokens": 751799602.0, + "step": 19701 + }, + { + "epoch": 2.5062969087902305, + "ewc_loss": 0.04926906153559685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4634531655465253e-05, + "grad_norm": 31.06820297241211, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8786542415618896, + "num_tokens": 751836972.0, + "step": 19702 + }, + { + "epoch": 2.5064241190688206, + "ewc_loss": 0.049298595637083054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4649298211443238e-05, + "grad_norm": 30.991703033447266, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8723255395889282, + "num_tokens": 751869627.0, + "step": 19703 + }, + { + "epoch": 2.5065513293474115, + "ewc_loss": 0.04933817684650421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4669088816153817e-05, + "grad_norm": 31.092382431030273, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8630069494247437, + "num_tokens": 751911103.0, + "step": 19704 + }, + { + "epoch": 2.5066785396260016, + "ewc_loss": 0.049430809915065765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4715405743336305e-05, + "grad_norm": 31.03849220275879, + "learning_rate": 1e-06, + "loss": 0.4315, + 
"mean_token_accuracy": 0.8657448887825012, + "num_tokens": 751948956.0, + "step": 19705 + }, + { + "epoch": 2.506805749904592, + "ewc_loss": 0.04934786260128021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4673930965946056e-05, + "grad_norm": 31.064653396606445, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8736872673034668, + "num_tokens": 751992185.0, + "step": 19706 + }, + { + "epoch": 2.5069329601831827, + "ewc_loss": 0.049369074404239655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4684537493158132e-05, + "grad_norm": 30.910234451293945, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8658255338668823, + "num_tokens": 752032661.0, + "step": 19707 + }, + { + "epoch": 2.507060170461773, + "ewc_loss": 0.04932595044374466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46629751927685e-05, + "grad_norm": 30.993276596069336, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8752335906028748, + "num_tokens": 752073946.0, + "step": 19708 + }, + { + "epoch": 2.5071873807403637, + "ewc_loss": 0.04940894618630409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4704473617020994e-05, + "grad_norm": 30.972530364990234, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8557455539703369, + "num_tokens": 752118766.0, + "step": 19709 + }, + { + "epoch": 2.5073145910189543, + "ewc_loss": 0.049415864050388336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4707931515877135e-05, + "grad_norm": 31.11690902709961, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8732805252075195, + "num_tokens": 752159669.0, + "step": 19710 + }, + { + "epoch": 2.507441801297545, + "ewc_loss": 0.04936137795448303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.468068851158023e-05, + "grad_norm": 30.94240951538086, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.874570906162262, + "num_tokens": 752198139.0, + "step": 19711 + }, + { + "epoch": 
2.5075690115761353, + "ewc_loss": 0.04932236298918724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4661181669216603e-05, + "grad_norm": 31.06355094909668, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8728108406066895, + "num_tokens": 752239686.0, + "step": 19712 + }, + { + "epoch": 2.507696221854726, + "ewc_loss": 0.049368951469659805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.468447564751841e-05, + "grad_norm": 31.021930694580078, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8529436588287354, + "num_tokens": 752275740.0, + "step": 19713 + }, + { + "epoch": 2.5078234321333164, + "ewc_loss": 0.04941387474536896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4706936528673396e-05, + "grad_norm": 31.054107666015625, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8587630391120911, + "num_tokens": 752310554.0, + "step": 19714 + }, + { + "epoch": 2.507950642411907, + "ewc_loss": 0.04927946627140045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4639733965159394e-05, + "grad_norm": 30.901042938232422, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8759851455688477, + "num_tokens": 752347761.0, + "step": 19715 + }, + { + "epoch": 2.5080778526904974, + "ewc_loss": 0.04932964965701103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4664825104991905e-05, + "grad_norm": 31.088632583618164, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8595413565635681, + "num_tokens": 752387128.0, + "step": 19716 + }, + { + "epoch": 2.508205062969088, + "ewc_loss": 0.04944295808672905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4721479348954745e-05, + "grad_norm": 31.059349060058594, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8693752288818359, + "num_tokens": 752428722.0, + "step": 19717 + }, + { + "epoch": 2.5083322732476785, + "ewc_loss": 0.04926815629005432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.463407872710377e-05, + "grad_norm": 31.155193328857422, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.869194746017456, + "num_tokens": 752468920.0, + "step": 19718 + }, + { + "epoch": 2.508459483526269, + "ewc_loss": 0.04940050467848778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4700251742615364e-05, + "grad_norm": 31.03175926208496, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8610622882843018, + "num_tokens": 752505226.0, + "step": 19719 + }, + { + "epoch": 2.5085866938048595, + "ewc_loss": 0.04928553104400635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4642766220495105e-05, + "grad_norm": 31.129114151000977, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8713882565498352, + "num_tokens": 752543693.0, + "step": 19720 + }, + { + "epoch": 2.50871390408345, + "ewc_loss": 0.049387700855731964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4693850718904287e-05, + "grad_norm": 30.99265480041504, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8550969362258911, + "num_tokens": 752578758.0, + "step": 19721 + }, + { + "epoch": 2.5088411143620406, + "ewc_loss": 0.04928654059767723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4643270080559887e-05, + "grad_norm": 31.07611846923828, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8807154893875122, + "num_tokens": 752612115.0, + "step": 19722 + }, + { + "epoch": 2.508968324640631, + "ewc_loss": 0.04935673251748085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46783656621119e-05, + "grad_norm": 30.98204231262207, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8751296997070312, + "num_tokens": 752644601.0, + "step": 19723 + }, + { + "epoch": 2.509095534919221, + "ewc_loss": 0.04933915659785271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466957812430337e-05, + "grad_norm": 31.170698165893555, + "learning_rate": 1e-06, + "loss": 0.4028, + 
"mean_token_accuracy": 0.8735824823379517, + "num_tokens": 752682333.0, + "step": 19724 + }, + { + "epoch": 2.509222745197812, + "ewc_loss": 0.049471933394670486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4735965780564584e-05, + "grad_norm": 31.0887451171875, + "learning_rate": 1e-06, + "loss": 0.4446, + "mean_token_accuracy": 0.863849937915802, + "num_tokens": 752721802.0, + "step": 19725 + }, + { + "epoch": 2.5093499554764023, + "ewc_loss": 0.04923488572239876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461744225001894e-05, + "grad_norm": 31.01093292236328, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8853141665458679, + "num_tokens": 752755673.0, + "step": 19726 + }, + { + "epoch": 2.5094771657549932, + "ewc_loss": 0.04946122318506241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4730610675760545e-05, + "grad_norm": 31.0708065032959, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8798805475234985, + "num_tokens": 752795575.0, + "step": 19727 + }, + { + "epoch": 2.5096043760335833, + "ewc_loss": 0.049363117665052414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4681557988515124e-05, + "grad_norm": 30.97551727294922, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8860633373260498, + "num_tokens": 752833172.0, + "step": 19728 + }, + { + "epoch": 2.5097315863121743, + "ewc_loss": 0.049350060522556305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4675029635545798e-05, + "grad_norm": 30.995073318481445, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8554059267044067, + "num_tokens": 752872857.0, + "step": 19729 + }, + { + "epoch": 2.5098587965907644, + "ewc_loss": 0.049378812313079834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4689406927791424e-05, + "grad_norm": 31.074291229248047, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8797582983970642, + "num_tokens": 752908553.0, + "step": 19730 + }, + { + "epoch": 
2.509986006869355, + "ewc_loss": 0.04936885088682175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4684424715815112e-05, + "grad_norm": 31.043907165527344, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8748229742050171, + "num_tokens": 752937762.0, + "step": 19731 + }, + { + "epoch": 2.5101132171479454, + "ewc_loss": 0.04932349547743797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4661747374921106e-05, + "grad_norm": 31.103103637695312, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8566584587097168, + "num_tokens": 752977184.0, + "step": 19732 + }, + { + "epoch": 2.510240427426536, + "ewc_loss": 0.04934896156191826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4674480300745927e-05, + "grad_norm": 31.014650344848633, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8664802312850952, + "num_tokens": 753021071.0, + "step": 19733 + }, + { + "epoch": 2.5103676377051265, + "ewc_loss": 0.04939044266939163, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4695220417925157e-05, + "grad_norm": 31.22073745727539, + "learning_rate": 1e-06, + "loss": 0.5062, + "mean_token_accuracy": 0.8443219065666199, + "num_tokens": 753056597.0, + "step": 19734 + }, + { + "epoch": 2.510494847983717, + "ewc_loss": 0.04941023886203766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4705119358259253e-05, + "grad_norm": 31.136274337768555, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8726997375488281, + "num_tokens": 753096178.0, + "step": 19735 + }, + { + "epoch": 2.5106220582623076, + "ewc_loss": 0.04922526329755783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461263102304656e-05, + "grad_norm": 31.0955753326416, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.873665452003479, + "num_tokens": 753130388.0, + "step": 19736 + }, + { + "epoch": 2.510749268540898, + "ewc_loss": 0.04932428523898125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4662142095621675e-05, + "grad_norm": 31.061397552490234, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8776034116744995, + "num_tokens": 753166585.0, + "step": 19737 + }, + { + "epoch": 2.5108764788194886, + "ewc_loss": 0.04924299940466881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.462150041537825e-05, + "grad_norm": 31.146820068359375, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8800970315933228, + "num_tokens": 753203403.0, + "step": 19738 + }, + { + "epoch": 2.511003689098079, + "ewc_loss": 0.049300868064165115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.465043326083105e-05, + "grad_norm": 31.107471466064453, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8767014741897583, + "num_tokens": 753241632.0, + "step": 19739 + }, + { + "epoch": 2.5111308993766697, + "ewc_loss": 0.049267541617155075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463377131789457e-05, + "grad_norm": 31.09625244140625, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8842472434043884, + "num_tokens": 753279292.0, + "step": 19740 + }, + { + "epoch": 2.51125810965526, + "ewc_loss": 0.049339305609464645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4669652702868916e-05, + "grad_norm": 31.095088958740234, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8613706231117249, + "num_tokens": 753319121.0, + "step": 19741 + }, + { + "epoch": 2.5113853199338507, + "ewc_loss": 0.04920453205704689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4602266421425156e-05, + "grad_norm": 31.159944534301758, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8552751541137695, + "num_tokens": 753353074.0, + "step": 19742 + }, + { + "epoch": 2.5115125302124413, + "ewc_loss": 0.0493331104516983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4666554963914678e-05, + "grad_norm": 31.037193298339844, + "learning_rate": 1e-06, + "loss": 0.3673, + 
"mean_token_accuracy": 0.8887201547622681, + "num_tokens": 753388994.0, + "step": 19743 + }, + { + "epoch": 2.511639740491032, + "ewc_loss": 0.04927470535039902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4637352908030152e-05, + "grad_norm": 31.15800666809082, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8770723342895508, + "num_tokens": 753430271.0, + "step": 19744 + }, + { + "epoch": 2.5117669507696223, + "ewc_loss": 0.049286212772130966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4643106371513568e-05, + "grad_norm": 31.09667205810547, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8628458976745605, + "num_tokens": 753473820.0, + "step": 19745 + }, + { + "epoch": 2.511894161048213, + "ewc_loss": 0.04927023500204086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4635117370053194e-05, + "grad_norm": 31.064210891723633, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8659952878952026, + "num_tokens": 753516397.0, + "step": 19746 + }, + { + "epoch": 2.5120213713268034, + "ewc_loss": 0.04923554137349129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461777148710098e-05, + "grad_norm": 31.07052230834961, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8807615041732788, + "num_tokens": 753557589.0, + "step": 19747 + }, + { + "epoch": 2.512148581605394, + "ewc_loss": 0.049300190061330795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.465009492880199e-05, + "grad_norm": 31.16048812866211, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8783127069473267, + "num_tokens": 753593764.0, + "step": 19748 + }, + { + "epoch": 2.512275791883984, + "ewc_loss": 0.04926960915327072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4634804503875785e-05, + "grad_norm": 31.044546127319336, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8582717180252075, + "num_tokens": 753635967.0, + "step": 19749 + }, + { + "epoch": 
2.512403002162575, + "ewc_loss": 0.049219533801078796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4609766114735976e-05, + "grad_norm": 31.010807037353516, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8781518936157227, + "num_tokens": 753674595.0, + "step": 19750 + }, + { + "epoch": 2.512530212441165, + "ewc_loss": 0.04936946555972099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.468473212502431e-05, + "grad_norm": 31.22319221496582, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8549817204475403, + "num_tokens": 753714118.0, + "step": 19751 + }, + { + "epoch": 2.512657422719756, + "ewc_loss": 0.04926375672221184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463187774992548e-05, + "grad_norm": 31.00677490234375, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8568332195281982, + "num_tokens": 753758139.0, + "step": 19752 + }, + { + "epoch": 2.512784632998346, + "ewc_loss": 0.04918632656335831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4593164198449813e-05, + "grad_norm": 31.18276596069336, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8590127229690552, + "num_tokens": 753800373.0, + "step": 19753 + }, + { + "epoch": 2.5129118432769366, + "ewc_loss": 0.04925016686320305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4625083824503236e-05, + "grad_norm": 31.000492095947266, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8653217554092407, + "num_tokens": 753832366.0, + "step": 19754 + }, + { + "epoch": 2.513039053555527, + "ewc_loss": 0.04929238557815552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4646193196531385e-05, + "grad_norm": 31.241641998291016, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8890818357467651, + "num_tokens": 753866176.0, + "step": 19755 + }, + { + "epoch": 2.5131662638341177, + "ewc_loss": 0.04929205775260925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4646029487485066e-05, + "grad_norm": 31.010231018066406, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8701795339584351, + "num_tokens": 753911470.0, + "step": 19756 + }, + { + "epoch": 2.513293474112708, + "ewc_loss": 0.04912429302930832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456214679114055e-05, + "grad_norm": 31.049362182617188, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8674283623695374, + "num_tokens": 753953027.0, + "step": 19757 + }, + { + "epoch": 2.5134206843912987, + "ewc_loss": 0.04931795224547386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4658975235070102e-05, + "grad_norm": 31.147293090820312, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8644917011260986, + "num_tokens": 753987992.0, + "step": 19758 + }, + { + "epoch": 2.5135478946698893, + "ewc_loss": 0.0492149293422699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.460746509314049e-05, + "grad_norm": 30.992595672607422, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8743818998336792, + "num_tokens": 754026671.0, + "step": 19759 + }, + { + "epoch": 2.51367510494848, + "ewc_loss": 0.049292534589767456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464626777509693e-05, + "grad_norm": 31.18984031677246, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8705611228942871, + "num_tokens": 754063526.0, + "step": 19760 + }, + { + "epoch": 2.5138023152270703, + "ewc_loss": 0.04929972067475319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4649860279168934e-05, + "grad_norm": 31.012487411499023, + "learning_rate": 1e-06, + "loss": 0.4716, + "mean_token_accuracy": 0.8533452153205872, + "num_tokens": 754103361.0, + "step": 19761 + }, + { + "epoch": 2.513929525505661, + "ewc_loss": 0.04917626082897186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.45881310547702e-05, + "grad_norm": 31.076316833496094, + "learning_rate": 1e-06, + "loss": 0.4104, + 
"mean_token_accuracy": 0.8737355470657349, + "num_tokens": 754141969.0, + "step": 19762 + }, + { + "epoch": 2.5140567357842514, + "ewc_loss": 0.04935291409492493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.467645754222758e-05, + "grad_norm": 31.093111038208008, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8862068057060242, + "num_tokens": 754182691.0, + "step": 19763 + }, + { + "epoch": 2.514183946062842, + "ewc_loss": 0.04927020147442818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4635100999148563e-05, + "grad_norm": 31.140256881713867, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8695552349090576, + "num_tokens": 754219956.0, + "step": 19764 + }, + { + "epoch": 2.5143111563414324, + "ewc_loss": 0.049312032759189606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4656015739310533e-05, + "grad_norm": 31.109899520874023, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8655339479446411, + "num_tokens": 754259467.0, + "step": 19765 + }, + { + "epoch": 2.514438366620023, + "ewc_loss": 0.049237072467803955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461853546265047e-05, + "grad_norm": 31.16111183166504, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8736472725868225, + "num_tokens": 754300344.0, + "step": 19766 + }, + { + "epoch": 2.5145655768986135, + "ewc_loss": 0.04937082156538963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4685410608071834e-05, + "grad_norm": 31.112173080444336, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8702179193496704, + "num_tokens": 754342606.0, + "step": 19767 + }, + { + "epoch": 2.514692787177204, + "ewc_loss": 0.04929456114768982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4647280952194706e-05, + "grad_norm": 31.10776710510254, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8662159442901611, + "num_tokens": 754379946.0, + "step": 19768 + }, + { + "epoch": 
2.5148199974557945, + "ewc_loss": 0.049209754914045334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4604876671219245e-05, + "grad_norm": 31.050064086914062, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8836897611618042, + "num_tokens": 754414353.0, + "step": 19769 + }, + { + "epoch": 2.514947207734385, + "ewc_loss": 0.04922828450798988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4614142603240907e-05, + "grad_norm": 31.174808502197266, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.857734203338623, + "num_tokens": 754455506.0, + "step": 19770 + }, + { + "epoch": 2.5150744180129756, + "ewc_loss": 0.04925435036420822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4627175662317313e-05, + "grad_norm": 31.047582626342773, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8538613319396973, + "num_tokens": 754498316.0, + "step": 19771 + }, + { + "epoch": 2.5152016282915657, + "ewc_loss": 0.04916081205010414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4580405806773342e-05, + "grad_norm": 31.03676986694336, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.8582954406738281, + "num_tokens": 754534943.0, + "step": 19772 + }, + { + "epoch": 2.5153288385701567, + "ewc_loss": 0.0492444671690464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.462223346810788e-05, + "grad_norm": 31.073957443237305, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.867763876914978, + "num_tokens": 754572775.0, + "step": 19773 + }, + { + "epoch": 2.5154560488487467, + "ewc_loss": 0.049263518303632736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463175951561425e-05, + "grad_norm": 31.18463897705078, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.880046010017395, + "num_tokens": 754610473.0, + "step": 19774 + }, + { + "epoch": 2.5155832591273377, + "ewc_loss": 0.04933091998100281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4665459932293743e-05, + "grad_norm": 31.11077117919922, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8723876476287842, + "num_tokens": 754649527.0, + "step": 19775 + }, + { + "epoch": 2.515710469405928, + "ewc_loss": 0.04919550195336342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4597751689725555e-05, + "grad_norm": 31.110626220703125, + "learning_rate": 1e-06, + "loss": 0.4152, + "mean_token_accuracy": 0.8721333742141724, + "num_tokens": 754691513.0, + "step": 19776 + }, + { + "epoch": 2.5158376796845188, + "ewc_loss": 0.04932777211070061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4663886506459676e-05, + "grad_norm": 31.169353485107422, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8576594591140747, + "num_tokens": 754727932.0, + "step": 19777 + }, + { + "epoch": 2.515964889963109, + "ewc_loss": 0.04921809583902359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4609047613921575e-05, + "grad_norm": 31.063583374023438, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8764595985412598, + "num_tokens": 754764571.0, + "step": 19778 + }, + { + "epoch": 2.5160921002416994, + "ewc_loss": 0.049297090619802475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464854514983017e-05, + "grad_norm": 31.2509708404541, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8855152130126953, + "num_tokens": 754805846.0, + "step": 19779 + }, + { + "epoch": 2.51621931052029, + "ewc_loss": 0.049313995987176895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4656997993588448e-05, + "grad_norm": 31.11567497253418, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8780635595321655, + "num_tokens": 754844016.0, + "step": 19780 + }, + { + "epoch": 2.5163465207988804, + "ewc_loss": 0.04923003911972046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4615019356133416e-05, + "grad_norm": 31.157066345214844, + "learning_rate": 1e-06, + "loss": 0.4323, + 
"mean_token_accuracy": 0.86967933177948, + "num_tokens": 754886493.0, + "step": 19781 + }, + { + "epoch": 2.516473731077471, + "ewc_loss": 0.04929268732666969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464634417265188e-05, + "grad_norm": 31.1577205657959, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8725712299346924, + "num_tokens": 754928195.0, + "step": 19782 + }, + { + "epoch": 2.5166009413560615, + "ewc_loss": 0.04917897656559944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4589488020865247e-05, + "grad_norm": 31.211933135986328, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8789809942245483, + "num_tokens": 754958399.0, + "step": 19783 + }, + { + "epoch": 2.516728151634652, + "ewc_loss": 0.0492689274251461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463446435285732e-05, + "grad_norm": 31.19847869873047, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8745226860046387, + "num_tokens": 754997926.0, + "step": 19784 + }, + { + "epoch": 2.5168553619132426, + "ewc_loss": 0.049243878573179245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4621938791824505e-05, + "grad_norm": 31.1614933013916, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8758567571640015, + "num_tokens": 755040577.0, + "step": 19785 + }, + { + "epoch": 2.516982572191833, + "ewc_loss": 0.04927412047982216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463706005073618e-05, + "grad_norm": 31.243671417236328, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8692246675491333, + "num_tokens": 755070458.0, + "step": 19786 + }, + { + "epoch": 2.5171097824704236, + "ewc_loss": 0.04917120933532715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4585604478488676e-05, + "grad_norm": 31.18942642211914, + "learning_rate": 1e-06, + "loss": 0.4873, + "mean_token_accuracy": 0.8497235774993896, + "num_tokens": 755117205.0, + "step": 19787 + }, + { + "epoch": 
2.517236992749014, + "ewc_loss": 0.04917389526963234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4586946892668493e-05, + "grad_norm": 31.069988250732422, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8728051781654358, + "num_tokens": 755162098.0, + "step": 19788 + }, + { + "epoch": 2.5173642030276047, + "ewc_loss": 0.049213774502277374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4606886654510163e-05, + "grad_norm": 31.23200035095215, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8686532974243164, + "num_tokens": 755199655.0, + "step": 19789 + }, + { + "epoch": 2.517491413306195, + "ewc_loss": 0.0492248497903347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.461242547724396e-05, + "grad_norm": 31.022232055664062, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8841921091079712, + "num_tokens": 755234165.0, + "step": 19790 + }, + { + "epoch": 2.5176186235847857, + "ewc_loss": 0.049116723239421844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.455836147419177e-05, + "grad_norm": 31.093082427978516, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8713217973709106, + "num_tokens": 755271933.0, + "step": 19791 + }, + { + "epoch": 2.5177458338633762, + "ewc_loss": 0.049281369894742966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464068529661745e-05, + "grad_norm": 31.14689064025879, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8796944618225098, + "num_tokens": 755306903.0, + "step": 19792 + }, + { + "epoch": 2.5178730441419668, + "ewc_loss": 0.049132268875837326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456613401591312e-05, + "grad_norm": 31.184206008911133, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8786270618438721, + "num_tokens": 755347613.0, + "step": 19793 + }, + { + "epoch": 2.5180002544205573, + "ewc_loss": 0.04923565685749054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.461782787577249e-05, + "grad_norm": 31.1829833984375, + "learning_rate": 1e-06, + "loss": 0.3671, + "mean_token_accuracy": 0.8899009227752686, + "num_tokens": 755380541.0, + "step": 19794 + }, + { + "epoch": 2.518127464699148, + "ewc_loss": 0.049145784229040146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4572891561547294e-05, + "grad_norm": 31.11087417602539, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8683929443359375, + "num_tokens": 755422502.0, + "step": 19795 + }, + { + "epoch": 2.5182546749777384, + "ewc_loss": 0.04916462302207947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4582312107668258e-05, + "grad_norm": 31.098644256591797, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8718758225440979, + "num_tokens": 755463975.0, + "step": 19796 + }, + { + "epoch": 2.5183818852563284, + "ewc_loss": 0.0491349957883358, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.456749825796578e-05, + "grad_norm": 31.028181076049805, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8717455267906189, + "num_tokens": 755504873.0, + "step": 19797 + }, + { + "epoch": 2.5185090955349194, + "ewc_loss": 0.04926690459251404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.463345299474895e-05, + "grad_norm": 31.11372184753418, + "learning_rate": 1e-06, + "loss": 0.4989, + "mean_token_accuracy": 0.8463895916938782, + "num_tokens": 755545646.0, + "step": 19798 + }, + { + "epoch": 2.5186363058135095, + "ewc_loss": 0.04922650009393692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4613249479443766e-05, + "grad_norm": 31.165769577026367, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8578229546546936, + "num_tokens": 755577228.0, + "step": 19799 + }, + { + "epoch": 2.5187635160921005, + "ewc_loss": 0.049231451004743576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4615725124021992e-05, + "grad_norm": 31.063077926635742, + "learning_rate": 1e-06, + "loss": 0.397, + 
"mean_token_accuracy": 0.8793158531188965, + "num_tokens": 755621437.0, + "step": 19800 + }, + { + "epoch": 2.5188907263706906, + "ewc_loss": 0.049237821251153946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46189101744676e-05, + "grad_norm": 31.22700309753418, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8683028817176819, + "num_tokens": 755658266.0, + "step": 19801 + }, + { + "epoch": 2.5190179366492815, + "ewc_loss": 0.04930022731423378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4650113118696027e-05, + "grad_norm": 31.16386604309082, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8592665195465088, + "num_tokens": 755696755.0, + "step": 19802 + }, + { + "epoch": 2.5191451469278716, + "ewc_loss": 0.049206677824258804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.460333962517325e-05, + "grad_norm": 31.094812393188477, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8845191597938538, + "num_tokens": 755732434.0, + "step": 19803 + }, + { + "epoch": 2.519272357206462, + "ewc_loss": 0.04930402711033821, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4652013962622732e-05, + "grad_norm": 31.222579956054688, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8767663240432739, + "num_tokens": 755768488.0, + "step": 19804 + }, + { + "epoch": 2.5193995674850527, + "ewc_loss": 0.0491785891354084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.458929520798847e-05, + "grad_norm": 31.069068908691406, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8775573372840881, + "num_tokens": 755799637.0, + "step": 19805 + }, + { + "epoch": 2.519526777763643, + "ewc_loss": 0.049244966357946396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4622482669656165e-05, + "grad_norm": 31.1905574798584, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8554385304450989, + "num_tokens": 755834229.0, + "step": 19806 + }, + { + "epoch": 
2.5196539880422337, + "ewc_loss": 0.049167852848768234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4583925551269203e-05, + "grad_norm": 31.12224578857422, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8749105930328369, + "num_tokens": 755871776.0, + "step": 19807 + }, + { + "epoch": 2.5197811983208243, + "ewc_loss": 0.049237169325351715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4618584575364366e-05, + "grad_norm": 31.10051727294922, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8816470503807068, + "num_tokens": 755911125.0, + "step": 19808 + }, + { + "epoch": 2.519908408599415, + "ewc_loss": 0.04929519444704056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4647597456350923e-05, + "grad_norm": 31.268943786621094, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8794057369232178, + "num_tokens": 755952204.0, + "step": 19809 + }, + { + "epoch": 2.5200356188780053, + "ewc_loss": 0.04935067892074585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46753388637444e-05, + "grad_norm": 31.137636184692383, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8596786856651306, + "num_tokens": 755996702.0, + "step": 19810 + }, + { + "epoch": 2.520162829156596, + "ewc_loss": 0.049170561134815216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4585280698374845e-05, + "grad_norm": 31.060739517211914, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8869965672492981, + "num_tokens": 756033285.0, + "step": 19811 + }, + { + "epoch": 2.5202900394351864, + "ewc_loss": 0.04932460933923721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466230398567859e-05, + "grad_norm": 31.167251586914062, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.869263231754303, + "num_tokens": 756068851.0, + "step": 19812 + }, + { + "epoch": 2.520417249713777, + "ewc_loss": 0.04923452436923981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.461726217006799e-05, + "grad_norm": 31.13834571838379, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8631136417388916, + "num_tokens": 756111676.0, + "step": 19813 + }, + { + "epoch": 2.5205444599923674, + "ewc_loss": 0.049372296780347824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4686149117769673e-05, + "grad_norm": 31.208269119262695, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.8550831079483032, + "num_tokens": 756146575.0, + "step": 19814 + }, + { + "epoch": 2.520671670270958, + "ewc_loss": 0.049333807080984116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4666904209880158e-05, + "grad_norm": 31.167203903198242, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8613831400871277, + "num_tokens": 756190006.0, + "step": 19815 + }, + { + "epoch": 2.5207988805495485, + "ewc_loss": 0.04919920861721039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4599603420938365e-05, + "grad_norm": 31.01703643798828, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8822725415229797, + "num_tokens": 756232828.0, + "step": 19816 + }, + { + "epoch": 2.520926090828139, + "ewc_loss": 0.04933133348822594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4665667297085747e-05, + "grad_norm": 31.14286994934082, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8777532577514648, + "num_tokens": 756273077.0, + "step": 19817 + }, + { + "epoch": 2.5210533011067295, + "ewc_loss": 0.049301713705062866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4650857085362077e-05, + "grad_norm": 30.998098373413086, + "learning_rate": 1e-06, + "loss": 0.4752, + "mean_token_accuracy": 0.855768084526062, + "num_tokens": 756313421.0, + "step": 19818 + }, + { + "epoch": 2.52118051138532, + "ewc_loss": 0.04927647113800049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4638235117890872e-05, + "grad_norm": 31.159265518188477, + "learning_rate": 1e-06, + "loss": 0.436, + 
"mean_token_accuracy": 0.8654335141181946, + "num_tokens": 756356461.0, + "step": 19819 + }, + { + "epoch": 2.5213077216639106, + "ewc_loss": 0.04941119626164436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4705597752472386e-05, + "grad_norm": 31.120555877685547, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.867530107498169, + "num_tokens": 756388309.0, + "step": 19820 + }, + { + "epoch": 2.521434931942501, + "ewc_loss": 0.04932405799627304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4662029318278655e-05, + "grad_norm": 31.070465087890625, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8828167915344238, + "num_tokens": 756426655.0, + "step": 19821 + }, + { + "epoch": 2.521562142221091, + "ewc_loss": 0.049344874918460846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4672437575645745e-05, + "grad_norm": 31.205345153808594, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8681344985961914, + "num_tokens": 756464961.0, + "step": 19822 + }, + { + "epoch": 2.521689352499682, + "ewc_loss": 0.04938500002026558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4692499209777452e-05, + "grad_norm": 31.20191764831543, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8857916593551636, + "num_tokens": 756497338.0, + "step": 19823 + }, + { + "epoch": 2.5218165627782723, + "ewc_loss": 0.04934128373861313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4670642233104445e-05, + "grad_norm": 31.213926315307617, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8764561414718628, + "num_tokens": 756535324.0, + "step": 19824 + }, + { + "epoch": 2.5219437730568632, + "ewc_loss": 0.049309905618429184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4654953449498862e-05, + "grad_norm": 31.100170135498047, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8792440295219421, + "num_tokens": 756574138.0, + "step": 19825 + }, + { + "epoch": 
2.5220709833354533, + "ewc_loss": 0.049307890236377716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4653945729369298e-05, + "grad_norm": 31.133529663085938, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8810466527938843, + "num_tokens": 756608967.0, + "step": 19826 + }, + { + "epoch": 2.522198193614044, + "ewc_loss": 0.04929989203810692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46499457716709e-05, + "grad_norm": 31.178529739379883, + "learning_rate": 1e-06, + "loss": 0.4946, + "mean_token_accuracy": 0.8501269817352295, + "num_tokens": 756653882.0, + "step": 19827 + }, + { + "epoch": 2.5223254038926344, + "ewc_loss": 0.04940883815288544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4704419047338888e-05, + "grad_norm": 31.153841018676758, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8657643795013428, + "num_tokens": 756695638.0, + "step": 19828 + }, + { + "epoch": 2.522452614171225, + "ewc_loss": 0.0493677593767643, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4683879018994048e-05, + "grad_norm": 31.15660285949707, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8635669350624084, + "num_tokens": 756734964.0, + "step": 19829 + }, + { + "epoch": 2.5225798244498154, + "ewc_loss": 0.04931550845503807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4657754693180323e-05, + "grad_norm": 31.1887264251709, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8755309581756592, + "num_tokens": 756768454.0, + "step": 19830 + }, + { + "epoch": 2.522707034728406, + "ewc_loss": 0.049241747707128525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.462087468302343e-05, + "grad_norm": 30.9822940826416, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.859889030456543, + "num_tokens": 756801696.0, + "step": 19831 + }, + { + "epoch": 2.5228342450069965, + "ewc_loss": 0.049382880330085754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.469144055794459e-05, + "grad_norm": 31.07278060913086, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.871601939201355, + "num_tokens": 756839874.0, + "step": 19832 + }, + { + "epoch": 2.522961455285587, + "ewc_loss": 0.04940423369407654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4702116206754e-05, + "grad_norm": 31.081012725830078, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.860296368598938, + "num_tokens": 756884179.0, + "step": 19833 + }, + { + "epoch": 2.5230886655641775, + "ewc_loss": 0.04938999563455582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4694998501217924e-05, + "grad_norm": 31.202377319335938, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8726838231086731, + "num_tokens": 756922725.0, + "step": 19834 + }, + { + "epoch": 2.523215875842768, + "ewc_loss": 0.04937078431248665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46853924181778e-05, + "grad_norm": 31.10378074645996, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.865754246711731, + "num_tokens": 756956080.0, + "step": 19835 + }, + { + "epoch": 2.5233430861213586, + "ewc_loss": 0.049339186400175095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46695926762186e-05, + "grad_norm": 31.12657356262207, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.861189067363739, + "num_tokens": 756995137.0, + "step": 19836 + }, + { + "epoch": 2.523470296399949, + "ewc_loss": 0.04944925755262375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4724628019612283e-05, + "grad_norm": 31.141983032226562, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8765923976898193, + "num_tokens": 757035300.0, + "step": 19837 + }, + { + "epoch": 2.5235975066785397, + "ewc_loss": 0.049344293773174286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4672146537341177e-05, + "grad_norm": 31.06296157836914, + "learning_rate": 1e-06, + "loss": 0.499, + "mean_token_accuracy": 
0.8473493456840515, + "num_tokens": 757068789.0, + "step": 19838 + }, + { + "epoch": 2.52372471695713, + "ewc_loss": 0.049408767372369766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.470438448654022e-05, + "grad_norm": 31.188945770263672, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8650527000427246, + "num_tokens": 757102913.0, + "step": 19839 + }, + { + "epoch": 2.5238519272357207, + "ewc_loss": 0.0493786595761776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4689330530236475e-05, + "grad_norm": 31.112051010131836, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8715232014656067, + "num_tokens": 757138932.0, + "step": 19840 + }, + { + "epoch": 2.5239791375143112, + "ewc_loss": 0.04938323050737381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.469161518092733e-05, + "grad_norm": 31.02542495727539, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8740915060043335, + "num_tokens": 757179181.0, + "step": 19841 + }, + { + "epoch": 2.5241063477929018, + "ewc_loss": 0.04944596812129021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4722983653191477e-05, + "grad_norm": 31.1978702545166, + "learning_rate": 1e-06, + "loss": 0.5118, + "mean_token_accuracy": 0.8403946161270142, + "num_tokens": 757224209.0, + "step": 19842 + }, + { + "epoch": 2.5242335580714923, + "ewc_loss": 0.049436505883932114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4718252461752854e-05, + "grad_norm": 30.978567123413086, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8829190731048584, + "num_tokens": 757262398.0, + "step": 19843 + }, + { + "epoch": 2.524360768350083, + "ewc_loss": 0.0493609681725502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.468048478476703e-05, + "grad_norm": 31.279747009277344, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8606871366500854, + "num_tokens": 757301744.0, + "step": 19844 + }, + { + "epoch": 2.5244879786286734, + "ewc_loss": 
0.049457404762506485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4728702555876225e-05, + "grad_norm": 30.98358917236328, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8746485114097595, + "num_tokens": 757338273.0, + "step": 19845 + }, + { + "epoch": 2.524615188907264, + "ewc_loss": 0.04931050166487694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.465524994477164e-05, + "grad_norm": 31.166677474975586, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8624082207679749, + "num_tokens": 757383169.0, + "step": 19846 + }, + { + "epoch": 2.524742399185854, + "ewc_loss": 0.04946589469909668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47329480771441e-05, + "grad_norm": 31.023454666137695, + "learning_rate": 1e-06, + "loss": 0.4646, + "mean_token_accuracy": 0.8590315580368042, + "num_tokens": 757420998.0, + "step": 19847 + }, + { + "epoch": 2.524869609464445, + "ewc_loss": 0.04941646009683609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4708229830139317e-05, + "grad_norm": 31.125089645385742, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8747597932815552, + "num_tokens": 757463281.0, + "step": 19848 + }, + { + "epoch": 2.524996819743035, + "ewc_loss": 0.04948895424604416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4744476831983775e-05, + "grad_norm": 30.947532653808594, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8640585541725159, + "num_tokens": 757495022.0, + "step": 19849 + }, + { + "epoch": 2.525124030021626, + "ewc_loss": 0.049365587532520294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.468279308232013e-05, + "grad_norm": 31.054960250854492, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8758566975593567, + "num_tokens": 757536801.0, + "step": 19850 + }, + { + "epoch": 2.525251240300216, + "ewc_loss": 0.04951325058937073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4756625862210058e-05, + "grad_norm": 
30.954252243041992, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8861334323883057, + "num_tokens": 757573419.0, + "step": 19851 + }, + { + "epoch": 2.5253784505788066, + "ewc_loss": 0.04949893429875374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47494663199177e-05, + "grad_norm": 31.160465240478516, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.865841269493103, + "num_tokens": 757610773.0, + "step": 19852 + }, + { + "epoch": 2.525505660857397, + "ewc_loss": 0.049619682133197784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4809840397210792e-05, + "grad_norm": 31.10196876525879, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8808746337890625, + "num_tokens": 757648067.0, + "step": 19853 + }, + { + "epoch": 2.5256328711359877, + "ewc_loss": 0.049464937299489975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4732467863941565e-05, + "grad_norm": 31.208837509155273, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.875800371170044, + "num_tokens": 757681598.0, + "step": 19854 + }, + { + "epoch": 2.525760081414578, + "ewc_loss": 0.04958738759160042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4793693228275515e-05, + "grad_norm": 31.214078903198242, + "learning_rate": 1e-06, + "loss": 0.4971, + "mean_token_accuracy": 0.852375328540802, + "num_tokens": 757724752.0, + "step": 19855 + }, + { + "epoch": 2.5258872916931687, + "ewc_loss": 0.049497898668050766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4748949726927094e-05, + "grad_norm": 31.089969635009766, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8822651505470276, + "num_tokens": 757763801.0, + "step": 19856 + }, + { + "epoch": 2.5260145019717593, + "ewc_loss": 0.04957000911235809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4785003915894777e-05, + "grad_norm": 31.160171508789062, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8693548440933228, + 
"num_tokens": 757799235.0, + "step": 19857 + }, + { + "epoch": 2.52614171225035, + "ewc_loss": 0.04947832226753235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4739161744946614e-05, + "grad_norm": 31.138385772705078, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8752231597900391, + "num_tokens": 757839695.0, + "step": 19858 + }, + { + "epoch": 2.5262689225289403, + "ewc_loss": 0.04952402785420418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.476201370882336e-05, + "grad_norm": 31.160079956054688, + "learning_rate": 1e-06, + "loss": 0.4739, + "mean_token_accuracy": 0.8574961423873901, + "num_tokens": 757877152.0, + "step": 19859 + }, + { + "epoch": 2.526396132807531, + "ewc_loss": 0.04941844940185547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4709224817343056e-05, + "grad_norm": 31.17978286743164, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8688793778419495, + "num_tokens": 757916173.0, + "step": 19860 + }, + { + "epoch": 2.5265233430861214, + "ewc_loss": 0.04949898272752762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.474949178576935e-05, + "grad_norm": 31.164472579956055, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.876473069190979, + "num_tokens": 757957271.0, + "step": 19861 + }, + { + "epoch": 2.526650553364712, + "ewc_loss": 0.04938830807805061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.469415449013468e-05, + "grad_norm": 31.114656448364258, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8637760877609253, + "num_tokens": 757995553.0, + "step": 19862 + }, + { + "epoch": 2.5267777636433024, + "ewc_loss": 0.049391716718673706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46958588832058e-05, + "grad_norm": 31.176172256469727, + "learning_rate": 1e-06, + "loss": 0.4621, + "mean_token_accuracy": 0.8598853349685669, + "num_tokens": 758035095.0, + "step": 19863 + }, + { + "epoch": 2.526904973921893, + "ewc_loss": 0.04937275871634483, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4686380129423924e-05, + "grad_norm": 31.164325714111328, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8687286376953125, + "num_tokens": 758071185.0, + "step": 19864 + }, + { + "epoch": 2.5270321842004835, + "ewc_loss": 0.04945019632577896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4725097318878397e-05, + "grad_norm": 31.333236694335938, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8721485137939453, + "num_tokens": 758110251.0, + "step": 19865 + }, + { + "epoch": 2.527159394479074, + "ewc_loss": 0.04934496060013771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4672479412402026e-05, + "grad_norm": 31.257850646972656, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8547647595405579, + "num_tokens": 758144385.0, + "step": 19866 + }, + { + "epoch": 2.5272866047576645, + "ewc_loss": 0.04935410991311073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4677054170751944e-05, + "grad_norm": 31.22998046875, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8815048933029175, + "num_tokens": 758185935.0, + "step": 19867 + }, + { + "epoch": 2.527413815036255, + "ewc_loss": 0.04930462688207626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4652314095874317e-05, + "grad_norm": 31.166004180908203, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8744546175003052, + "num_tokens": 758226688.0, + "step": 19868 + }, + { + "epoch": 2.5275410253148456, + "ewc_loss": 0.04929428547620773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4647142709000036e-05, + "grad_norm": 31.100317001342773, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8758409023284912, + "num_tokens": 758263866.0, + "step": 19869 + }, + { + "epoch": 2.5276682355934357, + "ewc_loss": 0.049332279711961746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466614023433067e-05, + "grad_norm": 31.21858787536621, + 
"learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8759348392486572, + "num_tokens": 758303607.0, + "step": 19870 + }, + { + "epoch": 2.5277954458720266, + "ewc_loss": 0.04934810847043991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4674054657225497e-05, + "grad_norm": 31.171977996826172, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.874098539352417, + "num_tokens": 758340120.0, + "step": 19871 + }, + { + "epoch": 2.5279226561506167, + "ewc_loss": 0.049333587288856506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4666793251526542e-05, + "grad_norm": 31.11771583557129, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8695826530456543, + "num_tokens": 758373792.0, + "step": 19872 + }, + { + "epoch": 2.5280498664292077, + "ewc_loss": 0.049361176788806915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4680588467163034e-05, + "grad_norm": 31.314929962158203, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.85809725522995, + "num_tokens": 758418157.0, + "step": 19873 + }, + { + "epoch": 2.528177076707798, + "ewc_loss": 0.049339812248945236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466990554239601e-05, + "grad_norm": 31.13370132446289, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8632804155349731, + "num_tokens": 758454042.0, + "step": 19874 + }, + { + "epoch": 2.5283042869863888, + "ewc_loss": 0.0491802841424942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4590142857050523e-05, + "grad_norm": 31.09006118774414, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8718006610870361, + "num_tokens": 758495305.0, + "step": 19875 + }, + { + "epoch": 2.528431497264979, + "ewc_loss": 0.04940269514918327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47013467742363e-05, + "grad_norm": 31.368234634399414, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8778822422027588, + "num_tokens": 758532646.0, + 
"step": 19876 + }, + { + "epoch": 2.5285587075435694, + "ewc_loss": 0.049274276942014694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4637138267280534e-05, + "grad_norm": 31.104097366333008, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8667173385620117, + "num_tokens": 758568329.0, + "step": 19877 + }, + { + "epoch": 2.52868591782216, + "ewc_loss": 0.04917101189494133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4585506253060885e-05, + "grad_norm": 31.124591827392578, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8740681409835815, + "num_tokens": 758608246.0, + "step": 19878 + }, + { + "epoch": 2.5288131281007504, + "ewc_loss": 0.04940694570541382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4703473172849044e-05, + "grad_norm": 31.03813362121582, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8798661231994629, + "num_tokens": 758645356.0, + "step": 19879 + }, + { + "epoch": 2.528940338379341, + "ewc_loss": 0.04930025339126587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.465012585162185e-05, + "grad_norm": 31.06119155883789, + "learning_rate": 1e-06, + "loss": 0.3697, + "mean_token_accuracy": 0.8851757049560547, + "num_tokens": 758685657.0, + "step": 19880 + }, + { + "epoch": 2.5290675486579315, + "ewc_loss": 0.04937306046485901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4686531105544418e-05, + "grad_norm": 31.180875778198242, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.874240517616272, + "num_tokens": 758718843.0, + "step": 19881 + }, + { + "epoch": 2.529194758936522, + "ewc_loss": 0.049289409071207047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4644705263199285e-05, + "grad_norm": 30.964885711669922, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8744027018547058, + "num_tokens": 758757946.0, + "step": 19882 + }, + { + "epoch": 2.5293219692151125, + "ewc_loss": 0.04942703619599342, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.4713517632335424e-05, + "grad_norm": 31.193693161010742, + "learning_rate": 1e-06, + "loss": 0.4859, + "mean_token_accuracy": 0.8509798645973206, + "num_tokens": 758792832.0, + "step": 19883 + }, + { + "epoch": 2.529449179493703, + "ewc_loss": 0.049433380365371704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.471668994985521e-05, + "grad_norm": 31.1732120513916, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8630121946334839, + "num_tokens": 758826062.0, + "step": 19884 + }, + { + "epoch": 2.5295763897722936, + "ewc_loss": 0.0493544302880764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.467721606080886e-05, + "grad_norm": 31.225954055786133, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8789337873458862, + "num_tokens": 758860940.0, + "step": 19885 + }, + { + "epoch": 2.529703600050884, + "ewc_loss": 0.04938533157110214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4692666556802578e-05, + "grad_norm": 31.06656837463379, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8683921098709106, + "num_tokens": 758900740.0, + "step": 19886 + }, + { + "epoch": 2.5298308103294747, + "ewc_loss": 0.049321480095386505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466073965479154e-05, + "grad_norm": 31.19703483581543, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8681724667549133, + "num_tokens": 758942786.0, + "step": 19887 + }, + { + "epoch": 2.529958020608065, + "ewc_loss": 0.04940027371048927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.470013714628294e-05, + "grad_norm": 30.982585906982422, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8794159889221191, + "num_tokens": 758983376.0, + "step": 19888 + }, + { + "epoch": 2.5300852308866557, + "ewc_loss": 0.04927745461463928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4638728064019233e-05, + "grad_norm": 31.238422393798828, + "learning_rate": 1e-06, + "loss": 
0.4411, + "mean_token_accuracy": 0.8583651185035706, + "num_tokens": 759024007.0, + "step": 19889 + }, + { + "epoch": 2.5302124411652462, + "ewc_loss": 0.0494808629155159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.474043139955029e-05, + "grad_norm": 31.00881004333496, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8778975009918213, + "num_tokens": 759061611.0, + "step": 19890 + }, + { + "epoch": 2.5303396514438368, + "ewc_loss": 0.04931651055812836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4658254915266298e-05, + "grad_norm": 31.140087127685547, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8544104099273682, + "num_tokens": 759101942.0, + "step": 19891 + }, + { + "epoch": 2.5304668617224273, + "ewc_loss": 0.04943205043673515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4716026018722914e-05, + "grad_norm": 31.113157272338867, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8881287574768066, + "num_tokens": 759140281.0, + "step": 19892 + }, + { + "epoch": 2.530594072001018, + "ewc_loss": 0.049320921301841736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46604613494128e-05, + "grad_norm": 31.210676193237305, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8703834414482117, + "num_tokens": 759174319.0, + "step": 19893 + }, + { + "epoch": 2.5307212822796084, + "ewc_loss": 0.04945838451385498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.472919186402578e-05, + "grad_norm": 31.084808349609375, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8776767253875732, + "num_tokens": 759208981.0, + "step": 19894 + }, + { + "epoch": 2.5308484925581984, + "ewc_loss": 0.04939260333776474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4696300897630863e-05, + "grad_norm": 31.180477142333984, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8729545474052429, + "num_tokens": 759240559.0, + "step": 19895 + }, + { + 
"epoch": 2.5309757028367894, + "ewc_loss": 0.04949698597192764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4748493160586804e-05, + "grad_norm": 31.11332893371582, + "learning_rate": 1e-06, + "loss": 0.3991, + "mean_token_accuracy": 0.8825405240058899, + "num_tokens": 759280550.0, + "step": 19896 + }, + { + "epoch": 2.5311029131153795, + "ewc_loss": 0.04939412698149681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.469706305419095e-05, + "grad_norm": 31.125484466552734, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8631076216697693, + "num_tokens": 759321298.0, + "step": 19897 + }, + { + "epoch": 2.5312301233939705, + "ewc_loss": 0.049390289932489395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.469514583935961e-05, + "grad_norm": 30.98088836669922, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8835339546203613, + "num_tokens": 759354627.0, + "step": 19898 + }, + { + "epoch": 2.5313573336725606, + "ewc_loss": 0.049434930086135864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.471746483934112e-05, + "grad_norm": 31.232080459594727, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8651109933853149, + "num_tokens": 759388129.0, + "step": 19899 + }, + { + "epoch": 2.5314845439511515, + "ewc_loss": 0.049504637718200684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.475231849530246e-05, + "grad_norm": 31.139816284179688, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8797695636749268, + "num_tokens": 759423667.0, + "step": 19900 + }, + { + "epoch": 2.5316117542297416, + "ewc_loss": 0.049378786236047745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4689392375876196e-05, + "grad_norm": 31.152660369873047, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8622938394546509, + "num_tokens": 759466586.0, + "step": 19901 + }, + { + "epoch": 2.531738964508332, + "ewc_loss": 0.04937906563282013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.468953243806027e-05, + "grad_norm": 31.129581451416016, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8646713495254517, + "num_tokens": 759504214.0, + "step": 19902 + }, + { + "epoch": 2.5318661747869227, + "ewc_loss": 0.049469612538814545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4734807084314525e-05, + "grad_norm": 31.21945571899414, + "learning_rate": 1e-06, + "loss": 0.4761, + "mean_token_accuracy": 0.8572613000869751, + "num_tokens": 759543865.0, + "step": 19903 + }, + { + "epoch": 2.531993385065513, + "ewc_loss": 0.04936365783214569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4681829017936252e-05, + "grad_norm": 31.10121726989746, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8658971786499023, + "num_tokens": 759581917.0, + "step": 19904 + }, + { + "epoch": 2.5321205953441037, + "ewc_loss": 0.04938479885458946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4692399165360257e-05, + "grad_norm": 31.177427291870117, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8732469081878662, + "num_tokens": 759617641.0, + "step": 19905 + }, + { + "epoch": 2.5322478056226942, + "ewc_loss": 0.04943443462252617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4717217456782237e-05, + "grad_norm": 31.075992584228516, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8817061185836792, + "num_tokens": 759649097.0, + "step": 19906 + }, + { + "epoch": 2.5323750159012848, + "ewc_loss": 0.04937697574496269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4688488338142633e-05, + "grad_norm": 31.15842056274414, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8651599884033203, + "num_tokens": 759688802.0, + "step": 19907 + }, + { + "epoch": 2.5325022261798753, + "ewc_loss": 0.04945814609527588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.472907362971455e-05, + "grad_norm": 31.080692291259766, + "learning_rate": 1e-06, + "loss": 0.3815, + 
"mean_token_accuracy": 0.8806365728378296, + "num_tokens": 759724243.0, + "step": 19908 + }, + { + "epoch": 2.532629436458466, + "ewc_loss": 0.049392230808734894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.46961153607117e-05, + "grad_norm": 31.149309158325195, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8636916279792786, + "num_tokens": 759764646.0, + "step": 19909 + }, + { + "epoch": 2.5327566467370564, + "ewc_loss": 0.049404848366975784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4702423615963198e-05, + "grad_norm": 31.08540153503418, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.873479962348938, + "num_tokens": 759806939.0, + "step": 19910 + }, + { + "epoch": 2.532883857015647, + "ewc_loss": 0.04944101721048355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.472050800861325e-05, + "grad_norm": 31.09551429748535, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8724330067634583, + "num_tokens": 759848138.0, + "step": 19911 + }, + { + "epoch": 2.5330110672942374, + "ewc_loss": 0.049482427537441254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4741213564993814e-05, + "grad_norm": 31.150020599365234, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8670767545700073, + "num_tokens": 759881877.0, + "step": 19912 + }, + { + "epoch": 2.533138277572828, + "ewc_loss": 0.049422334879636765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4711167498026043e-05, + "grad_norm": 31.085887908935547, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8810524940490723, + "num_tokens": 759922625.0, + "step": 19913 + }, + { + "epoch": 2.5332654878514185, + "ewc_loss": 0.049441613256931305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4720806322875433e-05, + "grad_norm": 31.15973663330078, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8821841478347778, + "num_tokens": 759952429.0, + "step": 19914 + }, + { + "epoch": 
2.533392698130009, + "ewc_loss": 0.04944504424929619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4722521629882976e-05, + "grad_norm": 31.13888931274414, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8735304474830627, + "num_tokens": 759987678.0, + "step": 19915 + }, + { + "epoch": 2.5335199084085995, + "ewc_loss": 0.04946902021765709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4734510589041747e-05, + "grad_norm": 31.22804832458496, + "learning_rate": 1e-06, + "loss": 0.4671, + "mean_token_accuracy": 0.8566081523895264, + "num_tokens": 760023943.0, + "step": 19916 + }, + { + "epoch": 2.53364711868719, + "ewc_loss": 0.049477607011795044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4738803404034115e-05, + "grad_norm": 31.082374572753906, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8734902739524841, + "num_tokens": 760068229.0, + "step": 19917 + }, + { + "epoch": 2.5337743289657806, + "ewc_loss": 0.049336858093738556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466842852300033e-05, + "grad_norm": 30.997295379638672, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8576638102531433, + "num_tokens": 760109440.0, + "step": 19918 + }, + { + "epoch": 2.533901539244371, + "ewc_loss": 0.04948168620467186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.474084249115549e-05, + "grad_norm": 31.015689849853516, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8720793128013611, + "num_tokens": 760146616.0, + "step": 19919 + }, + { + "epoch": 2.534028749522961, + "ewc_loss": 0.049523163586854935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.476158260833472e-05, + "grad_norm": 31.149768829345703, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8539237976074219, + "num_tokens": 760188708.0, + "step": 19920 + }, + { + "epoch": 2.534155959801552, + "ewc_loss": 0.04949450120329857, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4747250790824182e-05, + "grad_norm": 31.098735809326172, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.886122465133667, + "num_tokens": 760231355.0, + "step": 19921 + }, + { + "epoch": 2.5342831700801423, + "ewc_loss": 0.04952186718583107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4760933229117654e-05, + "grad_norm": 31.200603485107422, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8574438691139221, + "num_tokens": 760268316.0, + "step": 19922 + }, + { + "epoch": 2.5344103803587332, + "ewc_loss": 0.04954523965716362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4772620236035436e-05, + "grad_norm": 31.193727493286133, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.861644446849823, + "num_tokens": 760305420.0, + "step": 19923 + }, + { + "epoch": 2.5345375906373233, + "ewc_loss": 0.0495186410844326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.475931978551671e-05, + "grad_norm": 31.165557861328125, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8897566795349121, + "num_tokens": 760342195.0, + "step": 19924 + }, + { + "epoch": 2.534664800915914, + "ewc_loss": 0.04947848618030548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4739243599469773e-05, + "grad_norm": 31.15001678466797, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8843784332275391, + "num_tokens": 760375394.0, + "step": 19925 + }, + { + "epoch": 2.5347920111945044, + "ewc_loss": 0.04945661500096321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4728307835175656e-05, + "grad_norm": 31.156394958496094, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8591862320899963, + "num_tokens": 760411629.0, + "step": 19926 + }, + { + "epoch": 2.534919221473095, + "ewc_loss": 0.049466948956251144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4733473765081726e-05, + "grad_norm": 31.18438148498535, + "learning_rate": 1e-06, + "loss": 0.4137, + 
"mean_token_accuracy": 0.8725236654281616, + "num_tokens": 760456212.0, + "step": 19927 + }, + { + "epoch": 2.5350464317516854, + "ewc_loss": 0.049440521746873856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.472026062605437e-05, + "grad_norm": 31.116809844970703, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8616499900817871, + "num_tokens": 760496385.0, + "step": 19928 + }, + { + "epoch": 2.535173642030276, + "ewc_loss": 0.04945285990834236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4726430638111196e-05, + "grad_norm": 31.29192543029785, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8698071837425232, + "num_tokens": 760530426.0, + "step": 19929 + }, + { + "epoch": 2.5353008523088665, + "ewc_loss": 0.0493941530585289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4697075787116773e-05, + "grad_norm": 31.008230209350586, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8819355964660645, + "num_tokens": 760568174.0, + "step": 19930 + }, + { + "epoch": 2.535428062587457, + "ewc_loss": 0.04945830628275871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4729153665248305e-05, + "grad_norm": 31.30657386779785, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8886895179748535, + "num_tokens": 760608097.0, + "step": 19931 + }, + { + "epoch": 2.5355552728660475, + "ewc_loss": 0.04943567141890526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4717835913179442e-05, + "grad_norm": 31.167909622192383, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8697738647460938, + "num_tokens": 760643018.0, + "step": 19932 + }, + { + "epoch": 2.535682483144638, + "ewc_loss": 0.049373749643564224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4686874894541688e-05, + "grad_norm": 31.311786651611328, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8786823153495789, + "num_tokens": 760676553.0, + "step": 19933 + }, + { + "epoch": 
2.5358096934232286, + "ewc_loss": 0.0494878813624382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.474394023010973e-05, + "grad_norm": 31.235261917114258, + "learning_rate": 1e-06, + "loss": 0.3747, + "mean_token_accuracy": 0.8854372501373291, + "num_tokens": 760708850.0, + "step": 19934 + }, + { + "epoch": 2.535936903701819, + "ewc_loss": 0.04939649626612663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.469824903528206e-05, + "grad_norm": 31.334548950195312, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8671025037765503, + "num_tokens": 760746088.0, + "step": 19935 + }, + { + "epoch": 2.5360641139804097, + "ewc_loss": 0.04938509315252304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4692546503501944e-05, + "grad_norm": 31.27248764038086, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8769487142562866, + "num_tokens": 760785716.0, + "step": 19936 + }, + { + "epoch": 2.536191324259, + "ewc_loss": 0.049382321536540985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4691160433576442e-05, + "grad_norm": 31.228391647338867, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8799787759780884, + "num_tokens": 760821301.0, + "step": 19937 + }, + { + "epoch": 2.5363185345375907, + "ewc_loss": 0.04929746314883232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.464873068674933e-05, + "grad_norm": 31.09958839416504, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8602288961410522, + "num_tokens": 760861200.0, + "step": 19938 + }, + { + "epoch": 2.5364457448161812, + "ewc_loss": 0.049345195293426514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4672597646713257e-05, + "grad_norm": 31.22555923461914, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8693882822990417, + "num_tokens": 760896398.0, + "step": 19939 + }, + { + "epoch": 2.5365729550947718, + "ewc_loss": 0.04946965351700783, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4734827093197964e-05, + "grad_norm": 31.143375396728516, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8524148464202881, + "num_tokens": 760937300.0, + "step": 19940 + }, + { + "epoch": 2.5367001653733623, + "ewc_loss": 0.04940525442361832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4702627342776395e-05, + "grad_norm": 31.144672393798828, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8643385171890259, + "num_tokens": 760973726.0, + "step": 19941 + }, + { + "epoch": 2.536827375651953, + "ewc_loss": 0.04932103678584099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.466051773808431e-05, + "grad_norm": 31.02341651916504, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.868600606918335, + "num_tokens": 761009048.0, + "step": 19942 + }, + { + "epoch": 2.5369545859305433, + "ewc_loss": 0.04950093850493431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4750468583079055e-05, + "grad_norm": 31.129806518554688, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8681068420410156, + "num_tokens": 761050836.0, + "step": 19943 + }, + { + "epoch": 2.537081796209134, + "ewc_loss": 0.04942983388900757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.471491643518675e-05, + "grad_norm": 31.197547912597656, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8633871078491211, + "num_tokens": 761087517.0, + "step": 19944 + }, + { + "epoch": 2.537209006487724, + "ewc_loss": 0.04942057281732559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4710287107154727e-05, + "grad_norm": 31.077939987182617, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8843356370925903, + "num_tokens": 761121401.0, + "step": 19945 + }, + { + "epoch": 2.537336216766315, + "ewc_loss": 0.049392495304346085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.469624814693816e-05, + "grad_norm": 31.18299674987793, + "learning_rate": 1e-06, + "loss": 0.4504, + 
"mean_token_accuracy": 0.8588924407958984, + "num_tokens": 761154973.0, + "step": 19946 + }, + { + "epoch": 2.537463427044905, + "ewc_loss": 0.049618713557720184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480935654602945e-05, + "grad_norm": 31.194746017456055, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.878022313117981, + "num_tokens": 761194211.0, + "step": 19947 + }, + { + "epoch": 2.537590637323496, + "ewc_loss": 0.049479495733976364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4739747459534556e-05, + "grad_norm": 31.078187942504883, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.8646153807640076, + "num_tokens": 761233486.0, + "step": 19948 + }, + { + "epoch": 2.537717847602086, + "ewc_loss": 0.04958471655845642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4792358090053312e-05, + "grad_norm": 31.145299911499023, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8714504241943359, + "num_tokens": 761275846.0, + "step": 19949 + }, + { + "epoch": 2.5378450578806766, + "ewc_loss": 0.04960302263498306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4801511244731955e-05, + "grad_norm": 31.207082748413086, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8610665798187256, + "num_tokens": 761311210.0, + "step": 19950 + }, + { + "epoch": 2.537972268159267, + "ewc_loss": 0.04951481521129608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4757408027653582e-05, + "grad_norm": 31.264183044433594, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8702831268310547, + "num_tokens": 761349961.0, + "step": 19951 + }, + { + "epoch": 2.5380994784378577, + "ewc_loss": 0.04959123954176903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479561953805387e-05, + "grad_norm": 31.094738006591797, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8778040409088135, + "num_tokens": 761386993.0, + "step": 19952 + }, + { + "epoch": 
2.538226688716448, + "ewc_loss": 0.049296677112579346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4648337785038166e-05, + "grad_norm": 31.014568328857422, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8610017895698547, + "num_tokens": 761426910.0, + "step": 19953 + }, + { + "epoch": 2.5383538989950387, + "ewc_loss": 0.04958367720246315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47918378590839e-05, + "grad_norm": 31.20035743713379, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8733189702033997, + "num_tokens": 761466633.0, + "step": 19954 + }, + { + "epoch": 2.5384811092736292, + "ewc_loss": 0.04952874407172203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.476437293807976e-05, + "grad_norm": 31.016983032226562, + "learning_rate": 1e-06, + "loss": 0.4604, + "mean_token_accuracy": 0.8570497035980225, + "num_tokens": 761497670.0, + "step": 19955 + }, + { + "epoch": 2.5386083195522198, + "ewc_loss": 0.04951011389493942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4755056074354798e-05, + "grad_norm": 30.94654083251953, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.867297351360321, + "num_tokens": 761534626.0, + "step": 19956 + }, + { + "epoch": 2.5387355298308103, + "ewc_loss": 0.049601346254348755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480067269061692e-05, + "grad_norm": 31.105417251586914, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8777867555618286, + "num_tokens": 761581125.0, + "step": 19957 + }, + { + "epoch": 2.538862740109401, + "ewc_loss": 0.04961075261235237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480537659721449e-05, + "grad_norm": 30.998733520507812, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8714337348937988, + "num_tokens": 761616533.0, + "step": 19958 + }, + { + "epoch": 2.5389899503879914, + "ewc_loss": 0.04960257560014725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.480128750903532e-05, + "grad_norm": 31.218128204345703, + "learning_rate": 1e-06, + "loss": 0.4624, + "mean_token_accuracy": 0.8605812788009644, + "num_tokens": 761650615.0, + "step": 19959 + }, + { + "epoch": 2.539117160666582, + "ewc_loss": 0.04960105940699577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480052899045404e-05, + "grad_norm": 31.1472110748291, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8824369311332703, + "num_tokens": 761687020.0, + "step": 19960 + }, + { + "epoch": 2.5392443709451724, + "ewc_loss": 0.04952564463019371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.476282315910794e-05, + "grad_norm": 31.180017471313477, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8864145874977112, + "num_tokens": 761721653.0, + "step": 19961 + }, + { + "epoch": 2.539371581223763, + "ewc_loss": 0.04956210032105446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4781049432931468e-05, + "grad_norm": 31.133058547973633, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8789718151092529, + "num_tokens": 761758377.0, + "step": 19962 + }, + { + "epoch": 2.5394987915023535, + "ewc_loss": 0.04954990744590759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4774953999440186e-05, + "grad_norm": 31.195636749267578, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8712531328201294, + "num_tokens": 761795263.0, + "step": 19963 + }, + { + "epoch": 2.539626001780944, + "ewc_loss": 0.049611277878284454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48056385316886e-05, + "grad_norm": 31.277151107788086, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8819296956062317, + "num_tokens": 761834758.0, + "step": 19964 + }, + { + "epoch": 2.5397532120595345, + "ewc_loss": 0.04944900423288345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4724502509343438e-05, + "grad_norm": 31.059450149536133, + "learning_rate": 1e-06, + "loss": 0.4431, + 
"mean_token_accuracy": 0.8636741042137146, + "num_tokens": 761880482.0, + "step": 19965 + }, + { + "epoch": 2.539880422338125, + "ewc_loss": 0.049474310129880905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4737155399634503e-05, + "grad_norm": 31.162954330444336, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8694317936897278, + "num_tokens": 761922494.0, + "step": 19966 + }, + { + "epoch": 2.5400076326167156, + "ewc_loss": 0.04944654554128647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.472327287250664e-05, + "grad_norm": 31.06768798828125, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8891822695732117, + "num_tokens": 761959462.0, + "step": 19967 + }, + { + "epoch": 2.5401348428953057, + "ewc_loss": 0.04955378174781799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4776891223154962e-05, + "grad_norm": 31.22272491455078, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8621776700019836, + "num_tokens": 761990316.0, + "step": 19968 + }, + { + "epoch": 2.5402620531738966, + "ewc_loss": 0.04955166578292847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47758325713221e-05, + "grad_norm": 31.053497314453125, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8652726411819458, + "num_tokens": 762023671.0, + "step": 19969 + }, + { + "epoch": 2.5403892634524867, + "ewc_loss": 0.049452442675828934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.472622145432979e-05, + "grad_norm": 31.067277908325195, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8761784434318542, + "num_tokens": 762056719.0, + "step": 19970 + }, + { + "epoch": 2.5405164737310777, + "ewc_loss": 0.04962528496980667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4812641640892252e-05, + "grad_norm": 31.120845794677734, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8727333545684814, + "num_tokens": 762098802.0, + "step": 19971 + }, + { + "epoch": 
2.540643684009668, + "ewc_loss": 0.0494915246963501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4745762857492082e-05, + "grad_norm": 31.16205596923828, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8740679025650024, + "num_tokens": 762134470.0, + "step": 19972 + }, + { + "epoch": 2.5407708942882588, + "ewc_loss": 0.049609649926424026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4804825443425216e-05, + "grad_norm": 31.185047149658203, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8735584020614624, + "num_tokens": 762177163.0, + "step": 19973 + }, + { + "epoch": 2.540898104566849, + "ewc_loss": 0.04958532378077507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4792661861283705e-05, + "grad_norm": 31.21526527404785, + "learning_rate": 1e-06, + "loss": 0.3926, + "mean_token_accuracy": 0.879037082195282, + "num_tokens": 762217323.0, + "step": 19974 + }, + { + "epoch": 2.5410253148454394, + "ewc_loss": 0.04957401007413864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4787004804238677e-05, + "grad_norm": 31.142013549804688, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8675216436386108, + "num_tokens": 762252144.0, + "step": 19975 + }, + { + "epoch": 2.54115252512403, + "ewc_loss": 0.04957634583115578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4788172595435753e-05, + "grad_norm": 31.26481819152832, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8647134304046631, + "num_tokens": 762282108.0, + "step": 19976 + }, + { + "epoch": 2.5412797354026204, + "ewc_loss": 0.04960283264517784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480141665728297e-05, + "grad_norm": 31.33099937438965, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8738471865653992, + "num_tokens": 762318154.0, + "step": 19977 + }, + { + "epoch": 2.541406945681211, + "ewc_loss": 0.04951425641775131, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4757127903285436e-05, + "grad_norm": 31.169540405273438, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8682067394256592, + "num_tokens": 762354704.0, + "step": 19978 + }, + { + "epoch": 2.5415341559598015, + "ewc_loss": 0.049480993300676346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4740496883168817e-05, + "grad_norm": 31.135330200195312, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.873945415019989, + "num_tokens": 762389943.0, + "step": 19979 + }, + { + "epoch": 2.541661366238392, + "ewc_loss": 0.049509186297655106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4754594051046297e-05, + "grad_norm": 31.18794059753418, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8732815384864807, + "num_tokens": 762421560.0, + "step": 19980 + }, + { + "epoch": 2.5417885765169825, + "ewc_loss": 0.04959375038743019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4796874640742317e-05, + "grad_norm": 31.096044540405273, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8775414824485779, + "num_tokens": 762458509.0, + "step": 19981 + }, + { + "epoch": 2.541915786795573, + "ewc_loss": 0.04940515384078026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4702576411073096e-05, + "grad_norm": 31.217144012451172, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8704645037651062, + "num_tokens": 762502382.0, + "step": 19982 + }, + { + "epoch": 2.5420429970741636, + "ewc_loss": 0.04963112249970436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4815561118884943e-05, + "grad_norm": 31.135229110717773, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8674731254577637, + "num_tokens": 762536611.0, + "step": 19983 + }, + { + "epoch": 2.542170207352754, + "ewc_loss": 0.0494648702442646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47324351221323e-05, + "grad_norm": 31.139423370361328, + "learning_rate": 1e-06, + "loss": 0.4162, + 
"mean_token_accuracy": 0.8707987070083618, + "num_tokens": 762577801.0, + "step": 19984 + }, + { + "epoch": 2.5422974176313446, + "ewc_loss": 0.04956207424402237, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4781036700005643e-05, + "grad_norm": 31.228317260742188, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8730031251907349, + "num_tokens": 762617413.0, + "step": 19985 + }, + { + "epoch": 2.542424627909935, + "ewc_loss": 0.049556102603673935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4778051738394424e-05, + "grad_norm": 31.058809280395508, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8799019455909729, + "num_tokens": 762654811.0, + "step": 19986 + }, + { + "epoch": 2.5425518381885257, + "ewc_loss": 0.0494302473962307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4715123799978755e-05, + "grad_norm": 31.27052116394043, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8760603666305542, + "num_tokens": 762692902.0, + "step": 19987 + }, + { + "epoch": 2.5426790484671162, + "ewc_loss": 0.049670059233903885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4835029762471095e-05, + "grad_norm": 31.111892700195312, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8715137839317322, + "num_tokens": 762731984.0, + "step": 19988 + }, + { + "epoch": 2.5428062587457068, + "ewc_loss": 0.04937942698597908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4689714337000623e-05, + "grad_norm": 31.182382583618164, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8797687888145447, + "num_tokens": 762767104.0, + "step": 19989 + }, + { + "epoch": 2.5429334690242973, + "ewc_loss": 0.049669139087200165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4834569558151998e-05, + "grad_norm": 31.21893882751465, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8600157499313354, + "num_tokens": 762804982.0, + "step": 19990 + }, + { + "epoch": 
2.543060679302888, + "ewc_loss": 0.04947672411799431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4738361389609054e-05, + "grad_norm": 31.081787109375, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8611826300621033, + "num_tokens": 762842956.0, + "step": 19991 + }, + { + "epoch": 2.5431878895814783, + "ewc_loss": 0.04953519627451897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4767598006292246e-05, + "grad_norm": 31.145299911499023, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8774376511573792, + "num_tokens": 762875445.0, + "step": 19992 + }, + { + "epoch": 2.5433150998600684, + "ewc_loss": 0.04960939288139343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4804696295177564e-05, + "grad_norm": 31.24752426147461, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8850171566009521, + "num_tokens": 762906061.0, + "step": 19993 + }, + { + "epoch": 2.5434423101386594, + "ewc_loss": 0.04962027072906494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4810135073494166e-05, + "grad_norm": 31.267244338989258, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.878703236579895, + "num_tokens": 762945550.0, + "step": 19994 + }, + { + "epoch": 2.5435695204172495, + "ewc_loss": 0.049491364508867264, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4745682821958326e-05, + "grad_norm": 31.09659194946289, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8710578680038452, + "num_tokens": 762982586.0, + "step": 19995 + }, + { + "epoch": 2.5436967306958405, + "ewc_loss": 0.04945416748523712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.472708365530707e-05, + "grad_norm": 31.20718002319336, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8732174634933472, + "num_tokens": 763013527.0, + "step": 19996 + }, + { + "epoch": 2.5438239409744305, + "ewc_loss": 0.049483828246593475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.474191387591418e-05, + "grad_norm": 31.01613998413086, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8656754493713379, + "num_tokens": 763054363.0, + "step": 19997 + }, + { + "epoch": 2.5439511512530215, + "ewc_loss": 0.0495549775660038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.477748967066873e-05, + "grad_norm": 31.256643295288086, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.866862952709198, + "num_tokens": 763096834.0, + "step": 19998 + }, + { + "epoch": 2.5440783615316116, + "ewc_loss": 0.049581378698349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479069007677026e-05, + "grad_norm": 31.207653045654297, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8708387017250061, + "num_tokens": 763139611.0, + "step": 19999 + }, + { + "epoch": 2.544205571810202, + "ewc_loss": 0.049412067979574203, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4706034309929237e-05, + "grad_norm": 31.159509658813477, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8718978762626648, + "num_tokens": 763181085.0, + "step": 20000 + }, + { + "epoch": 2.5443327820887927, + "ewc_loss": 0.049553800374269485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.477690031810198e-05, + "grad_norm": 31.151899337768555, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8689422011375427, + "num_tokens": 763219811.0, + "step": 20001 + }, + { + "epoch": 2.544459992367383, + "ewc_loss": 0.049481749534606934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4740875232964754e-05, + "grad_norm": 31.168180465698242, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8601253032684326, + "num_tokens": 763257745.0, + "step": 20002 + }, + { + "epoch": 2.5445872026459737, + "ewc_loss": 0.04955299198627472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4776496502454393e-05, + "grad_norm": 31.11681365966797, + "learning_rate": 1e-06, + "loss": 0.4639, + 
"mean_token_accuracy": 0.8555026054382324, + "num_tokens": 763299475.0, + "step": 20003 + }, + { + "epoch": 2.5447144129245642, + "ewc_loss": 0.04944651946425438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4723260139580816e-05, + "grad_norm": 31.168529510498047, + "learning_rate": 1e-06, + "loss": 0.4846, + "mean_token_accuracy": 0.8487229347229004, + "num_tokens": 763333960.0, + "step": 20004 + }, + { + "epoch": 2.5448416232031548, + "ewc_loss": 0.04955311864614487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4776560167083517e-05, + "grad_norm": 31.112545013427734, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8619327545166016, + "num_tokens": 763378646.0, + "step": 20005 + }, + { + "epoch": 2.5449688334817453, + "ewc_loss": 0.04958988353610039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4794941055006348e-05, + "grad_norm": 31.192712783813477, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8750704526901245, + "num_tokens": 763412441.0, + "step": 20006 + }, + { + "epoch": 2.545096043760336, + "ewc_loss": 0.04957539960741997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.478769965819083e-05, + "grad_norm": 31.232784271240234, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8751378059387207, + "num_tokens": 763446713.0, + "step": 20007 + }, + { + "epoch": 2.5452232540389264, + "ewc_loss": 0.04946589469909668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47329480771441e-05, + "grad_norm": 31.167016983032227, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8808726072311401, + "num_tokens": 763478686.0, + "step": 20008 + }, + { + "epoch": 2.545350464317517, + "ewc_loss": 0.04949598014354706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4747989300522022e-05, + "grad_norm": 31.182647705078125, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8868231773376465, + "num_tokens": 763517258.0, + "step": 20009 + }, + { + "epoch": 
2.5454776745961074, + "ewc_loss": 0.049441490322351456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4720744477235712e-05, + "grad_norm": 31.19050407409668, + "learning_rate": 1e-06, + "loss": 0.4318, + "mean_token_accuracy": 0.8657636046409607, + "num_tokens": 763553229.0, + "step": 20010 + }, + { + "epoch": 2.545604884874698, + "ewc_loss": 0.04951111972332001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.475555993441958e-05, + "grad_norm": 31.096982955932617, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8778578639030457, + "num_tokens": 763594274.0, + "step": 20011 + }, + { + "epoch": 2.5457320951532885, + "ewc_loss": 0.049559298902750015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4779648811090738e-05, + "grad_norm": 31.27490997314453, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8635227084159851, + "num_tokens": 763632162.0, + "step": 20012 + }, + { + "epoch": 2.545859305431879, + "ewc_loss": 0.04957007244229317, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.478503665770404e-05, + "grad_norm": 31.10828399658203, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8715575933456421, + "num_tokens": 763666797.0, + "step": 20013 + }, + { + "epoch": 2.5459865157104695, + "ewc_loss": 0.04946897551417351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47344869421795e-05, + "grad_norm": 31.30286407470703, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8686161041259766, + "num_tokens": 763705475.0, + "step": 20014 + }, + { + "epoch": 2.54611372598906, + "ewc_loss": 0.0495498925447464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.477494672348257e-05, + "grad_norm": 31.151615142822266, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8870742321014404, + "num_tokens": 763739434.0, + "step": 20015 + }, + { + "epoch": 2.5462409362676506, + "ewc_loss": 0.0494220070540905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4711003788979724e-05, + "grad_norm": 31.259475708007812, + "learning_rate": 1e-06, + "loss": 0.4567, + "mean_token_accuracy": 0.8584388494491577, + "num_tokens": 763773262.0, + "step": 20016 + }, + { + "epoch": 2.546368146546241, + "ewc_loss": 0.049570973962545395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.478548776707612e-05, + "grad_norm": 31.081771850585938, + "learning_rate": 1e-06, + "loss": 0.4588, + "mean_token_accuracy": 0.8598537445068359, + "num_tokens": 763812573.0, + "step": 20017 + }, + { + "epoch": 2.546495356824831, + "ewc_loss": 0.04952326416969299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4761631721048616e-05, + "grad_norm": 31.287609100341797, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8745749592781067, + "num_tokens": 763848555.0, + "step": 20018 + }, + { + "epoch": 2.546622567103422, + "ewc_loss": 0.049506690353155136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.475334440532606e-05, + "grad_norm": 31.053329467773438, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8689126372337341, + "num_tokens": 763884608.0, + "step": 20019 + }, + { + "epoch": 2.5467497773820122, + "ewc_loss": 0.04949165880680084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4745830160100013e-05, + "grad_norm": 31.273696899414062, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8663392066955566, + "num_tokens": 763922038.0, + "step": 20020 + }, + { + "epoch": 2.546876987660603, + "ewc_loss": 0.049614567309617996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4807282898109406e-05, + "grad_norm": 31.189374923706055, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8680862188339233, + "num_tokens": 763958089.0, + "step": 20021 + }, + { + "epoch": 2.5470041979391933, + "ewc_loss": 0.04947404935956001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4737024432397448e-05, + "grad_norm": 31.218626022338867, + "learning_rate": 1e-06, + "loss": 0.4278, + 
"mean_token_accuracy": 0.8720576763153076, + "num_tokens": 764000802.0, + "step": 20022 + }, + { + "epoch": 2.547131408217784, + "ewc_loss": 0.04952014237642288, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4760071028140374e-05, + "grad_norm": 31.126174926757812, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8762067556381226, + "num_tokens": 764036628.0, + "step": 20023 + }, + { + "epoch": 2.5472586184963744, + "ewc_loss": 0.04952654242515564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.476327063050121e-05, + "grad_norm": 31.212419509887695, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8742629289627075, + "num_tokens": 764073643.0, + "step": 20024 + }, + { + "epoch": 2.547385828774965, + "ewc_loss": 0.04949397221207619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4746985218371265e-05, + "grad_norm": 31.267051696777344, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8772839307785034, + "num_tokens": 764110473.0, + "step": 20025 + }, + { + "epoch": 2.5475130390535554, + "ewc_loss": 0.04948652535676956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4743261747062206e-05, + "grad_norm": 31.137067794799805, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8697549700737, + "num_tokens": 764142941.0, + "step": 20026 + }, + { + "epoch": 2.547640249332146, + "ewc_loss": 0.049491629004478455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.474581378919538e-05, + "grad_norm": 31.204090118408203, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8739153742790222, + "num_tokens": 764178752.0, + "step": 20027 + }, + { + "epoch": 2.5477674596107365, + "ewc_loss": 0.049630578607320786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4815290089463815e-05, + "grad_norm": 31.136825561523438, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8678324222564697, + "num_tokens": 764217440.0, + "step": 20028 + }, + { + "epoch": 
2.547894669889327, + "ewc_loss": 0.049515604972839355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4757802748354152e-05, + "grad_norm": 31.18758773803711, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8610019683837891, + "num_tokens": 764257524.0, + "step": 20029 + }, + { + "epoch": 2.5480218801679175, + "ewc_loss": 0.049596741795539856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4798371669021435e-05, + "grad_norm": 31.257339477539062, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8670441508293152, + "num_tokens": 764293645.0, + "step": 20030 + }, + { + "epoch": 2.548149090446508, + "ewc_loss": 0.04953945055603981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.476972440490499e-05, + "grad_norm": 31.158695220947266, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8778704404830933, + "num_tokens": 764331660.0, + "step": 20031 + }, + { + "epoch": 2.5482763007250986, + "ewc_loss": 0.049472376704216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4736187697271816e-05, + "grad_norm": 31.120006561279297, + "learning_rate": 1e-06, + "loss": 0.3629, + "mean_token_accuracy": 0.8889124393463135, + "num_tokens": 764376577.0, + "step": 20032 + }, + { + "epoch": 2.548403511003689, + "ewc_loss": 0.0495903305709362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4795164790702984e-05, + "grad_norm": 31.338537216186523, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8686718940734863, + "num_tokens": 764415492.0, + "step": 20033 + }, + { + "epoch": 2.5485307212822796, + "ewc_loss": 0.049515943974256516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.475797191436868e-05, + "grad_norm": 31.12735366821289, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8610386252403259, + "num_tokens": 764453948.0, + "step": 20034 + }, + { + "epoch": 2.54865793156087, + "ewc_loss": 0.049498796463012695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.474939901730977e-05, + "grad_norm": 31.20290184020996, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.875738799571991, + "num_tokens": 764493906.0, + "step": 20035 + }, + { + "epoch": 2.5487851418394607, + "ewc_loss": 0.04960591346025467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4802957341307774e-05, + "grad_norm": 31.295665740966797, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8648162484169006, + "num_tokens": 764532379.0, + "step": 20036 + }, + { + "epoch": 2.5489123521180512, + "ewc_loss": 0.04952574893832207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4762874090811238e-05, + "grad_norm": 31.2392520904541, + "learning_rate": 1e-06, + "loss": 0.4667, + "mean_token_accuracy": 0.8540464639663696, + "num_tokens": 764567726.0, + "step": 20037 + }, + { + "epoch": 2.5490395623966418, + "ewc_loss": 0.04945753887295723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4728769858484156e-05, + "grad_norm": 31.120370864868164, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8718568086624146, + "num_tokens": 764606229.0, + "step": 20038 + }, + { + "epoch": 2.5491667726752323, + "ewc_loss": 0.049542248249053955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4771125026745722e-05, + "grad_norm": 31.101852416992188, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.881102979183197, + "num_tokens": 764641346.0, + "step": 20039 + }, + { + "epoch": 2.549293982953823, + "ewc_loss": 0.04954168200492859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.477084126439877e-05, + "grad_norm": 31.148279190063477, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8717947006225586, + "num_tokens": 764687286.0, + "step": 20040 + }, + { + "epoch": 2.5494211932324133, + "ewc_loss": 0.049560125917196274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4780063540674746e-05, + "grad_norm": 31.027944564819336, + "learning_rate": 1e-06, + "loss": 0.4097, + 
"mean_token_accuracy": 0.8738695383071899, + "num_tokens": 764729235.0, + "step": 20041 + }, + { + "epoch": 2.549548403511004, + "ewc_loss": 0.0495094358921051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4754717742325738e-05, + "grad_norm": 31.179597854614258, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8635387420654297, + "num_tokens": 764771834.0, + "step": 20042 + }, + { + "epoch": 2.549675613789594, + "ewc_loss": 0.0496266707777977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4813334675855003e-05, + "grad_norm": 31.118101119995117, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8709319829940796, + "num_tokens": 764808307.0, + "step": 20043 + }, + { + "epoch": 2.549802824068185, + "ewc_loss": 0.04952334240078926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4761671738815494e-05, + "grad_norm": 31.056425094604492, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8641399145126343, + "num_tokens": 764848373.0, + "step": 20044 + }, + { + "epoch": 2.549930034346775, + "ewc_loss": 0.04955532029271126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4777660655672662e-05, + "grad_norm": 31.087812423706055, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8670522570610046, + "num_tokens": 764886468.0, + "step": 20045 + }, + { + "epoch": 2.550057244625366, + "ewc_loss": 0.049691472202539444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4845736334100366e-05, + "grad_norm": 31.120126724243164, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8729451894760132, + "num_tokens": 764928690.0, + "step": 20046 + }, + { + "epoch": 2.550184454903956, + "ewc_loss": 0.04960672929883003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480336479493417e-05, + "grad_norm": 31.177148818969727, + "learning_rate": 1e-06, + "loss": 0.5314, + "mean_token_accuracy": 0.8370032906532288, + "num_tokens": 764969148.0, + "step": 20047 + }, + { + "epoch": 
2.5503116651825466, + "ewc_loss": 0.04961303994059563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480652074154932e-05, + "grad_norm": 31.113536834716797, + "learning_rate": 1e-06, + "loss": 0.4421, + "mean_token_accuracy": 0.8650396466255188, + "num_tokens": 765010754.0, + "step": 20048 + }, + { + "epoch": 2.550438875461137, + "ewc_loss": 0.04946482554078102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.473241329425946e-05, + "grad_norm": 31.0780086517334, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8750626444816589, + "num_tokens": 765043562.0, + "step": 20049 + }, + { + "epoch": 2.5505660857397277, + "ewc_loss": 0.04960588738322258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4802942789392546e-05, + "grad_norm": 31.205469131469727, + "learning_rate": 1e-06, + "loss": 0.4815, + "mean_token_accuracy": 0.8518480658531189, + "num_tokens": 765085638.0, + "step": 20050 + }, + { + "epoch": 2.550693296018318, + "ewc_loss": 0.04958501085639, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4792505428195e-05, + "grad_norm": 31.161212921142578, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8775172233581543, + "num_tokens": 765115847.0, + "step": 20051 + }, + { + "epoch": 2.5508205062969087, + "ewc_loss": 0.049651239067316055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4825620130286552e-05, + "grad_norm": 31.27080726623535, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8694057464599609, + "num_tokens": 765146839.0, + "step": 20052 + }, + { + "epoch": 2.5509477165754992, + "ewc_loss": 0.04957834631204605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4789173039607704e-05, + "grad_norm": 31.176973342895508, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8717993497848511, + "num_tokens": 765185921.0, + "step": 20053 + }, + { + "epoch": 2.5510749268540898, + "ewc_loss": 0.0495162270963192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4758113795542158e-05, + "grad_norm": 31.22493553161621, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8711844682693481, + "num_tokens": 765224534.0, + "step": 20054 + }, + { + "epoch": 2.5512021371326803, + "ewc_loss": 0.04956653714179993, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4783268600003794e-05, + "grad_norm": 31.118385314941406, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8800446391105652, + "num_tokens": 765267160.0, + "step": 20055 + }, + { + "epoch": 2.551329347411271, + "ewc_loss": 0.04955029860138893, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4775148631306365e-05, + "grad_norm": 31.1866455078125, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8715566992759705, + "num_tokens": 765304770.0, + "step": 20056 + }, + { + "epoch": 2.5514565576898613, + "ewc_loss": 0.049641747027635574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48208743869327e-05, + "grad_norm": 31.09723663330078, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.868004322052002, + "num_tokens": 765341966.0, + "step": 20057 + }, + { + "epoch": 2.551583767968452, + "ewc_loss": 0.0496242381632328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4812119590933435e-05, + "grad_norm": 31.213279724121094, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8861820697784424, + "num_tokens": 765382653.0, + "step": 20058 + }, + { + "epoch": 2.5517109782470424, + "ewc_loss": 0.049578167498111725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.478908390912693e-05, + "grad_norm": 31.08045768737793, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8699771165847778, + "num_tokens": 765414940.0, + "step": 20059 + }, + { + "epoch": 2.551838188525633, + "ewc_loss": 0.04967427998781204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4837139790179208e-05, + "grad_norm": 31.161930084228516, + "learning_rate": 1e-06, + "loss": 0.4123, + 
"mean_token_accuracy": 0.8731040954589844, + "num_tokens": 765445549.0, + "step": 20060 + }, + { + "epoch": 2.5519653988042235, + "ewc_loss": 0.049622077494859695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481103911122773e-05, + "grad_norm": 31.107742309570312, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8786945343017578, + "num_tokens": 765484474.0, + "step": 20061 + }, + { + "epoch": 2.552092609082814, + "ewc_loss": 0.04961442947387695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4807215595501475e-05, + "grad_norm": 31.051429748535156, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8712732791900635, + "num_tokens": 765522651.0, + "step": 20062 + }, + { + "epoch": 2.5522198193614045, + "ewc_loss": 0.04980332776904106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4901662982301787e-05, + "grad_norm": 31.269285202026367, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8655166029930115, + "num_tokens": 765565983.0, + "step": 20063 + }, + { + "epoch": 2.552347029639995, + "ewc_loss": 0.04963940382003784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4819701138767414e-05, + "grad_norm": 31.139432907104492, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8719980716705322, + "num_tokens": 765604056.0, + "step": 20064 + }, + { + "epoch": 2.5524742399185856, + "ewc_loss": 0.049597907811403275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479895374563057e-05, + "grad_norm": 31.161388397216797, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8799100518226624, + "num_tokens": 765646062.0, + "step": 20065 + }, + { + "epoch": 2.5526014501971757, + "ewc_loss": 0.04960368201136589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4801840481813997e-05, + "grad_norm": 31.051082611083984, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8695254325866699, + "num_tokens": 765687284.0, + "step": 20066 + }, + { + "epoch": 
2.5527286604757666, + "ewc_loss": 0.04960652440786362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480326293152757e-05, + "grad_norm": 31.211307525634766, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8574670553207397, + "num_tokens": 765723743.0, + "step": 20067 + }, + { + "epoch": 2.5528558707543567, + "ewc_loss": 0.0497201606631279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4860080884536728e-05, + "grad_norm": 31.114662170410156, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8672422766685486, + "num_tokens": 765768472.0, + "step": 20068 + }, + { + "epoch": 2.5529830810329477, + "ewc_loss": 0.04962923750281334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4814618882373907e-05, + "grad_norm": 31.229230880737305, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8868192434310913, + "num_tokens": 765809040.0, + "step": 20069 + }, + { + "epoch": 2.5531102913115378, + "ewc_loss": 0.04960077628493309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4800388928269967e-05, + "grad_norm": 31.106468200683594, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8721789121627808, + "num_tokens": 765845968.0, + "step": 20070 + }, + { + "epoch": 2.5532375015901287, + "ewc_loss": 0.049549851566553116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.477492489560973e-05, + "grad_norm": 31.20654296875, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8769535422325134, + "num_tokens": 765881037.0, + "step": 20071 + }, + { + "epoch": 2.553364711868719, + "ewc_loss": 0.04961046949028969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4805234716041014e-05, + "grad_norm": 31.167978286743164, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8761656284332275, + "num_tokens": 765918945.0, + "step": 20072 + }, + { + "epoch": 2.5534919221473094, + "ewc_loss": 0.049581218510866165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4790610041236505e-05, + "grad_norm": 31.17139434814453, + "learning_rate": 1e-06, + "loss": 0.3712, + "mean_token_accuracy": 0.8837777376174927, + "num_tokens": 765956128.0, + "step": 20073 + }, + { + "epoch": 2.5536191324259, + "ewc_loss": 0.04956173524260521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4780867533991113e-05, + "grad_norm": 31.22289276123047, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8839176893234253, + "num_tokens": 765997109.0, + "step": 20074 + }, + { + "epoch": 2.5537463427044904, + "ewc_loss": 0.04955890402197838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4779452360235155e-05, + "grad_norm": 31.27039909362793, + "learning_rate": 1e-06, + "loss": 0.425, + "mean_token_accuracy": 0.8688697814941406, + "num_tokens": 766032557.0, + "step": 20075 + }, + { + "epoch": 2.553873552983081, + "ewc_loss": 0.04949872940778732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4749364456511103e-05, + "grad_norm": 31.19266700744629, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8673242330551147, + "num_tokens": 766075285.0, + "step": 20076 + }, + { + "epoch": 2.5540007632616715, + "ewc_loss": 0.04946710169315338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4733550162636675e-05, + "grad_norm": 31.16640281677246, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8698241710662842, + "num_tokens": 766110600.0, + "step": 20077 + }, + { + "epoch": 2.554127973540262, + "ewc_loss": 0.04955724626779556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4778622901067138e-05, + "grad_norm": 31.32134437561035, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8599598407745361, + "num_tokens": 766146877.0, + "step": 20078 + }, + { + "epoch": 2.5542551838188525, + "ewc_loss": 0.04948908090591431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47445404966129e-05, + "grad_norm": 31.232248306274414, + "learning_rate": 1e-06, + "loss": 0.4221, + 
"mean_token_accuracy": 0.8667296767234802, + "num_tokens": 766185164.0, + "step": 20079 + }, + { + "epoch": 2.554382394097443, + "ewc_loss": 0.049448393285274506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4724196919123642e-05, + "grad_norm": 31.187952041625977, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8901212811470032, + "num_tokens": 766221442.0, + "step": 20080 + }, + { + "epoch": 2.5545096043760336, + "ewc_loss": 0.04944920539855957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4724602553760633e-05, + "grad_norm": 31.348573684692383, + "learning_rate": 1e-06, + "loss": 0.4991, + "mean_token_accuracy": 0.8472907543182373, + "num_tokens": 766255258.0, + "step": 20081 + }, + { + "epoch": 2.554636814654624, + "ewc_loss": 0.0495125986635685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.475629844411742e-05, + "grad_norm": 31.209918975830078, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8689044713973999, + "num_tokens": 766292392.0, + "step": 20082 + }, + { + "epoch": 2.5547640249332146, + "ewc_loss": 0.049432359635829926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4716180632822216e-05, + "grad_norm": 31.287275314331055, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8621360063552856, + "num_tokens": 766335312.0, + "step": 20083 + }, + { + "epoch": 2.554891235211805, + "ewc_loss": 0.04945388436317444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4726941774133593e-05, + "grad_norm": 31.010883331298828, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.872177243232727, + "num_tokens": 766372189.0, + "step": 20084 + }, + { + "epoch": 2.5550184454903957, + "ewc_loss": 0.04955575242638588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4777877115411684e-05, + "grad_norm": 31.39902114868164, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8601938486099243, + "num_tokens": 766414021.0, + "step": 20085 + }, + { + "epoch": 
2.5551456557689862, + "ewc_loss": 0.04954102262854576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4770512027316727e-05, + "grad_norm": 31.114404678344727, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8843626379966736, + "num_tokens": 766452005.0, + "step": 20086 + }, + { + "epoch": 2.5552728660475768, + "ewc_loss": 0.04942403361201286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47120169660775e-05, + "grad_norm": 31.28740692138672, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.865198016166687, + "num_tokens": 766492150.0, + "step": 20087 + }, + { + "epoch": 2.5554000763261673, + "ewc_loss": 0.04955749958753586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4778750230325386e-05, + "grad_norm": 31.142620086669922, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8812186121940613, + "num_tokens": 766533889.0, + "step": 20088 + }, + { + "epoch": 2.555527286604758, + "ewc_loss": 0.04948117583990097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4740587832638994e-05, + "grad_norm": 31.13091278076172, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.871130108833313, + "num_tokens": 766565008.0, + "step": 20089 + }, + { + "epoch": 2.5556544968833483, + "ewc_loss": 0.04963129013776779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4815644792397507e-05, + "grad_norm": 31.224470138549805, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.871589720249176, + "num_tokens": 766605744.0, + "step": 20090 + }, + { + "epoch": 2.5557817071619384, + "ewc_loss": 0.049460526555776596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4730263248784468e-05, + "grad_norm": 31.091102600097656, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8795117139816284, + "num_tokens": 766645782.0, + "step": 20091 + }, + { + "epoch": 2.5559089174405294, + "ewc_loss": 0.049534231424331665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4767115974100307e-05, + "grad_norm": 31.096220016479492, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.8615827560424805, + "num_tokens": 766683135.0, + "step": 20092 + }, + { + "epoch": 2.5560361277191195, + "ewc_loss": 0.04950848966836929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4754244805080816e-05, + "grad_norm": 31.21127700805664, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8725361227989197, + "num_tokens": 766720691.0, + "step": 20093 + }, + { + "epoch": 2.5561633379977104, + "ewc_loss": 0.04960692301392555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4803461201372556e-05, + "grad_norm": 31.192201614379883, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.865673303604126, + "num_tokens": 766755426.0, + "step": 20094 + }, + { + "epoch": 2.5562905482763005, + "ewc_loss": 0.04949616268277168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4748082068981603e-05, + "grad_norm": 31.053239822387695, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8757266998291016, + "num_tokens": 766788315.0, + "step": 20095 + }, + { + "epoch": 2.5564177585548915, + "ewc_loss": 0.04961386322975159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4806931833154522e-05, + "grad_norm": 31.256305694580078, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8860018253326416, + "num_tokens": 766825664.0, + "step": 20096 + }, + { + "epoch": 2.5565449688334816, + "ewc_loss": 0.04956028237938881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47801417572191e-05, + "grad_norm": 31.172584533691406, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8828521966934204, + "num_tokens": 766868145.0, + "step": 20097 + }, + { + "epoch": 2.556672179112072, + "ewc_loss": 0.04960060119628906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4800299797789194e-05, + "grad_norm": 31.23419761657715, + "learning_rate": 1e-06, + "loss": 0.4646, + 
"mean_token_accuracy": 0.8595978021621704, + "num_tokens": 766907278.0, + "step": 20098 + }, + { + "epoch": 2.5567993893906626, + "ewc_loss": 0.04961775988340378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480887997080572e-05, + "grad_norm": 31.211612701416016, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8828935027122498, + "num_tokens": 766941662.0, + "step": 20099 + }, + { + "epoch": 2.556926599669253, + "ewc_loss": 0.04956469684839249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47823481913656e-05, + "grad_norm": 31.14984893798828, + "learning_rate": 1e-06, + "loss": 0.4913, + "mean_token_accuracy": 0.8529488444328308, + "num_tokens": 766977725.0, + "step": 20100 + }, + { + "epoch": 2.5570538099478437, + "ewc_loss": 0.04958701133728027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479350587236695e-05, + "grad_norm": 31.204011917114258, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8669527769088745, + "num_tokens": 767011011.0, + "step": 20101 + }, + { + "epoch": 2.5571810202264342, + "ewc_loss": 0.04976402223110199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4882010620785877e-05, + "grad_norm": 31.253101348876953, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.86855548620224, + "num_tokens": 767047078.0, + "step": 20102 + }, + { + "epoch": 2.5573082305050248, + "ewc_loss": 0.049645427614450455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4822713385219686e-05, + "grad_norm": 31.115989685058594, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8678072094917297, + "num_tokens": 767089208.0, + "step": 20103 + }, + { + "epoch": 2.5574354407836153, + "ewc_loss": 0.0496642142534256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4832106646499597e-05, + "grad_norm": 31.199140548706055, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.856499195098877, + "num_tokens": 767131225.0, + "step": 20104 + }, + { + "epoch": 
2.557562651062206, + "ewc_loss": 0.04966854676604271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.483427306287922e-05, + "grad_norm": 31.15434455871582, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8592367172241211, + "num_tokens": 767165222.0, + "step": 20105 + }, + { + "epoch": 2.5576898613407963, + "ewc_loss": 0.049597352743148804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4798675440251827e-05, + "grad_norm": 31.192665100097656, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8705225586891174, + "num_tokens": 767200870.0, + "step": 20106 + }, + { + "epoch": 2.557817071619387, + "ewc_loss": 0.04962998256087303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4814991775201634e-05, + "grad_norm": 31.127517700195312, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8701728582382202, + "num_tokens": 767236881.0, + "step": 20107 + }, + { + "epoch": 2.5579442818979774, + "ewc_loss": 0.049647849053144455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4823924832162447e-05, + "grad_norm": 31.140987396240234, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8768599629402161, + "num_tokens": 767269716.0, + "step": 20108 + }, + { + "epoch": 2.558071492176568, + "ewc_loss": 0.049647487699985504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4823744752211496e-05, + "grad_norm": 31.05744171142578, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8619442582130432, + "num_tokens": 767306840.0, + "step": 20109 + }, + { + "epoch": 2.5581987024551585, + "ewc_loss": 0.04963523894548416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4817620214889757e-05, + "grad_norm": 31.11410140991211, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8834331035614014, + "num_tokens": 767346966.0, + "step": 20110 + }, + { + "epoch": 2.558325912733749, + "ewc_loss": 0.04974912106990814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4874560040188953e-05, + "grad_norm": 31.182031631469727, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8677033185958862, + "num_tokens": 767387084.0, + "step": 20111 + }, + { + "epoch": 2.5584531230123395, + "ewc_loss": 0.04967352747917175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4836763259372674e-05, + "grad_norm": 31.21663475036621, + "learning_rate": 1e-06, + "loss": 0.4792, + "mean_token_accuracy": 0.8520683646202087, + "num_tokens": 767417857.0, + "step": 20112 + }, + { + "epoch": 2.55858033329093, + "ewc_loss": 0.04965846240520477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482923082425259e-05, + "grad_norm": 31.27483367919922, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8733912110328674, + "num_tokens": 767455771.0, + "step": 20113 + }, + { + "epoch": 2.5587075435695206, + "ewc_loss": 0.04965122789144516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482561467331834e-05, + "grad_norm": 31.186307907104492, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.857192873954773, + "num_tokens": 767493187.0, + "step": 20114 + }, + { + "epoch": 2.558834753848111, + "ewc_loss": 0.0496535487473011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4826775188557804e-05, + "grad_norm": 31.257043838500977, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8635104894638062, + "num_tokens": 767534526.0, + "step": 20115 + }, + { + "epoch": 2.558961964126701, + "ewc_loss": 0.04962073639035225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481036790413782e-05, + "grad_norm": 31.13111686706543, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8675572872161865, + "num_tokens": 767569297.0, + "step": 20116 + }, + { + "epoch": 2.559089174405292, + "ewc_loss": 0.049717627465724945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4858813048922457e-05, + "grad_norm": 31.176860809326172, + "learning_rate": 1e-06, + "loss": 0.4353, + 
"mean_token_accuracy": 0.8672582507133484, + "num_tokens": 767605860.0, + "step": 20117 + }, + { + "epoch": 2.5592163846838822, + "ewc_loss": 0.04967230558395386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4836152078933083e-05, + "grad_norm": 31.259008407592773, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8729299306869507, + "num_tokens": 767645252.0, + "step": 20118 + }, + { + "epoch": 2.559343594962473, + "ewc_loss": 0.049734100699424744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4867051251931116e-05, + "grad_norm": 31.187938690185547, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8706549406051636, + "num_tokens": 767681942.0, + "step": 20119 + }, + { + "epoch": 2.5594708052410633, + "ewc_loss": 0.049694351851940155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484717515471857e-05, + "grad_norm": 31.221874237060547, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8697342872619629, + "num_tokens": 767720631.0, + "step": 20120 + }, + { + "epoch": 2.559598015519654, + "ewc_loss": 0.04969238117337227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4846191081451252e-05, + "grad_norm": 31.21503257751465, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8749911785125732, + "num_tokens": 767757018.0, + "step": 20121 + }, + { + "epoch": 2.5597252257982444, + "ewc_loss": 0.049594372510910034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4797185687930323e-05, + "grad_norm": 31.172147750854492, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8746377825737, + "num_tokens": 767796046.0, + "step": 20122 + }, + { + "epoch": 2.559852436076835, + "ewc_loss": 0.0497102253139019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4855113224475645e-05, + "grad_norm": 31.275732040405273, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8809425830841064, + "num_tokens": 767835678.0, + "step": 20123 + }, + { + "epoch": 
2.5599796463554254, + "ewc_loss": 0.04958295449614525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4791477699181996e-05, + "grad_norm": 31.22509765625, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8857699036598206, + "num_tokens": 767870515.0, + "step": 20124 + }, + { + "epoch": 2.560106856634016, + "ewc_loss": 0.04956158995628357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.478079477441497e-05, + "grad_norm": 31.113855361938477, + "learning_rate": 1e-06, + "loss": 0.473, + "mean_token_accuracy": 0.8548140525817871, + "num_tokens": 767911573.0, + "step": 20125 + }, + { + "epoch": 2.5602340669126065, + "ewc_loss": 0.04959401860833168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479700924595818e-05, + "grad_norm": 31.262287139892578, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8695181012153625, + "num_tokens": 767950161.0, + "step": 20126 + }, + { + "epoch": 2.560361277191197, + "ewc_loss": 0.049673892557621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4836946977302432e-05, + "grad_norm": 31.138303756713867, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8685437440872192, + "num_tokens": 767989431.0, + "step": 20127 + }, + { + "epoch": 2.5604884874697875, + "ewc_loss": 0.049554966390132904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4777482394711114e-05, + "grad_norm": 31.205869674682617, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8816016316413879, + "num_tokens": 768023551.0, + "step": 20128 + }, + { + "epoch": 2.560615697748378, + "ewc_loss": 0.04958932101726532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47946609306382e-05, + "grad_norm": 31.190031051635742, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8798973560333252, + "num_tokens": 768055317.0, + "step": 20129 + }, + { + "epoch": 2.5607429080269686, + "ewc_loss": 0.049507562071084976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4753780962782912e-05, + "grad_norm": 31.113357543945312, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.86362224817276, + "num_tokens": 768089575.0, + "step": 20130 + }, + { + "epoch": 2.560870118305559, + "ewc_loss": 0.04953213781118393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4766068236203864e-05, + "grad_norm": 31.229103088378906, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8622983694076538, + "num_tokens": 768131795.0, + "step": 20131 + }, + { + "epoch": 2.5609973285841496, + "ewc_loss": 0.049616601318120956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4808301532175392e-05, + "grad_norm": 31.095386505126953, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8751973509788513, + "num_tokens": 768165714.0, + "step": 20132 + }, + { + "epoch": 2.56112453886274, + "ewc_loss": 0.04952134191989899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4760671294643544e-05, + "grad_norm": 31.128820419311523, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8792007565498352, + "num_tokens": 768203475.0, + "step": 20133 + }, + { + "epoch": 2.5612517491413307, + "ewc_loss": 0.0496879480779171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484397373336833e-05, + "grad_norm": 31.146392822265625, + "learning_rate": 1e-06, + "loss": 0.3804, + "mean_token_accuracy": 0.8866477608680725, + "num_tokens": 768244841.0, + "step": 20134 + }, + { + "epoch": 2.561378959419921, + "ewc_loss": 0.04962438344955444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4812192350509576e-05, + "grad_norm": 31.116243362426758, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.866756796836853, + "num_tokens": 768286502.0, + "step": 20135 + }, + { + "epoch": 2.5615061696985117, + "ewc_loss": 0.04968556389212608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4842782295309007e-05, + "grad_norm": 31.237537384033203, + "learning_rate": 1e-06, + "loss": 0.393, + 
"mean_token_accuracy": 0.8790192008018494, + "num_tokens": 768320628.0, + "step": 20136 + }, + { + "epoch": 2.5616333799771023, + "ewc_loss": 0.04962713643908501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4813567506498657e-05, + "grad_norm": 31.105161666870117, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8642079830169678, + "num_tokens": 768353630.0, + "step": 20137 + }, + { + "epoch": 2.561760590255693, + "ewc_loss": 0.049693647772073746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4846824089763686e-05, + "grad_norm": 31.25760269165039, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8710881471633911, + "num_tokens": 768388170.0, + "step": 20138 + }, + { + "epoch": 2.5618878005342833, + "ewc_loss": 0.049613792449235916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480689545336645e-05, + "grad_norm": 31.159643173217773, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8876417875289917, + "num_tokens": 768428238.0, + "step": 20139 + }, + { + "epoch": 2.562015010812874, + "ewc_loss": 0.0496746264398098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4837312594172545e-05, + "grad_norm": 31.13581085205078, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8850845098495483, + "num_tokens": 768460075.0, + "step": 20140 + }, + { + "epoch": 2.562142221091464, + "ewc_loss": 0.04965183138847351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482591662555933e-05, + "grad_norm": 31.217037200927734, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.888262152671814, + "num_tokens": 768493982.0, + "step": 20141 + }, + { + "epoch": 2.562269431370055, + "ewc_loss": 0.04968835040926933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4844175641192123e-05, + "grad_norm": 31.1297550201416, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8748447895050049, + "num_tokens": 768529121.0, + "step": 20142 + }, + { + "epoch": 
2.562396641648645, + "ewc_loss": 0.049609482288360596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4804741769912653e-05, + "grad_norm": 31.154293060302734, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8720574378967285, + "num_tokens": 768560730.0, + "step": 20143 + }, + { + "epoch": 2.562523851927236, + "ewc_loss": 0.049642425030469894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482121271896176e-05, + "grad_norm": 31.26910400390625, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8818724155426025, + "num_tokens": 768603015.0, + "step": 20144 + }, + { + "epoch": 2.562651062205826, + "ewc_loss": 0.049609649926424026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4804825443425216e-05, + "grad_norm": 31.03501319885254, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8704230785369873, + "num_tokens": 768639847.0, + "step": 20145 + }, + { + "epoch": 2.5627782724844166, + "ewc_loss": 0.049670904874801636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4835451768012717e-05, + "grad_norm": 31.295394897460938, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8684586882591248, + "num_tokens": 768681614.0, + "step": 20146 + }, + { + "epoch": 2.562905482763007, + "ewc_loss": 0.049774784594774246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488739301043097e-05, + "grad_norm": 31.32241439819336, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8748489618301392, + "num_tokens": 768722052.0, + "step": 20147 + }, + { + "epoch": 2.5630326930415976, + "ewc_loss": 0.049605440348386765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4802720872685313e-05, + "grad_norm": 31.21022605895996, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8642192482948303, + "num_tokens": 768754191.0, + "step": 20148 + }, + { + "epoch": 2.563159903320188, + "ewc_loss": 0.049692731350660324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4846365704433993e-05, + "grad_norm": 31.278362274169922, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8613455891609192, + "num_tokens": 768792981.0, + "step": 20149 + }, + { + "epoch": 2.5632871135987787, + "ewc_loss": 0.04968985542654991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4844926883815788e-05, + "grad_norm": 31.15782928466797, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8622040152549744, + "num_tokens": 768827040.0, + "step": 20150 + }, + { + "epoch": 2.5634143238773692, + "ewc_loss": 0.04963421821594238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481710907886736e-05, + "grad_norm": 31.237546920776367, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8689461946487427, + "num_tokens": 768865669.0, + "step": 20151 + }, + { + "epoch": 2.5635415341559598, + "ewc_loss": 0.04964743182063103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482371564838104e-05, + "grad_norm": 31.10869789123535, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8700777292251587, + "num_tokens": 768903306.0, + "step": 20152 + }, + { + "epoch": 2.5636687444345503, + "ewc_loss": 0.04971415176987648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485707591404207e-05, + "grad_norm": 31.32661247253418, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8748534917831421, + "num_tokens": 768946322.0, + "step": 20153 + }, + { + "epoch": 2.563795954713141, + "ewc_loss": 0.049737684428691864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486884295649361e-05, + "grad_norm": 31.120216369628906, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8888639807701111, + "num_tokens": 768987350.0, + "step": 20154 + }, + { + "epoch": 2.5639231649917313, + "ewc_loss": 0.04961952939629555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4809763999655843e-05, + "grad_norm": 31.139034271240234, + "learning_rate": 1e-06, + "loss": 0.404, + 
"mean_token_accuracy": 0.8789178133010864, + "num_tokens": 769030762.0, + "step": 20155 + }, + { + "epoch": 2.564050375270322, + "ewc_loss": 0.0497388057410717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48694032052299e-05, + "grad_norm": 31.291650772094727, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.883237361907959, + "num_tokens": 769069827.0, + "step": 20156 + }, + { + "epoch": 2.5641775855489124, + "ewc_loss": 0.04976547136902809, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4882736397557892e-05, + "grad_norm": 31.22775650024414, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8713778257369995, + "num_tokens": 769107742.0, + "step": 20157 + }, + { + "epoch": 2.564304795827503, + "ewc_loss": 0.049589674919843674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4794837372610345e-05, + "grad_norm": 31.136451721191406, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8695804476737976, + "num_tokens": 769153286.0, + "step": 20158 + }, + { + "epoch": 2.5644320061060935, + "ewc_loss": 0.04968990385532379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4844952349667437e-05, + "grad_norm": 31.13528823852539, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8732216954231262, + "num_tokens": 769192646.0, + "step": 20159 + }, + { + "epoch": 2.564559216384684, + "ewc_loss": 0.04967068135738373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48353408096591e-05, + "grad_norm": 31.23360824584961, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8490294218063354, + "num_tokens": 769230360.0, + "step": 20160 + }, + { + "epoch": 2.5646864266632745, + "ewc_loss": 0.049621112644672394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481055707903579e-05, + "grad_norm": 31.070520401000977, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8867543935775757, + "num_tokens": 769270754.0, + "step": 20161 + }, + { + "epoch": 
2.564813636941865, + "ewc_loss": 0.049614325165748596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4807162844808772e-05, + "grad_norm": 31.08418846130371, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8482382893562317, + "num_tokens": 769310109.0, + "step": 20162 + }, + { + "epoch": 2.5649408472204556, + "ewc_loss": 0.049798570573329926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4899285563151352e-05, + "grad_norm": 31.20018768310547, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8703284859657288, + "num_tokens": 769352101.0, + "step": 20163 + }, + { + "epoch": 2.5650680574990457, + "ewc_loss": 0.04967793449759483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.483896787452977e-05, + "grad_norm": 31.199941635131836, + "learning_rate": 1e-06, + "loss": 0.3786, + "mean_token_accuracy": 0.8854776620864868, + "num_tokens": 769386570.0, + "step": 20164 + }, + { + "epoch": 2.5651952677776366, + "ewc_loss": 0.04967115819454193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4835579097270966e-05, + "grad_norm": 31.136266708374023, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.883608341217041, + "num_tokens": 769427167.0, + "step": 20165 + }, + { + "epoch": 2.5653224780562267, + "ewc_loss": 0.0496380440890789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481902265571989e-05, + "grad_norm": 31.2567195892334, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8608884215354919, + "num_tokens": 769465456.0, + "step": 20166 + }, + { + "epoch": 2.5654496883348177, + "ewc_loss": 0.04963540658354759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481770388840232e-05, + "grad_norm": 31.07025718688965, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8835549354553223, + "num_tokens": 769497135.0, + "step": 20167 + }, + { + "epoch": 2.5655768986134078, + "ewc_loss": 0.04965434595942497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.482717354723718e-05, + "grad_norm": 31.228357315063477, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8659031391143799, + "num_tokens": 769533706.0, + "step": 20168 + }, + { + "epoch": 2.5657041088919987, + "ewc_loss": 0.049680326133966446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48401629505679e-05, + "grad_norm": 31.038330078125, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8657098412513733, + "num_tokens": 769578698.0, + "step": 20169 + }, + { + "epoch": 2.565831319170589, + "ewc_loss": 0.04968070238828659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484035212546587e-05, + "grad_norm": 31.323503494262695, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8831191062927246, + "num_tokens": 769619799.0, + "step": 20170 + }, + { + "epoch": 2.5659585294491793, + "ewc_loss": 0.04967986047267914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4839930119924247e-05, + "grad_norm": 31.092252731323242, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.870858371257782, + "num_tokens": 769655973.0, + "step": 20171 + }, + { + "epoch": 2.56608573972777, + "ewc_loss": 0.04961404576897621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48070227826247e-05, + "grad_norm": 31.271011352539062, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8675357103347778, + "num_tokens": 769692333.0, + "step": 20172 + }, + { + "epoch": 2.5662129500063604, + "ewc_loss": 0.04973149299621582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4865747036528774e-05, + "grad_norm": 31.199953079223633, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8760520219802856, + "num_tokens": 769731804.0, + "step": 20173 + }, + { + "epoch": 2.566340160284951, + "ewc_loss": 0.04964489862322807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482244963175617e-05, + "grad_norm": 31.164594650268555, + "learning_rate": 1e-06, + "loss": 0.4011, + 
"mean_token_accuracy": 0.8775709867477417, + "num_tokens": 769767409.0, + "step": 20174 + }, + { + "epoch": 2.5664673705635415, + "ewc_loss": 0.049648962914943695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4824481442919932e-05, + "grad_norm": 31.163522720336914, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8691152334213257, + "num_tokens": 769806468.0, + "step": 20175 + }, + { + "epoch": 2.566594580842132, + "ewc_loss": 0.049584295600652695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.47921470872825e-05, + "grad_norm": 31.225704193115234, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8793736696243286, + "num_tokens": 769842025.0, + "step": 20176 + }, + { + "epoch": 2.5667217911207225, + "ewc_loss": 0.04960806295275688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480403236404527e-05, + "grad_norm": 31.1638126373291, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8674917817115784, + "num_tokens": 769884125.0, + "step": 20177 + }, + { + "epoch": 2.566849001399313, + "ewc_loss": 0.049758896231651306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487944766471628e-05, + "grad_norm": 31.201324462890625, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8599891662597656, + "num_tokens": 769920523.0, + "step": 20178 + }, + { + "epoch": 2.5669762116779036, + "ewc_loss": 0.049611661583185196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4805831344565377e-05, + "grad_norm": 31.204933166503906, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8695623874664307, + "num_tokens": 769960141.0, + "step": 20179 + }, + { + "epoch": 2.567103421956494, + "ewc_loss": 0.04967885464429855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.483942807884887e-05, + "grad_norm": 31.201066970825195, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8726627826690674, + "num_tokens": 769995281.0, + "step": 20180 + }, + { + "epoch": 
2.5672306322350846, + "ewc_loss": 0.04961583390831947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4807917725411244e-05, + "grad_norm": 31.21380043029785, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8746048212051392, + "num_tokens": 770028230.0, + "step": 20181 + }, + { + "epoch": 2.567357842513675, + "ewc_loss": 0.04967717081308365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4838585886755027e-05, + "grad_norm": 31.23301124572754, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8779844045639038, + "num_tokens": 770068996.0, + "step": 20182 + }, + { + "epoch": 2.5674850527922657, + "ewc_loss": 0.04960240051150322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480120019754395e-05, + "grad_norm": 31.165464401245117, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8607777953147888, + "num_tokens": 770107018.0, + "step": 20183 + }, + { + "epoch": 2.567612263070856, + "ewc_loss": 0.049698539078235626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484926881152205e-05, + "grad_norm": 31.230085372924805, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8707813024520874, + "num_tokens": 770140251.0, + "step": 20184 + }, + { + "epoch": 2.5677394733494467, + "ewc_loss": 0.04963639751076698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4818198653520085e-05, + "grad_norm": 31.085020065307617, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8633938431739807, + "num_tokens": 770182241.0, + "step": 20185 + }, + { + "epoch": 2.5678666836280373, + "ewc_loss": 0.04971464350819588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485732147761155e-05, + "grad_norm": 31.279611587524414, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8849583268165588, + "num_tokens": 770223267.0, + "step": 20186 + }, + { + "epoch": 2.567993893906628, + "ewc_loss": 0.049745216965675354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.487260826455895e-05, + "grad_norm": 31.20728874206543, + "learning_rate": 1e-06, + "loss": 0.4557, + "mean_token_accuracy": 0.8601956367492676, + "num_tokens": 770261856.0, + "step": 20187 + }, + { + "epoch": 2.5681211041852183, + "ewc_loss": 0.04968762770295143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4843813662300818e-05, + "grad_norm": 31.28171157836914, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8610596060752869, + "num_tokens": 770299762.0, + "step": 20188 + }, + { + "epoch": 2.5682483144638084, + "ewc_loss": 0.04977177456021309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4885886887204833e-05, + "grad_norm": 31.387929916381836, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8656758069992065, + "num_tokens": 770333690.0, + "step": 20189 + }, + { + "epoch": 2.5683755247423994, + "ewc_loss": 0.04971680790185928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485840377630666e-05, + "grad_norm": 31.178916931152344, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.878296971321106, + "num_tokens": 770375755.0, + "step": 20190 + }, + { + "epoch": 2.5685027350209895, + "ewc_loss": 0.04957156628370285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4785782443359494e-05, + "grad_norm": 31.253719329833984, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8797428607940674, + "num_tokens": 770417805.0, + "step": 20191 + }, + { + "epoch": 2.5686299452995804, + "ewc_loss": 0.04973651096224785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4868255422916263e-05, + "grad_norm": 31.25995445251465, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8695718050003052, + "num_tokens": 770458183.0, + "step": 20192 + }, + { + "epoch": 2.5687571555781705, + "ewc_loss": 0.04974375292658806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4871877030818723e-05, + "grad_norm": 31.22325325012207, + "learning_rate": 1e-06, + "loss": 0.4024, + 
"mean_token_accuracy": 0.8747718930244446, + "num_tokens": 770488973.0, + "step": 20193 + }, + { + "epoch": 2.5688843658567615, + "ewc_loss": 0.04968247935175896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48412397922948e-05, + "grad_norm": 31.18552017211914, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8652704358100891, + "num_tokens": 770529030.0, + "step": 20194 + }, + { + "epoch": 2.5690115761353516, + "ewc_loss": 0.04960446059703827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4802229745546356e-05, + "grad_norm": 31.255599975585938, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8602789640426636, + "num_tokens": 770574697.0, + "step": 20195 + }, + { + "epoch": 2.569138786413942, + "ewc_loss": 0.04964553192257881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482276613591239e-05, + "grad_norm": 31.151912689208984, + "learning_rate": 1e-06, + "loss": 0.4004, + "mean_token_accuracy": 0.8772460222244263, + "num_tokens": 770615417.0, + "step": 20196 + }, + { + "epoch": 2.5692659966925326, + "ewc_loss": 0.04972963035106659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4864815713954158e-05, + "grad_norm": 31.242368698120117, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.858689546585083, + "num_tokens": 770653228.0, + "step": 20197 + }, + { + "epoch": 2.569393206971123, + "ewc_loss": 0.04972568526864052, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486284211045131e-05, + "grad_norm": 31.09145164489746, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8868926763534546, + "num_tokens": 770693876.0, + "step": 20198 + }, + { + "epoch": 2.5695204172497137, + "ewc_loss": 0.049748439341783524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487421988917049e-05, + "grad_norm": 31.309463500976562, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8707318902015686, + "num_tokens": 770728754.0, + "step": 20199 + }, + { + "epoch": 
2.5696476275283042, + "ewc_loss": 0.0497957281768322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4897864932427183e-05, + "grad_norm": 31.218921661376953, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8700096011161804, + "num_tokens": 770767961.0, + "step": 20200 + }, + { + "epoch": 2.5697748378068948, + "ewc_loss": 0.04963460564613342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481730371073354e-05, + "grad_norm": 31.166894912719727, + "learning_rate": 1e-06, + "loss": 0.5125, + "mean_token_accuracy": 0.8440906405448914, + "num_tokens": 770808880.0, + "step": 20201 + }, + { + "epoch": 2.5699020480854853, + "ewc_loss": 0.0497417077422142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487085475877393e-05, + "grad_norm": 31.216434478759766, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8655573129653931, + "num_tokens": 770846483.0, + "step": 20202 + }, + { + "epoch": 2.570029258364076, + "ewc_loss": 0.04972294718027115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4861474230419844e-05, + "grad_norm": 31.3063907623291, + "learning_rate": 1e-06, + "loss": 0.4975, + "mean_token_accuracy": 0.8512288331985474, + "num_tokens": 770884234.0, + "step": 20203 + }, + { + "epoch": 2.5701564686426663, + "ewc_loss": 0.04974636808037758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4873184884199873e-05, + "grad_norm": 31.071720123291016, + "learning_rate": 1e-06, + "loss": 0.468, + "mean_token_accuracy": 0.8569326400756836, + "num_tokens": 770918785.0, + "step": 20204 + }, + { + "epoch": 2.570283678921257, + "ewc_loss": 0.04972205311059952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4861026759026572e-05, + "grad_norm": 31.261919021606445, + "learning_rate": 1e-06, + "loss": 0.3529, + "mean_token_accuracy": 0.8921216726303101, + "num_tokens": 770956889.0, + "step": 20205 + }, + { + "epoch": 2.5704108891998474, + "ewc_loss": 0.049779366701841354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4889683118090034e-05, + "grad_norm": 31.06940460205078, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8577624559402466, + "num_tokens": 770994937.0, + "step": 20206 + }, + { + "epoch": 2.570538099478438, + "ewc_loss": 0.04971286281943321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4856431991793215e-05, + "grad_norm": 31.316694259643555, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8792039155960083, + "num_tokens": 771031451.0, + "step": 20207 + }, + { + "epoch": 2.5706653097570284, + "ewc_loss": 0.04986197128891945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4930985091486946e-05, + "grad_norm": 31.076881408691406, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8781688809394836, + "num_tokens": 771061575.0, + "step": 20208 + }, + { + "epoch": 2.570792520035619, + "ewc_loss": 0.04967270791530609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4836353986756876e-05, + "grad_norm": 31.186281204223633, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.870312511920929, + "num_tokens": 771103366.0, + "step": 20209 + }, + { + "epoch": 2.5709197303142095, + "ewc_loss": 0.04992402717471123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496201341273263e-05, + "grad_norm": 31.315767288208008, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8756380677223206, + "num_tokens": 771142178.0, + "step": 20210 + }, + { + "epoch": 2.5710469405928, + "ewc_loss": 0.04978041350841522, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4890206987038255e-05, + "grad_norm": 31.225563049316406, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8721717596054077, + "num_tokens": 771179296.0, + "step": 20211 + }, + { + "epoch": 2.5711741508713906, + "ewc_loss": 0.04980172961950302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490086444595363e-05, + "grad_norm": 31.25257682800293, + "learning_rate": 1e-06, + "loss": 0.4452, + 
"mean_token_accuracy": 0.8635192513465881, + "num_tokens": 771219084.0, + "step": 20212 + }, + { + "epoch": 2.571301361149981, + "ewc_loss": 0.04972789064049721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486394441802986e-05, + "grad_norm": 31.245033264160156, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8781247138977051, + "num_tokens": 771250609.0, + "step": 20213 + }, + { + "epoch": 2.571428571428571, + "ewc_loss": 0.04986700788140297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4933504391810857e-05, + "grad_norm": 31.31707763671875, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8693686723709106, + "num_tokens": 771286191.0, + "step": 20214 + }, + { + "epoch": 2.571555781707162, + "ewc_loss": 0.04971468821167946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4857343305484392e-05, + "grad_norm": 31.14563751220703, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8703055381774902, + "num_tokens": 771320898.0, + "step": 20215 + }, + { + "epoch": 2.5716829919857522, + "ewc_loss": 0.04972846060991287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4864229999366216e-05, + "grad_norm": 31.168256759643555, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8629239201545715, + "num_tokens": 771353977.0, + "step": 20216 + }, + { + "epoch": 2.571810202264343, + "ewc_loss": 0.049760837107896805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4880419005057774e-05, + "grad_norm": 31.24126625061035, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8774625062942505, + "num_tokens": 771391960.0, + "step": 20217 + }, + { + "epoch": 2.5719374125429333, + "ewc_loss": 0.049852415919303894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4926208425313234e-05, + "grad_norm": 31.29230308532715, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8631200790405273, + "num_tokens": 771427764.0, + "step": 20218 + }, + { + "epoch": 
2.572064622821524, + "ewc_loss": 0.04970545694231987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4852728529367596e-05, + "grad_norm": 31.13714599609375, + "learning_rate": 1e-06, + "loss": 0.4137, + "mean_token_accuracy": 0.8715068101882935, + "num_tokens": 771469706.0, + "step": 20219 + }, + { + "epoch": 2.5721918331001143, + "ewc_loss": 0.04976033791899681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488016980350949e-05, + "grad_norm": 31.27115821838379, + "learning_rate": 1e-06, + "loss": 0.3544, + "mean_token_accuracy": 0.8918644189834595, + "num_tokens": 771504296.0, + "step": 20220 + }, + { + "epoch": 2.572319043378705, + "ewc_loss": 0.04976634308695793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488317113602534e-05, + "grad_norm": 31.090471267700195, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8718515634536743, + "num_tokens": 771544496.0, + "step": 20221 + }, + { + "epoch": 2.5724462536572954, + "ewc_loss": 0.04973268136382103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486634002707433e-05, + "grad_norm": 31.319135665893555, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8800342679023743, + "num_tokens": 771584322.0, + "step": 20222 + }, + { + "epoch": 2.572573463935886, + "ewc_loss": 0.04979655519127846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4898277843021788e-05, + "grad_norm": 31.19874382019043, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.880350649356842, + "num_tokens": 771621424.0, + "step": 20223 + }, + { + "epoch": 2.5727006742144765, + "ewc_loss": 0.049782656133174896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489132748451084e-05, + "grad_norm": 31.237714767456055, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8726149797439575, + "num_tokens": 771658173.0, + "step": 20224 + }, + { + "epoch": 2.572827884493067, + "ewc_loss": 0.049717701971530914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.485885124769993e-05, + "grad_norm": 31.15815544128418, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8637363314628601, + "num_tokens": 771698182.0, + "step": 20225 + }, + { + "epoch": 2.5729550947716575, + "ewc_loss": 0.0497036837041378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485184268152807e-05, + "grad_norm": 31.080528259277344, + "learning_rate": 1e-06, + "loss": 0.4555, + "mean_token_accuracy": 0.8563721179962158, + "num_tokens": 771731559.0, + "step": 20226 + }, + { + "epoch": 2.573082305050248, + "ewc_loss": 0.049766190350055695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488309473847039e-05, + "grad_norm": 31.17743492126465, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8674306869506836, + "num_tokens": 771774362.0, + "step": 20227 + }, + { + "epoch": 2.5732095153288386, + "ewc_loss": 0.04983741417527199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.491870691301301e-05, + "grad_norm": 31.17328453063965, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8620864748954773, + "num_tokens": 771810540.0, + "step": 20228 + }, + { + "epoch": 2.573336725607429, + "ewc_loss": 0.049780700355768204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4890350687201135e-05, + "grad_norm": 31.233793258666992, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8752621412277222, + "num_tokens": 771846221.0, + "step": 20229 + }, + { + "epoch": 2.5734639358860196, + "ewc_loss": 0.049745600670576096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4872801077435724e-05, + "grad_norm": 31.22992515563965, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8806348443031311, + "num_tokens": 771882268.0, + "step": 20230 + }, + { + "epoch": 2.57359114616461, + "ewc_loss": 0.04972924292087555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486462108208798e-05, + "grad_norm": 31.312768936157227, + "learning_rate": 1e-06, + "loss": 0.427, + 
"mean_token_accuracy": 0.8712780475616455, + "num_tokens": 771913176.0, + "step": 20231 + }, + { + "epoch": 2.5737183564432007, + "ewc_loss": 0.049748312681913376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4874156224541366e-05, + "grad_norm": 31.190359115600586, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8658180236816406, + "num_tokens": 771948163.0, + "step": 20232 + }, + { + "epoch": 2.573845566721791, + "ewc_loss": 0.04966358840465546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4831793780322187e-05, + "grad_norm": 31.19596290588379, + "learning_rate": 1e-06, + "loss": 0.378, + "mean_token_accuracy": 0.8840618133544922, + "num_tokens": 771988296.0, + "step": 20233 + }, + { + "epoch": 2.5739727770003817, + "ewc_loss": 0.04975127801299095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4875638700905256e-05, + "grad_norm": 31.31993865966797, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8785886764526367, + "num_tokens": 772024330.0, + "step": 20234 + }, + { + "epoch": 2.5740999872789723, + "ewc_loss": 0.049643028527498245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482151467120275e-05, + "grad_norm": 31.23211669921875, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8592009544372559, + "num_tokens": 772062291.0, + "step": 20235 + }, + { + "epoch": 2.574227197557563, + "ewc_loss": 0.04969324171543121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484662036295049e-05, + "grad_norm": 31.152469635009766, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8730124235153198, + "num_tokens": 772110239.0, + "step": 20236 + }, + { + "epoch": 2.5743544078361533, + "ewc_loss": 0.049662474542856216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48312371695647e-05, + "grad_norm": 31.30565643310547, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8671801686286926, + "num_tokens": 772153201.0, + "step": 20237 + }, + { + "epoch": 
2.574481618114744, + "ewc_loss": 0.049695711582899094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4847855456755497e-05, + "grad_norm": 31.257482528686523, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.873173713684082, + "num_tokens": 772186816.0, + "step": 20238 + }, + { + "epoch": 2.574608828393334, + "ewc_loss": 0.04958474636077881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479237264196854e-05, + "grad_norm": 31.28314208984375, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8799023628234863, + "num_tokens": 772224238.0, + "step": 20239 + }, + { + "epoch": 2.574736038671925, + "ewc_loss": 0.04964663088321686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482331547071226e-05, + "grad_norm": 31.169635772705078, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8798227906227112, + "num_tokens": 772263505.0, + "step": 20240 + }, + { + "epoch": 2.574863248950515, + "ewc_loss": 0.04962316155433655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4811581170069985e-05, + "grad_norm": 31.301565170288086, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8790154457092285, + "num_tokens": 772303205.0, + "step": 20241 + }, + { + "epoch": 2.574990459229106, + "ewc_loss": 0.04969945177435875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4849725377862342e-05, + "grad_norm": 31.140522003173828, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8764231204986572, + "num_tokens": 772344495.0, + "step": 20242 + }, + { + "epoch": 2.575117669507696, + "ewc_loss": 0.04961473494768143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4807368390611373e-05, + "grad_norm": 31.341548919677734, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8654592037200928, + "num_tokens": 772380296.0, + "step": 20243 + }, + { + "epoch": 2.5752448797862866, + "ewc_loss": 0.04968178644776344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4840892365318723e-05, + "grad_norm": 31.10676383972168, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8853340744972229, + "num_tokens": 772423933.0, + "step": 20244 + }, + { + "epoch": 2.575372090064877, + "ewc_loss": 0.04964839294552803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4824195861583576e-05, + "grad_norm": 31.35822105407715, + "learning_rate": 1e-06, + "loss": 0.467, + "mean_token_accuracy": 0.8574775457382202, + "num_tokens": 772464514.0, + "step": 20245 + }, + { + "epoch": 2.5754993003434676, + "ewc_loss": 0.049737993627786636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486899757059291e-05, + "grad_norm": 31.18781089782715, + "learning_rate": 1e-06, + "loss": 0.353, + "mean_token_accuracy": 0.8906108140945435, + "num_tokens": 772498501.0, + "step": 20246 + }, + { + "epoch": 2.575626510622058, + "ewc_loss": 0.049573685973882675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.478684291418176e-05, + "grad_norm": 31.23727798461914, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.867392897605896, + "num_tokens": 772538072.0, + "step": 20247 + }, + { + "epoch": 2.5757537209006487, + "ewc_loss": 0.04974917694926262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487458914401941e-05, + "grad_norm": 31.207691192626953, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8723903298377991, + "num_tokens": 772577434.0, + "step": 20248 + }, + { + "epoch": 2.575880931179239, + "ewc_loss": 0.04966752976179123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.483376556483563e-05, + "grad_norm": 31.201074600219727, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8860849738121033, + "num_tokens": 772612917.0, + "step": 20249 + }, + { + "epoch": 2.5760081414578297, + "ewc_loss": 0.049697887152433395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4848943212418817e-05, + "grad_norm": 31.259449005126953, + "learning_rate": 1e-06, + "loss": 0.387, + 
"mean_token_accuracy": 0.8831096291542053, + "num_tokens": 772652939.0, + "step": 20250 + }, + { + "epoch": 2.5761353517364203, + "ewc_loss": 0.04970533028244972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4852664864738472e-05, + "grad_norm": 31.24602508544922, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8785172700881958, + "num_tokens": 772688018.0, + "step": 20251 + }, + { + "epoch": 2.576262562015011, + "ewc_loss": 0.049709491431713104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4854745788616128e-05, + "grad_norm": 31.20340919494629, + "learning_rate": 1e-06, + "loss": 0.4554, + "mean_token_accuracy": 0.861270546913147, + "num_tokens": 772728071.0, + "step": 20252 + }, + { + "epoch": 2.5763897722936013, + "ewc_loss": 0.04973817616701126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4869088520063087e-05, + "grad_norm": 31.208715438842773, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8732098340988159, + "num_tokens": 772762730.0, + "step": 20253 + }, + { + "epoch": 2.576516982572192, + "ewc_loss": 0.04968968406319618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484484139131382e-05, + "grad_norm": 31.23515510559082, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8739922046661377, + "num_tokens": 772799125.0, + "step": 20254 + }, + { + "epoch": 2.5766441928507824, + "ewc_loss": 0.049770537763834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4885268430807628e-05, + "grad_norm": 31.230884552001953, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8832830786705017, + "num_tokens": 772838657.0, + "step": 20255 + }, + { + "epoch": 2.576771403129373, + "ewc_loss": 0.04967224597930908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4836122975102626e-05, + "grad_norm": 31.27984619140625, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8787456750869751, + "num_tokens": 772876470.0, + "step": 20256 + }, + { + "epoch": 
2.5768986134079634, + "ewc_loss": 0.049654271453619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4827135348459706e-05, + "grad_norm": 31.306089401245117, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8653002977371216, + "num_tokens": 772910369.0, + "step": 20257 + }, + { + "epoch": 2.577025823686554, + "ewc_loss": 0.049689825624227524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484491233190056e-05, + "grad_norm": 31.30329132080078, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8769863247871399, + "num_tokens": 772948129.0, + "step": 20258 + }, + { + "epoch": 2.5771530339651445, + "ewc_loss": 0.04958547651767731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4792738258838654e-05, + "grad_norm": 31.28143882751465, + "learning_rate": 1e-06, + "loss": 0.3388, + "mean_token_accuracy": 0.8978335857391357, + "num_tokens": 772985359.0, + "step": 20259 + }, + { + "epoch": 2.577280244243735, + "ewc_loss": 0.0496450811624527, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482254058122635e-05, + "grad_norm": 31.21297264099121, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8732863664627075, + "num_tokens": 773019402.0, + "step": 20260 + }, + { + "epoch": 2.5774074545223256, + "ewc_loss": 0.04962804168462753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481402043486014e-05, + "grad_norm": 31.242298126220703, + "learning_rate": 1e-06, + "loss": 0.4246, + "mean_token_accuracy": 0.8719282150268555, + "num_tokens": 773059176.0, + "step": 20261 + }, + { + "epoch": 2.5775346648009156, + "ewc_loss": 0.0497167594730854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4858380129444413e-05, + "grad_norm": 31.214818954467773, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8735735416412354, + "num_tokens": 773094274.0, + "step": 20262 + }, + { + "epoch": 2.5776618750795066, + "ewc_loss": 0.04963136464357376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.481568299117498e-05, + "grad_norm": 31.18159294128418, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8710763454437256, + "num_tokens": 773135679.0, + "step": 20263 + }, + { + "epoch": 2.5777890853580967, + "ewc_loss": 0.04970215633511543, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485107870597858e-05, + "grad_norm": 31.26593589782715, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.868489146232605, + "num_tokens": 773174270.0, + "step": 20264 + }, + { + "epoch": 2.5779162956366877, + "ewc_loss": 0.04966191202402115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4830955226207152e-05, + "grad_norm": 31.159116744995117, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8734615445137024, + "num_tokens": 773213582.0, + "step": 20265 + }, + { + "epoch": 2.5780435059152778, + "ewc_loss": 0.0496828556060791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4841427148203366e-05, + "grad_norm": 31.239322662353516, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.866987943649292, + "num_tokens": 773244980.0, + "step": 20266 + }, + { + "epoch": 2.5781707161938687, + "ewc_loss": 0.049722976982593536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4861488782335073e-05, + "grad_norm": 31.300073623657227, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8859276175498962, + "num_tokens": 773287395.0, + "step": 20267 + }, + { + "epoch": 2.578297926472459, + "ewc_loss": 0.049691978842020035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484598917362746e-05, + "grad_norm": 31.252233505249023, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8680727481842041, + "num_tokens": 773324593.0, + "step": 20268 + }, + { + "epoch": 2.5784251367510493, + "ewc_loss": 0.04958323761820793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4791619580355473e-05, + "grad_norm": 31.151187896728516, + "learning_rate": 1e-06, + "loss": 0.4302, + 
"mean_token_accuracy": 0.8663889169692993, + "num_tokens": 773361802.0, + "step": 20269 + }, + { + "epoch": 2.57855234702964, + "ewc_loss": 0.04957412928342819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4787064830888994e-05, + "grad_norm": 31.224061965942383, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8765054941177368, + "num_tokens": 773401348.0, + "step": 20270 + }, + { + "epoch": 2.5786795573082304, + "ewc_loss": 0.0497143529355526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4857175958459266e-05, + "grad_norm": 31.278018951416016, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8884729743003845, + "num_tokens": 773439040.0, + "step": 20271 + }, + { + "epoch": 2.578806767586821, + "ewc_loss": 0.04960917681455612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4804588974802755e-05, + "grad_norm": 31.165903091430664, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8839380741119385, + "num_tokens": 773471382.0, + "step": 20272 + }, + { + "epoch": 2.5789339778654115, + "ewc_loss": 0.04969996213912964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484998185536824e-05, + "grad_norm": 31.192163467407227, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.862054705619812, + "num_tokens": 773508924.0, + "step": 20273 + }, + { + "epoch": 2.579061188144002, + "ewc_loss": 0.04967857152223587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4839286197675392e-05, + "grad_norm": 31.20623207092285, + "learning_rate": 1e-06, + "loss": 0.3666, + "mean_token_accuracy": 0.8865073919296265, + "num_tokens": 773550219.0, + "step": 20274 + }, + { + "epoch": 2.5791883984225925, + "ewc_loss": 0.0497172512114048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485862569301389e-05, + "grad_norm": 31.215152740478516, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8676605820655823, + "num_tokens": 773589363.0, + "step": 20275 + }, + { + "epoch": 
2.579315608701183, + "ewc_loss": 0.04965938627719879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482969284756109e-05, + "grad_norm": 31.313419342041016, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8714802861213684, + "num_tokens": 773630289.0, + "step": 20276 + }, + { + "epoch": 2.5794428189797736, + "ewc_loss": 0.04969727620482445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484863762219902e-05, + "grad_norm": 31.218982696533203, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.873301088809967, + "num_tokens": 773668660.0, + "step": 20277 + }, + { + "epoch": 2.579570029258364, + "ewc_loss": 0.04966944083571434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4834720534272492e-05, + "grad_norm": 31.297632217407227, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8689454197883606, + "num_tokens": 773704318.0, + "step": 20278 + }, + { + "epoch": 2.5796972395369546, + "ewc_loss": 0.04962383210659027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4811915864120238e-05, + "grad_norm": 31.153106689453125, + "learning_rate": 1e-06, + "loss": 0.4682, + "mean_token_accuracy": 0.8581418991088867, + "num_tokens": 773745821.0, + "step": 20279 + }, + { + "epoch": 2.579824449815545, + "ewc_loss": 0.04965803399682045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4829016183502972e-05, + "grad_norm": 31.29388999938965, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8813769817352295, + "num_tokens": 773782880.0, + "step": 20280 + }, + { + "epoch": 2.5799516600941357, + "ewc_loss": 0.04971301928162575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4856510208337568e-05, + "grad_norm": 31.281185150146484, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8584328889846802, + "num_tokens": 773824880.0, + "step": 20281 + }, + { + "epoch": 2.580078870372726, + "ewc_loss": 0.049624375998973846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.481218871253077e-05, + "grad_norm": 31.38495445251465, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8750165104866028, + "num_tokens": 773863210.0, + "step": 20282 + }, + { + "epoch": 2.5802060806513167, + "ewc_loss": 0.049714233726263046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4857117750798352e-05, + "grad_norm": 31.327239990234375, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8794111013412476, + "num_tokens": 773903432.0, + "step": 20283 + }, + { + "epoch": 2.5803332909299073, + "ewc_loss": 0.049616970121860504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480848525010515e-05, + "grad_norm": 31.360971450805664, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8797056674957275, + "num_tokens": 773942424.0, + "step": 20284 + }, + { + "epoch": 2.580460501208498, + "ewc_loss": 0.04959690943360329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4798455342533998e-05, + "grad_norm": 31.12058448791504, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8708692789077759, + "num_tokens": 773976598.0, + "step": 20285 + }, + { + "epoch": 2.5805877114870883, + "ewc_loss": 0.049558304250240326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.477915222698357e-05, + "grad_norm": 31.280031204223633, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8667560815811157, + "num_tokens": 774016696.0, + "step": 20286 + }, + { + "epoch": 2.5807149217656784, + "ewc_loss": 0.049692410975694656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.484620563336648e-05, + "grad_norm": 31.155311584472656, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8657540082931519, + "num_tokens": 774059046.0, + "step": 20287 + }, + { + "epoch": 2.5808421320442694, + "ewc_loss": 0.049622438848018646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481121919117868e-05, + "grad_norm": 31.357751846313477, + "learning_rate": 1e-06, + "loss": 0.4231, + 
"mean_token_accuracy": 0.8675639629364014, + "num_tokens": 774095322.0, + "step": 20288 + }, + { + "epoch": 2.5809693423228595, + "ewc_loss": 0.04967784509062767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4838922399794683e-05, + "grad_norm": 31.173046112060547, + "learning_rate": 1e-06, + "loss": 0.4553, + "mean_token_accuracy": 0.8602564334869385, + "num_tokens": 774134506.0, + "step": 20289 + }, + { + "epoch": 2.5810965526014504, + "ewc_loss": 0.04959191381931305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4795956051093526e-05, + "grad_norm": 31.30028533935547, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8869940042495728, + "num_tokens": 774176997.0, + "step": 20290 + }, + { + "epoch": 2.5812237628800405, + "ewc_loss": 0.04965556040406227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4827781089697964e-05, + "grad_norm": 31.273530960083008, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.86683189868927, + "num_tokens": 774213900.0, + "step": 20291 + }, + { + "epoch": 2.5813509731586315, + "ewc_loss": 0.04961267486214638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4806337023619562e-05, + "grad_norm": 31.335296630859375, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8643333315849304, + "num_tokens": 774259687.0, + "step": 20292 + }, + { + "epoch": 2.5814781834372216, + "ewc_loss": 0.049741845577955246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487092206138186e-05, + "grad_norm": 31.33095932006836, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8760491013526917, + "num_tokens": 774297200.0, + "step": 20293 + }, + { + "epoch": 2.581605393715812, + "ewc_loss": 0.04954889044165611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4774444682407193e-05, + "grad_norm": 31.335725784301758, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8846555948257446, + "num_tokens": 774336642.0, + "step": 20294 + }, + { + "epoch": 
2.5817326039944026, + "ewc_loss": 0.04965867102146149, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4829336325637996e-05, + "grad_norm": 31.238994598388672, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8567655682563782, + "num_tokens": 774376762.0, + "step": 20295 + }, + { + "epoch": 2.581859814272993, + "ewc_loss": 0.0496518537402153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482592753949575e-05, + "grad_norm": 31.35813331604004, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8804470300674438, + "num_tokens": 774421360.0, + "step": 20296 + }, + { + "epoch": 2.5819870245515837, + "ewc_loss": 0.04971214011311531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485607001290191e-05, + "grad_norm": 31.306074142456055, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8799401521682739, + "num_tokens": 774460850.0, + "step": 20297 + }, + { + "epoch": 2.582114234830174, + "ewc_loss": 0.04964527115225792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4822635168675333e-05, + "grad_norm": 31.24596405029297, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8692038059234619, + "num_tokens": 774497542.0, + "step": 20298 + }, + { + "epoch": 2.5822414451087647, + "ewc_loss": 0.04957977309823036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4789886083453894e-05, + "grad_norm": 31.246583938598633, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8631837964057922, + "num_tokens": 774538164.0, + "step": 20299 + }, + { + "epoch": 2.5823686553873553, + "ewc_loss": 0.0496465340256691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4823266357998364e-05, + "grad_norm": 31.2855167388916, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8669442534446716, + "num_tokens": 774576118.0, + "step": 20300 + }, + { + "epoch": 2.582495865665946, + "ewc_loss": 0.04963642358779907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.481821138644591e-05, + "grad_norm": 31.282527923583984, + "learning_rate": 1e-06, + "loss": 0.3563, + "mean_token_accuracy": 0.8933049440383911, + "num_tokens": 774617506.0, + "step": 20301 + }, + { + "epoch": 2.5826230759445363, + "ewc_loss": 0.049613576382398605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4806788132991642e-05, + "grad_norm": 31.254222869873047, + "learning_rate": 1e-06, + "loss": 0.4986, + "mean_token_accuracy": 0.8488031029701233, + "num_tokens": 774653720.0, + "step": 20302 + }, + { + "epoch": 2.582750286223127, + "ewc_loss": 0.04961897432804108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4809487513266504e-05, + "grad_norm": 31.260684967041016, + "learning_rate": 1e-06, + "loss": 0.3506, + "mean_token_accuracy": 0.8927288055419922, + "num_tokens": 774691056.0, + "step": 20303 + }, + { + "epoch": 2.5828774965017174, + "ewc_loss": 0.049632735550403595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4816366931190714e-05, + "grad_norm": 31.286558151245117, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8653407096862793, + "num_tokens": 774726514.0, + "step": 20304 + }, + { + "epoch": 2.583004706780308, + "ewc_loss": 0.04960961267352104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.480480725353118e-05, + "grad_norm": 31.456241607666016, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8623391389846802, + "num_tokens": 774760831.0, + "step": 20305 + }, + { + "epoch": 2.5831319170588984, + "ewc_loss": 0.04964105039834976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.482052514096722e-05, + "grad_norm": 31.2753849029541, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8808179497718811, + "num_tokens": 774799399.0, + "step": 20306 + }, + { + "epoch": 2.583259127337489, + "ewc_loss": 0.0495234876871109, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4761744498391636e-05, + "grad_norm": 31.20492172241211, + "learning_rate": 1e-06, + "loss": 0.435, + 
"mean_token_accuracy": 0.8642998933792114, + "num_tokens": 774839910.0, + "step": 20307 + }, + { + "epoch": 2.5833863376160795, + "ewc_loss": 0.049636758863925934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4818378733471036e-05, + "grad_norm": 31.445846557617188, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8735651969909668, + "num_tokens": 774875694.0, + "step": 20308 + }, + { + "epoch": 2.58351354789467, + "ewc_loss": 0.049552734941244125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.477636735420674e-05, + "grad_norm": 31.119583129882812, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8742936849594116, + "num_tokens": 774908020.0, + "step": 20309 + }, + { + "epoch": 2.5836407581732606, + "ewc_loss": 0.04956503212451935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4782515538390726e-05, + "grad_norm": 31.417713165283203, + "learning_rate": 1e-06, + "loss": 0.4454, + "mean_token_accuracy": 0.8619378805160522, + "num_tokens": 774946716.0, + "step": 20310 + }, + { + "epoch": 2.583767968451851, + "ewc_loss": 0.0497346967458725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4867347747203894e-05, + "grad_norm": 31.36102867126465, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8666622638702393, + "num_tokens": 774981732.0, + "step": 20311 + }, + { + "epoch": 2.583895178730441, + "ewc_loss": 0.049575075507164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4787537768133916e-05, + "grad_norm": 31.176721572875977, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8774415254592896, + "num_tokens": 775015280.0, + "step": 20312 + }, + { + "epoch": 2.584022389009032, + "ewc_loss": 0.04977532848715782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4887664039852098e-05, + "grad_norm": 31.37879180908203, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8793011903762817, + "num_tokens": 775054518.0, + "step": 20313 + }, + { + "epoch": 
2.5841495992876222, + "ewc_loss": 0.04964370280504227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4821851184242405e-05, + "grad_norm": 31.173442840576172, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8788896203041077, + "num_tokens": 775091548.0, + "step": 20314 + }, + { + "epoch": 2.584276809566213, + "ewc_loss": 0.04967949911952019, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4839750039973296e-05, + "grad_norm": 31.360017776489258, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8593546748161316, + "num_tokens": 775130916.0, + "step": 20315 + }, + { + "epoch": 2.5844040198448033, + "ewc_loss": 0.04966982826590538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4834913347149268e-05, + "grad_norm": 31.200355529785156, + "learning_rate": 1e-06, + "loss": 0.383, + "mean_token_accuracy": 0.8843587040901184, + "num_tokens": 775165875.0, + "step": 20316 + }, + { + "epoch": 2.584531230123394, + "ewc_loss": 0.04969433695077896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4847167878760956e-05, + "grad_norm": 31.420042037963867, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8592472076416016, + "num_tokens": 775211463.0, + "step": 20317 + }, + { + "epoch": 2.5846584404019843, + "ewc_loss": 0.04975929111242294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4879645934561267e-05, + "grad_norm": 31.29403305053711, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8813044428825378, + "num_tokens": 775248338.0, + "step": 20318 + }, + { + "epoch": 2.584785650680575, + "ewc_loss": 0.04954963177442551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4774815756245516e-05, + "grad_norm": 31.23807144165039, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8603881597518921, + "num_tokens": 775285707.0, + "step": 20319 + }, + { + "epoch": 2.5849128609591654, + "ewc_loss": 0.04971960559487343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4859802579157986e-05, + "grad_norm": 31.512252807617188, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8806499242782593, + "num_tokens": 775322682.0, + "step": 20320 + }, + { + "epoch": 2.585040071237756, + "ewc_loss": 0.04959666728973389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479833347024396e-05, + "grad_norm": 31.192745208740234, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8822618126869202, + "num_tokens": 775358274.0, + "step": 20321 + }, + { + "epoch": 2.5851672815163464, + "ewc_loss": 0.049490440636873245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4745220798649825e-05, + "grad_norm": 31.230506896972656, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8663890361785889, + "num_tokens": 775394472.0, + "step": 20322 + }, + { + "epoch": 2.585294491794937, + "ewc_loss": 0.0497228279709816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4861414203769527e-05, + "grad_norm": 31.31581687927246, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8822991847991943, + "num_tokens": 775433647.0, + "step": 20323 + }, + { + "epoch": 2.5854217020735275, + "ewc_loss": 0.04973049461841583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48652468144428e-05, + "grad_norm": 31.2238712310791, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.862881600856781, + "num_tokens": 775469159.0, + "step": 20324 + }, + { + "epoch": 2.585548912352118, + "ewc_loss": 0.04963872581720352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4819362806738354e-05, + "grad_norm": 31.26504898071289, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8802564740180969, + "num_tokens": 775507329.0, + "step": 20325 + }, + { + "epoch": 2.5856761226307086, + "ewc_loss": 0.049815833568573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4907916667871177e-05, + "grad_norm": 31.353042602539062, + "learning_rate": 1e-06, + "loss": 0.4261, + 
"mean_token_accuracy": 0.8703844547271729, + "num_tokens": 775544638.0, + "step": 20326 + }, + { + "epoch": 2.585803332909299, + "ewc_loss": 0.04965134710073471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4825672880979255e-05, + "grad_norm": 31.175594329833984, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.875251054763794, + "num_tokens": 775584918.0, + "step": 20327 + }, + { + "epoch": 2.5859305431878896, + "ewc_loss": 0.04970329627394676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485164804966189e-05, + "grad_norm": 31.28672981262207, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8676119446754456, + "num_tokens": 775624214.0, + "step": 20328 + }, + { + "epoch": 2.58605775346648, + "ewc_loss": 0.049754176288843155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4877088435459882e-05, + "grad_norm": 31.13842010498047, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8600186705589294, + "num_tokens": 775668372.0, + "step": 20329 + }, + { + "epoch": 2.5861849637450707, + "ewc_loss": 0.0497126579284668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4856328309397213e-05, + "grad_norm": 31.32859992980957, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8447977900505066, + "num_tokens": 775700578.0, + "step": 20330 + }, + { + "epoch": 2.586312174023661, + "ewc_loss": 0.04986418038606644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.49320910370443e-05, + "grad_norm": 31.357641220092773, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8785195350646973, + "num_tokens": 775746980.0, + "step": 20331 + }, + { + "epoch": 2.5864393843022517, + "ewc_loss": 0.04970533028244972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4852664864738472e-05, + "grad_norm": 31.369873046875, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8667336702346802, + "num_tokens": 775780762.0, + "step": 20332 + }, + { + "epoch": 
2.5865665945808423, + "ewc_loss": 0.049711525440216064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485576260369271e-05, + "grad_norm": 31.26633644104004, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8781405091285706, + "num_tokens": 775815591.0, + "step": 20333 + }, + { + "epoch": 2.586693804859433, + "ewc_loss": 0.049706488847732544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4853245122358203e-05, + "grad_norm": 31.381622314453125, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8625442981719971, + "num_tokens": 775856414.0, + "step": 20334 + }, + { + "epoch": 2.5868210151380233, + "ewc_loss": 0.04977339133620262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488669633748941e-05, + "grad_norm": 31.299226760864258, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8720107078552246, + "num_tokens": 775896939.0, + "step": 20335 + }, + { + "epoch": 2.586948225416614, + "ewc_loss": 0.04976791888475418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4883958758437075e-05, + "grad_norm": 31.29045867919922, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8666330575942993, + "num_tokens": 775941778.0, + "step": 20336 + }, + { + "epoch": 2.587075435695204, + "ewc_loss": 0.04970422759652138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4852113710949197e-05, + "grad_norm": 31.21074104309082, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8706890344619751, + "num_tokens": 775979346.0, + "step": 20337 + }, + { + "epoch": 2.587202645973795, + "ewc_loss": 0.04973505437374115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4867527827154845e-05, + "grad_norm": 31.286836624145508, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8691829442977905, + "num_tokens": 776013793.0, + "step": 20338 + }, + { + "epoch": 2.587329856252385, + "ewc_loss": 0.049809060990810394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4904529709601775e-05, + "grad_norm": 31.350051879882812, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8840051293373108, + "num_tokens": 776058513.0, + "step": 20339 + }, + { + "epoch": 2.587457066530976, + "ewc_loss": 0.04966795817017555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4833978386595845e-05, + "grad_norm": 31.259098052978516, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8714998960494995, + "num_tokens": 776102790.0, + "step": 20340 + }, + { + "epoch": 2.587584276809566, + "ewc_loss": 0.04977026581764221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4885133825591765e-05, + "grad_norm": 31.363609313964844, + "learning_rate": 1e-06, + "loss": 0.3669, + "mean_token_accuracy": 0.8876059055328369, + "num_tokens": 776133898.0, + "step": 20341 + }, + { + "epoch": 2.5877114870881566, + "ewc_loss": 0.04975211247801781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487605706846807e-05, + "grad_norm": 31.23856544494629, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8709354400634766, + "num_tokens": 776167590.0, + "step": 20342 + }, + { + "epoch": 2.587838697366747, + "ewc_loss": 0.04970938712358475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4854693037923425e-05, + "grad_norm": 31.313657760620117, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.865425705909729, + "num_tokens": 776207966.0, + "step": 20343 + }, + { + "epoch": 2.5879659076453376, + "ewc_loss": 0.04973940923810005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486970515747089e-05, + "grad_norm": 31.28788948059082, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8713540434837341, + "num_tokens": 776245601.0, + "step": 20344 + }, + { + "epoch": 2.588093117923928, + "ewc_loss": 0.04974450170993805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487224992364645e-05, + "grad_norm": 31.360864639282227, + "learning_rate": 1e-06, + "loss": 0.39, + 
"mean_token_accuracy": 0.8805326819419861, + "num_tokens": 776285377.0, + "step": 20345 + }, + { + "epoch": 2.5882203282025187, + "ewc_loss": 0.04964914917945862, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4824574211379513e-05, + "grad_norm": 31.229299545288086, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8615545630455017, + "num_tokens": 776328116.0, + "step": 20346 + }, + { + "epoch": 2.588347538481109, + "ewc_loss": 0.04978875070810318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4894376110751182e-05, + "grad_norm": 31.44503402709961, + "learning_rate": 1e-06, + "loss": 0.4725, + "mean_token_accuracy": 0.8569713830947876, + "num_tokens": 776369008.0, + "step": 20347 + }, + { + "epoch": 2.5884747487596997, + "ewc_loss": 0.04970833286643028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4854165530996397e-05, + "grad_norm": 31.17352294921875, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8622652292251587, + "num_tokens": 776410151.0, + "step": 20348 + }, + { + "epoch": 2.5886019590382903, + "ewc_loss": 0.049637310206890106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4818655219860375e-05, + "grad_norm": 31.364450454711914, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8954858779907227, + "num_tokens": 776445944.0, + "step": 20349 + }, + { + "epoch": 2.588729169316881, + "ewc_loss": 0.04978828877210617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489414509909693e-05, + "grad_norm": 31.273040771484375, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.876218855381012, + "num_tokens": 776485388.0, + "step": 20350 + }, + { + "epoch": 2.5888563795954713, + "ewc_loss": 0.04959695413708687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479847717040684e-05, + "grad_norm": 31.279251098632812, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8538689017295837, + "num_tokens": 776529135.0, + "step": 20351 + }, + { + "epoch": 
2.588983589874062, + "ewc_loss": 0.0496537983417511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4826898879837245e-05, + "grad_norm": 31.191650390625, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.86497962474823, + "num_tokens": 776564310.0, + "step": 20352 + }, + { + "epoch": 2.5891108001526524, + "ewc_loss": 0.04963553696870804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4817769372020848e-05, + "grad_norm": 31.27043914794922, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.884701669216156, + "num_tokens": 776609004.0, + "step": 20353 + }, + { + "epoch": 2.589238010431243, + "ewc_loss": 0.0497899055480957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4894952730392106e-05, + "grad_norm": 31.35175895690918, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8778166174888611, + "num_tokens": 776645796.0, + "step": 20354 + }, + { + "epoch": 2.5893652207098334, + "ewc_loss": 0.04957788065075874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.478894020896405e-05, + "grad_norm": 31.10625457763672, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8639323711395264, + "num_tokens": 776684248.0, + "step": 20355 + }, + { + "epoch": 2.589492430988424, + "ewc_loss": 0.04968222603201866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4841112463036552e-05, + "grad_norm": 31.264102935791016, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.874280571937561, + "num_tokens": 776722684.0, + "step": 20356 + }, + { + "epoch": 2.5896196412670145, + "ewc_loss": 0.04977869987487793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4889350243029185e-05, + "grad_norm": 31.202726364135742, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8762098550796509, + "num_tokens": 776764635.0, + "step": 20357 + }, + { + "epoch": 2.589746851545605, + "ewc_loss": 0.049707092344760895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4853545255609788e-05, 
+ "grad_norm": 31.333166122436523, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8612704277038574, + "num_tokens": 776805683.0, + "step": 20358 + }, + { + "epoch": 2.5898740618241956, + "ewc_loss": 0.04975949600338936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4879747797967866e-05, + "grad_norm": 31.20584487915039, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8746166825294495, + "num_tokens": 776841229.0, + "step": 20359 + }, + { + "epoch": 2.5900012721027856, + "ewc_loss": 0.04964599758386612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4822998966556042e-05, + "grad_norm": 31.262216567993164, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8789080381393433, + "num_tokens": 776881319.0, + "step": 20360 + }, + { + "epoch": 2.5901284823813766, + "ewc_loss": 0.049693383276462555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4846691303537227e-05, + "grad_norm": 31.341697692871094, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8772044777870178, + "num_tokens": 776919436.0, + "step": 20361 + }, + { + "epoch": 2.5902556926599667, + "ewc_loss": 0.049729123711586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4864561055437662e-05, + "grad_norm": 31.322113037109375, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.8544172048568726, + "num_tokens": 776958644.0, + "step": 20362 + }, + { + "epoch": 2.5903829029385577, + "ewc_loss": 0.0496862530708313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4843126084306277e-05, + "grad_norm": 31.161296844482422, + "learning_rate": 1e-06, + "loss": 0.3651, + "mean_token_accuracy": 0.8889248371124268, + "num_tokens": 776994509.0, + "step": 20363 + }, + { + "epoch": 2.5905101132171477, + "ewc_loss": 0.04965857416391373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.48292872129241e-05, + "grad_norm": 31.34977912902832, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 
0.8788485527038574, + "num_tokens": 777037557.0, + "step": 20364 + }, + { + "epoch": 2.5906373234957387, + "ewc_loss": 0.0498194582760334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490972838131711e-05, + "grad_norm": 31.31607437133789, + "learning_rate": 1e-06, + "loss": 0.4865, + "mean_token_accuracy": 0.852337658405304, + "num_tokens": 777077237.0, + "step": 20365 + }, + { + "epoch": 2.590764533774329, + "ewc_loss": 0.04959184303879738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.479592149029486e-05, + "grad_norm": 31.281965255737305, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.878505527973175, + "num_tokens": 777120813.0, + "step": 20366 + }, + { + "epoch": 2.5908917440529193, + "ewc_loss": 0.0497773140668869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4888657208066434e-05, + "grad_norm": 31.257474899291992, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.859581708908081, + "num_tokens": 777155809.0, + "step": 20367 + }, + { + "epoch": 2.59101895433151, + "ewc_loss": 0.04977044090628624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4885221137083136e-05, + "grad_norm": 31.167587280273438, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8716421127319336, + "num_tokens": 777195868.0, + "step": 20368 + }, + { + "epoch": 2.5911461646101004, + "ewc_loss": 0.04971477389335632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4857386961230077e-05, + "grad_norm": 31.345754623413086, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8764907121658325, + "num_tokens": 777233406.0, + "step": 20369 + }, + { + "epoch": 2.591273374888691, + "ewc_loss": 0.049852337688207626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4926168407546356e-05, + "grad_norm": 31.281417846679688, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8742191791534424, + "num_tokens": 777272817.0, + "step": 20370 + }, + { + "epoch": 2.5914005851672814, + "ewc_loss": 
0.049756865948438644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4878432668629102e-05, + "grad_norm": 31.27115821838379, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8676543235778809, + "num_tokens": 777312398.0, + "step": 20371 + }, + { + "epoch": 2.591527795445872, + "ewc_loss": 0.04973721131682396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4868606487871148e-05, + "grad_norm": 31.177656173706055, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8815882205963135, + "num_tokens": 777349375.0, + "step": 20372 + }, + { + "epoch": 2.5916550057244625, + "ewc_loss": 0.0498276986181736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4913850211305544e-05, + "grad_norm": 31.373937606811523, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8824702501296997, + "num_tokens": 777394806.0, + "step": 20373 + }, + { + "epoch": 2.591782216003053, + "ewc_loss": 0.049743689596652985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487184428900946e-05, + "grad_norm": 31.25183868408203, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8775694966316223, + "num_tokens": 777434350.0, + "step": 20374 + }, + { + "epoch": 2.5919094262816436, + "ewc_loss": 0.04972894862294197, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486447374394629e-05, + "grad_norm": 31.05664825439453, + "learning_rate": 1e-06, + "loss": 0.487, + "mean_token_accuracy": 0.8468055725097656, + "num_tokens": 777469690.0, + "step": 20375 + }, + { + "epoch": 2.592036636560234, + "ewc_loss": 0.04976779222488403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4883896912797354e-05, + "grad_norm": 31.412052154541016, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8663471341133118, + "num_tokens": 777502095.0, + "step": 20376 + }, + { + "epoch": 2.5921638468388246, + "ewc_loss": 0.04994291439652443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.497145760571584e-05, + "grad_norm": 
31.18359375, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8837589025497437, + "num_tokens": 777540387.0, + "step": 20377 + }, + { + "epoch": 2.592291057117415, + "ewc_loss": 0.04980355501174927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490177757863421e-05, + "grad_norm": 31.36650848388672, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.87819504737854, + "num_tokens": 777581855.0, + "step": 20378 + }, + { + "epoch": 2.5924182673960057, + "ewc_loss": 0.04982646927237511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4913235392887145e-05, + "grad_norm": 31.215944290161133, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8723089694976807, + "num_tokens": 777619852.0, + "step": 20379 + }, + { + "epoch": 2.592545477674596, + "ewc_loss": 0.049731794744729996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4865898012649268e-05, + "grad_norm": 31.355838775634766, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8653292655944824, + "num_tokens": 777662772.0, + "step": 20380 + }, + { + "epoch": 2.5926726879531867, + "ewc_loss": 0.04982464760541916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.491232407919597e-05, + "grad_norm": 31.219539642333984, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8554704785346985, + "num_tokens": 777701171.0, + "step": 20381 + }, + { + "epoch": 2.5927998982317773, + "ewc_loss": 0.04982230067253113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4911150831030682e-05, + "grad_norm": 31.33625602722168, + "learning_rate": 1e-06, + "loss": 0.4651, + "mean_token_accuracy": 0.858675479888916, + "num_tokens": 777738245.0, + "step": 20382 + }, + { + "epoch": 2.592927108510368, + "ewc_loss": 0.049857594072818756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.492879684723448e-05, + "grad_norm": 31.173471450805664, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8726515769958496, + "num_tokens": 
777775732.0, + "step": 20383 + }, + { + "epoch": 2.5930543187889583, + "ewc_loss": 0.04972090572118759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4860453777364455e-05, + "grad_norm": 31.29644775390625, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8724513053894043, + "num_tokens": 777811077.0, + "step": 20384 + }, + { + "epoch": 2.5931815290675484, + "ewc_loss": 0.04990801587700844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4954008040367626e-05, + "grad_norm": 31.24390983581543, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8703116178512573, + "num_tokens": 777854107.0, + "step": 20385 + }, + { + "epoch": 2.5933087393461394, + "ewc_loss": 0.04975741356611252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4878707336029038e-05, + "grad_norm": 31.31217384338379, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8715826272964478, + "num_tokens": 777891950.0, + "step": 20386 + }, + { + "epoch": 2.5934359496247295, + "ewc_loss": 0.04981599748134613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4907998522394337e-05, + "grad_norm": 31.286205291748047, + "learning_rate": 1e-06, + "loss": 0.3618, + "mean_token_accuracy": 0.8897468447685242, + "num_tokens": 777932638.0, + "step": 20387 + }, + { + "epoch": 2.5935631599033204, + "ewc_loss": 0.04980255290865898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4901277356548235e-05, + "grad_norm": 31.3080997467041, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8709967732429504, + "num_tokens": 777977733.0, + "step": 20388 + }, + { + "epoch": 2.5936903701819105, + "ewc_loss": 0.04974617063999176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4873084839782678e-05, + "grad_norm": 31.249399185180664, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8770380616188049, + "num_tokens": 778020451.0, + "step": 20389 + }, + { + "epoch": 2.5938175804605015, + "ewc_loss": 0.049763232469558716, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4881615900085308e-05, + "grad_norm": 31.226346969604492, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8779988288879395, + "num_tokens": 778059594.0, + "step": 20390 + }, + { + "epoch": 2.5939447907390916, + "ewc_loss": 0.0498400516808033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4920025680330582e-05, + "grad_norm": 31.300514221191406, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8757557272911072, + "num_tokens": 778099235.0, + "step": 20391 + }, + { + "epoch": 2.594072001017682, + "ewc_loss": 0.04982338473200798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.491169288987294e-05, + "grad_norm": 31.35506820678711, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8831022381782532, + "num_tokens": 778135430.0, + "step": 20392 + }, + { + "epoch": 2.5941992112962726, + "ewc_loss": 0.049820128828287125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4910064894356765e-05, + "grad_norm": 31.234012603759766, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8651313185691833, + "num_tokens": 778175111.0, + "step": 20393 + }, + { + "epoch": 2.594326421574863, + "ewc_loss": 0.04979528486728668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489764301571995e-05, + "grad_norm": 31.369733810424805, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8632054924964905, + "num_tokens": 778211954.0, + "step": 20394 + }, + { + "epoch": 2.5944536318534537, + "ewc_loss": 0.04981904849410057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4909524654503912e-05, + "grad_norm": 31.337051391601562, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8569846153259277, + "num_tokens": 778246580.0, + "step": 20395 + }, + { + "epoch": 2.594580842132044, + "ewc_loss": 0.04974522441625595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4872611902537756e-05, + "grad_norm": 31.355302810668945, + 
"learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8785942196846008, + "num_tokens": 778288659.0, + "step": 20396 + }, + { + "epoch": 2.5947080524106347, + "ewc_loss": 0.04985792934894562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4928964194259606e-05, + "grad_norm": 31.415695190429688, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8715733289718628, + "num_tokens": 778325365.0, + "step": 20397 + }, + { + "epoch": 2.5948352626892253, + "ewc_loss": 0.04972865805029869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4864328224794008e-05, + "grad_norm": 31.291400909423828, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8795884251594543, + "num_tokens": 778362982.0, + "step": 20398 + }, + { + "epoch": 2.594962472967816, + "ewc_loss": 0.04974662512540817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487331221345812e-05, + "grad_norm": 31.318700790405273, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8568949699401855, + "num_tokens": 778394230.0, + "step": 20399 + }, + { + "epoch": 2.5950896832464063, + "ewc_loss": 0.04980657622218132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4903287339839153e-05, + "grad_norm": 31.389707565307617, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8619464039802551, + "num_tokens": 778434345.0, + "step": 20400 + }, + { + "epoch": 2.595216893524997, + "ewc_loss": 0.049681633710861206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4840817786753178e-05, + "grad_norm": 31.248308181762695, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8676658868789673, + "num_tokens": 778478303.0, + "step": 20401 + }, + { + "epoch": 2.5953441038035874, + "ewc_loss": 0.049730636179447174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4865317755029537e-05, + "grad_norm": 31.258018493652344, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8775629997253418, + "num_tokens": 
778513915.0, + "step": 20402 + }, + { + "epoch": 2.595471314082178, + "ewc_loss": 0.04975227639079094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487613892299123e-05, + "grad_norm": 31.307571411132812, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8874956369400024, + "num_tokens": 778554294.0, + "step": 20403 + }, + { + "epoch": 2.5955985243607684, + "ewc_loss": 0.04973405972123146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4867029424058273e-05, + "grad_norm": 31.307052612304688, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8698247671127319, + "num_tokens": 778592836.0, + "step": 20404 + }, + { + "epoch": 2.595725734639359, + "ewc_loss": 0.04976588860154152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4882943762349896e-05, + "grad_norm": 31.35295295715332, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8623057007789612, + "num_tokens": 778629316.0, + "step": 20405 + }, + { + "epoch": 2.5958529449179495, + "ewc_loss": 0.04974915459752083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4874576411093585e-05, + "grad_norm": 31.303682327270508, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8673734664916992, + "num_tokens": 778659065.0, + "step": 20406 + }, + { + "epoch": 2.59598015519654, + "ewc_loss": 0.04970422387123108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4852111891959794e-05, + "grad_norm": 31.350126266479492, + "learning_rate": 1e-06, + "loss": 0.3676, + "mean_token_accuracy": 0.8881469964981079, + "num_tokens": 778693949.0, + "step": 20407 + }, + { + "epoch": 2.5961073654751305, + "ewc_loss": 0.04978989437222481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4894947273423895e-05, + "grad_norm": 31.387104034423828, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8763778209686279, + "num_tokens": 778738045.0, + "step": 20408 + }, + { + "epoch": 2.596234575753721, + "ewc_loss": 0.04973362013697624, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4866809326340444e-05, + "grad_norm": 31.298391342163086, + "learning_rate": 1e-06, + "loss": 0.541, + "mean_token_accuracy": 0.8298205137252808, + "num_tokens": 778769578.0, + "step": 20409 + }, + { + "epoch": 2.596361786032311, + "ewc_loss": 0.04977445304393768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4887225663405843e-05, + "grad_norm": 31.36800765991211, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8671587705612183, + "num_tokens": 778810052.0, + "step": 20410 + }, + { + "epoch": 2.596488996310902, + "ewc_loss": 0.04974731430411339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4873657821444795e-05, + "grad_norm": 31.296737670898438, + "learning_rate": 1e-06, + "loss": 0.4393, + "mean_token_accuracy": 0.8658934831619263, + "num_tokens": 778850868.0, + "step": 20411 + }, + { + "epoch": 2.596616206589492, + "ewc_loss": 0.04971202462911606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4856011805240996e-05, + "grad_norm": 31.473793029785156, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.878726065158844, + "num_tokens": 778890129.0, + "step": 20412 + }, + { + "epoch": 2.596743416868083, + "ewc_loss": 0.049813948571681976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490697443136014e-05, + "grad_norm": 31.283119201660156, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8753323554992676, + "num_tokens": 778923181.0, + "step": 20413 + }, + { + "epoch": 2.5968706271466733, + "ewc_loss": 0.04971152916550636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4855764422682114e-05, + "grad_norm": 31.262426376342773, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.863766074180603, + "num_tokens": 778963328.0, + "step": 20414 + }, + { + "epoch": 2.596997837425264, + "ewc_loss": 0.04975070059299469, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4875349481590092e-05, + "grad_norm": 31.35357093811035, + 
"learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8825165629386902, + "num_tokens": 779003932.0, + "step": 20415 + }, + { + "epoch": 2.5971250477038543, + "ewc_loss": 0.0497177429497242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.485887125658337e-05, + "grad_norm": 31.10726547241211, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8611805438995361, + "num_tokens": 779043913.0, + "step": 20416 + }, + { + "epoch": 2.597252257982445, + "ewc_loss": 0.04963439330458641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.481719639035873e-05, + "grad_norm": 31.320404052734375, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8685239553451538, + "num_tokens": 779077126.0, + "step": 20417 + }, + { + "epoch": 2.5973794682610354, + "ewc_loss": 0.049927737563848495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4963868781924248e-05, + "grad_norm": 31.234601974487305, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8763647079467773, + "num_tokens": 779111544.0, + "step": 20418 + }, + { + "epoch": 2.597506678539626, + "ewc_loss": 0.04975056275725365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487528217898216e-05, + "grad_norm": 31.367002487182617, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8710480332374573, + "num_tokens": 779145493.0, + "step": 20419 + }, + { + "epoch": 2.5976338888182164, + "ewc_loss": 0.04994542524218559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4972712708404288e-05, + "grad_norm": 31.3627872467041, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.864412784576416, + "num_tokens": 779189216.0, + "step": 20420 + }, + { + "epoch": 2.597761099096807, + "ewc_loss": 0.04974215105175972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487107485649176e-05, + "grad_norm": 31.287492752075195, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8788399696350098, + "num_tokens": 779225914.0, + 
"step": 20421 + }, + { + "epoch": 2.5978883093753975, + "ewc_loss": 0.04979163780808449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4895818569348194e-05, + "grad_norm": 31.183042526245117, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8764280080795288, + "num_tokens": 779264118.0, + "step": 20422 + }, + { + "epoch": 2.598015519653988, + "ewc_loss": 0.049885157495737076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4942579329945147e-05, + "grad_norm": 31.412439346313477, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8755383491516113, + "num_tokens": 779301694.0, + "step": 20423 + }, + { + "epoch": 2.5981427299325786, + "ewc_loss": 0.049801234155893326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4900617063394748e-05, + "grad_norm": 31.222497940063477, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8704016208648682, + "num_tokens": 779339012.0, + "step": 20424 + }, + { + "epoch": 2.598269940211169, + "ewc_loss": 0.04975457116961479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4877284886315465e-05, + "grad_norm": 31.2581787109375, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8765064477920532, + "num_tokens": 779376358.0, + "step": 20425 + }, + { + "epoch": 2.5983971504897596, + "ewc_loss": 0.04979914799332619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4899574782466516e-05, + "grad_norm": 31.24649429321289, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8703715801239014, + "num_tokens": 779416344.0, + "step": 20426 + }, + { + "epoch": 2.59852436076835, + "ewc_loss": 0.04989681392908096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4948407371994108e-05, + "grad_norm": 31.297964096069336, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8608918190002441, + "num_tokens": 779452189.0, + "step": 20427 + }, + { + "epoch": 2.5986515710469407, + "ewc_loss": 0.04984679073095322, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.492339444870595e-05, + "grad_norm": 31.22878074645996, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8711791634559631, + "num_tokens": 779493513.0, + "step": 20428 + }, + { + "epoch": 2.598778781325531, + "ewc_loss": 0.04985131695866585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4925659090513363e-05, + "grad_norm": 31.245847702026367, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8780266046524048, + "num_tokens": 779533061.0, + "step": 20429 + }, + { + "epoch": 2.5989059916041217, + "ewc_loss": 0.0498611219227314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.493056126695592e-05, + "grad_norm": 31.27274513244629, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.864170491695404, + "num_tokens": 779570159.0, + "step": 20430 + }, + { + "epoch": 2.5990332018827123, + "ewc_loss": 0.04988647252321243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4943235985119827e-05, + "grad_norm": 31.218446731567383, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8695133924484253, + "num_tokens": 779609066.0, + "step": 20431 + }, + { + "epoch": 2.599160412161303, + "ewc_loss": 0.049750491976737976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487524579919409e-05, + "grad_norm": 31.163570404052734, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8709468841552734, + "num_tokens": 779651087.0, + "step": 20432 + }, + { + "epoch": 2.5992876224398933, + "ewc_loss": 0.04986416548490524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4932081942097284e-05, + "grad_norm": 31.35260009765625, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8855905532836914, + "num_tokens": 779689531.0, + "step": 20433 + }, + { + "epoch": 2.599414832718484, + "ewc_loss": 0.04980688542127609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4903441953938454e-05, + "grad_norm": 31.281578063964844, + "learning_rate": 1e-06, + "loss": 
0.4249, + "mean_token_accuracy": 0.8670187592506409, + "num_tokens": 779724062.0, + "step": 20434 + }, + { + "epoch": 2.599542042997074, + "ewc_loss": 0.0498967207968235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4948360078269616e-05, + "grad_norm": 31.387277603149414, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8586965203285217, + "num_tokens": 779766029.0, + "step": 20435 + }, + { + "epoch": 2.599669253275665, + "ewc_loss": 0.04978298768401146, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489149301254656e-05, + "grad_norm": 31.221513748168945, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8562660813331604, + "num_tokens": 779811065.0, + "step": 20436 + }, + { + "epoch": 2.599796463554255, + "ewc_loss": 0.049832213670015335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4916107577155344e-05, + "grad_norm": 31.320886611938477, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8582851886749268, + "num_tokens": 779852442.0, + "step": 20437 + }, + { + "epoch": 2.599923673832846, + "ewc_loss": 0.0498175173997879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490875885996502e-05, + "grad_norm": 31.293899536132812, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8761106729507446, + "num_tokens": 779885993.0, + "step": 20438 + }, + { + "epoch": 2.600050884111436, + "ewc_loss": 0.04982199892401695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4910999854910187e-05, + "grad_norm": 31.312557220458984, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8755548000335693, + "num_tokens": 779927791.0, + "step": 20439 + }, + { + "epoch": 2.6001780943900266, + "ewc_loss": 0.04986178129911423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.493089050403796e-05, + "grad_norm": 31.269624710083008, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8750776052474976, + "num_tokens": 779961517.0, + "step": 20440 + }, + { + "epoch": 
2.600305304668617, + "ewc_loss": 0.049842871725559235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.492143539711833e-05, + "grad_norm": 31.31148338317871, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8590289354324341, + "num_tokens": 779992902.0, + "step": 20441 + }, + { + "epoch": 2.6004325149472076, + "ewc_loss": 0.04982498288154602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4912491426221095e-05, + "grad_norm": 31.234155654907227, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.872143030166626, + "num_tokens": 780032849.0, + "step": 20442 + }, + { + "epoch": 2.600559725225798, + "ewc_loss": 0.04982522130012512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.491261147952173e-05, + "grad_norm": 31.338342666625977, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8837289810180664, + "num_tokens": 780072563.0, + "step": 20443 + }, + { + "epoch": 2.6006869355043887, + "ewc_loss": 0.049783460795879364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4891731300158426e-05, + "grad_norm": 31.216163635253906, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8714174032211304, + "num_tokens": 780115299.0, + "step": 20444 + }, + { + "epoch": 2.600814145782979, + "ewc_loss": 0.0497184656560421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4859233235474676e-05, + "grad_norm": 31.381513595581055, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8696919679641724, + "num_tokens": 780155792.0, + "step": 20445 + }, + { + "epoch": 2.6009413560615697, + "ewc_loss": 0.04983728006482124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.491863961040508e-05, + "grad_norm": 31.237361907958984, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8675997257232666, + "num_tokens": 780189040.0, + "step": 20446 + }, + { + "epoch": 2.6010685663401603, + "ewc_loss": 0.049764424562454224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.488221252860967e-05, + "grad_norm": 31.282588958740234, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8710014820098877, + "num_tokens": 780229463.0, + "step": 20447 + }, + { + "epoch": 2.601195776618751, + "ewc_loss": 0.04981211572885513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4906057660700753e-05, + "grad_norm": 31.208629608154297, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8832699060440063, + "num_tokens": 780265961.0, + "step": 20448 + }, + { + "epoch": 2.6013229868973413, + "ewc_loss": 0.04980984702706337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4904924430302344e-05, + "grad_norm": 31.293628692626953, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8656097650527954, + "num_tokens": 780303220.0, + "step": 20449 + }, + { + "epoch": 2.601450197175932, + "ewc_loss": 0.04993271082639694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4966355340438895e-05, + "grad_norm": 31.27480125427246, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8689529895782471, + "num_tokens": 780340456.0, + "step": 20450 + }, + { + "epoch": 2.6015774074545224, + "ewc_loss": 0.049809109419584274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4904555175453424e-05, + "grad_norm": 31.45012092590332, + "learning_rate": 1e-06, + "loss": 0.4429, + "mean_token_accuracy": 0.869428813457489, + "num_tokens": 780381402.0, + "step": 20451 + }, + { + "epoch": 2.601704617733113, + "ewc_loss": 0.049826864153146744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4913431843742728e-05, + "grad_norm": 31.346343994140625, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8754385709762573, + "num_tokens": 780422571.0, + "step": 20452 + }, + { + "epoch": 2.6018318280117034, + "ewc_loss": 0.04976128414273262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488064274075441e-05, + "grad_norm": 31.364713668823242, + "learning_rate": 1e-06, + "loss": 0.4391, + 
"mean_token_accuracy": 0.8672078847885132, + "num_tokens": 780462447.0, + "step": 20453 + }, + { + "epoch": 2.601959038290294, + "ewc_loss": 0.0497438944876194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487194797140546e-05, + "grad_norm": 31.28011131286621, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8708478808403015, + "num_tokens": 780499183.0, + "step": 20454 + }, + { + "epoch": 2.6020862485688845, + "ewc_loss": 0.049825508147478104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4912753360695206e-05, + "grad_norm": 31.377056121826172, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8596370816230774, + "num_tokens": 780539216.0, + "step": 20455 + }, + { + "epoch": 2.602213458847475, + "ewc_loss": 0.04982093349099159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.491046689101495e-05, + "grad_norm": 31.22120475769043, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8778709173202515, + "num_tokens": 780578725.0, + "step": 20456 + }, + { + "epoch": 2.6023406691260655, + "ewc_loss": 0.04977194964885712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4885974198696204e-05, + "grad_norm": 31.346725463867188, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8716386556625366, + "num_tokens": 780615175.0, + "step": 20457 + }, + { + "epoch": 2.6024678794046556, + "ewc_loss": 0.04981701821088791, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4908509658416733e-05, + "grad_norm": 31.309734344482422, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8740487098693848, + "num_tokens": 780651186.0, + "step": 20458 + }, + { + "epoch": 2.6025950896832466, + "ewc_loss": 0.04973237216472626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4866185412975028e-05, + "grad_norm": 31.352901458740234, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8636016845703125, + "num_tokens": 780685550.0, + "step": 20459 + }, + { + "epoch": 
2.6027222999618367, + "ewc_loss": 0.049769945442676544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488497193553485e-05, + "grad_norm": 31.256240844726562, + "learning_rate": 1e-06, + "loss": 0.349, + "mean_token_accuracy": 0.8931059241294861, + "num_tokens": 780725477.0, + "step": 20460 + }, + { + "epoch": 2.6028495102404277, + "ewc_loss": 0.04979965463280678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489982762199361e-05, + "grad_norm": 31.31452751159668, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8734710216522217, + "num_tokens": 780762481.0, + "step": 20461 + }, + { + "epoch": 2.6029767205190177, + "ewc_loss": 0.04986650496721268, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4933253371273167e-05, + "grad_norm": 31.34199333190918, + "learning_rate": 1e-06, + "loss": 0.4832, + "mean_token_accuracy": 0.8516315817832947, + "num_tokens": 780800305.0, + "step": 20462 + }, + { + "epoch": 2.6031039307976087, + "ewc_loss": 0.049791909754276276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489595499355346e-05, + "grad_norm": 31.347187042236328, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8730051517486572, + "num_tokens": 780836271.0, + "step": 20463 + }, + { + "epoch": 2.603231141076199, + "ewc_loss": 0.04976225271821022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4881126591935754e-05, + "grad_norm": 31.299959182739258, + "learning_rate": 1e-06, + "loss": 0.3772, + "mean_token_accuracy": 0.8830925226211548, + "num_tokens": 780877363.0, + "step": 20464 + }, + { + "epoch": 2.6033583513547893, + "ewc_loss": 0.04985513165593147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.492756539140828e-05, + "grad_norm": 31.338274002075195, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8601940870285034, + "num_tokens": 780922982.0, + "step": 20465 + }, + { + "epoch": 2.60348556163338, + "ewc_loss": 0.04980366677045822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.490183396730572e-05, + "grad_norm": 31.4229679107666, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.86935955286026, + "num_tokens": 780957925.0, + "step": 20466 + }, + { + "epoch": 2.6036127719119704, + "ewc_loss": 0.04978320002555847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489160033292137e-05, + "grad_norm": 31.205718994140625, + "learning_rate": 1e-06, + "loss": 0.4905, + "mean_token_accuracy": 0.8535804748535156, + "num_tokens": 780996922.0, + "step": 20467 + }, + { + "epoch": 2.603739982190561, + "ewc_loss": 0.049775801599025726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488790050847456e-05, + "grad_norm": 31.378068923950195, + "learning_rate": 1e-06, + "loss": 0.4887, + "mean_token_accuracy": 0.8540052175521851, + "num_tokens": 781037209.0, + "step": 20468 + }, + { + "epoch": 2.6038671924691514, + "ewc_loss": 0.04985741525888443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4928707716753706e-05, + "grad_norm": 31.14318084716797, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8648425340652466, + "num_tokens": 781078077.0, + "step": 20469 + }, + { + "epoch": 2.603994402747742, + "ewc_loss": 0.04981759935617447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4908798877731897e-05, + "grad_norm": 31.324684143066406, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8692814707756042, + "num_tokens": 781122516.0, + "step": 20470 + }, + { + "epoch": 2.6041216130263325, + "ewc_loss": 0.049872174859046936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4936087356763892e-05, + "grad_norm": 31.41448974609375, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8633902072906494, + "num_tokens": 781156290.0, + "step": 20471 + }, + { + "epoch": 2.604248823304923, + "ewc_loss": 0.04972989484667778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4864946681191213e-05, + "grad_norm": 31.181415557861328, + "learning_rate": 1e-06, + "loss": 0.4044, + 
"mean_token_accuracy": 0.8788459300994873, + "num_tokens": 781192769.0, + "step": 20472 + }, + { + "epoch": 2.6043760335835135, + "ewc_loss": 0.049789704382419586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489485268597491e-05, + "grad_norm": 31.247697830200195, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8753045201301575, + "num_tokens": 781227594.0, + "step": 20473 + }, + { + "epoch": 2.604503243862104, + "ewc_loss": 0.04982982203364372, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.491491068212781e-05, + "grad_norm": 31.325077056884766, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8819348812103271, + "num_tokens": 781259369.0, + "step": 20474 + }, + { + "epoch": 2.6046304541406946, + "ewc_loss": 0.04991097003221512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4955485059763305e-05, + "grad_norm": 31.170753479003906, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8706871271133423, + "num_tokens": 781296788.0, + "step": 20475 + }, + { + "epoch": 2.604757664419285, + "ewc_loss": 0.0498213954269886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.49106979026692e-05, + "grad_norm": 31.344125747680664, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8730102181434631, + "num_tokens": 781333103.0, + "step": 20476 + }, + { + "epoch": 2.6048848746978757, + "ewc_loss": 0.049951400607824326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.497569948900491e-05, + "grad_norm": 31.20509910583496, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8796581029891968, + "num_tokens": 781372691.0, + "step": 20477 + }, + { + "epoch": 2.605012084976466, + "ewc_loss": 0.0498485267162323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4924263925640844e-05, + "grad_norm": 31.292835235595703, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.866987407207489, + "num_tokens": 781410480.0, + "step": 20478 + }, + { + "epoch": 
2.6051392952550567, + "ewc_loss": 0.04995408654212952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.497704372217413e-05, + "grad_norm": 31.3044376373291, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.862690806388855, + "num_tokens": 781454110.0, + "step": 20479 + }, + { + "epoch": 2.6052665055336472, + "ewc_loss": 0.04989255219697952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4946275516413152e-05, + "grad_norm": 31.188871383666992, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8689066171646118, + "num_tokens": 781494034.0, + "step": 20480 + }, + { + "epoch": 2.6053937158122378, + "ewc_loss": 0.04991001635789871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4955008484539576e-05, + "grad_norm": 31.32132911682129, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8690065741539001, + "num_tokens": 781536225.0, + "step": 20481 + }, + { + "epoch": 2.6055209260908283, + "ewc_loss": 0.04992944374680519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496472188795451e-05, + "grad_norm": 31.25119400024414, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8768696188926697, + "num_tokens": 781573987.0, + "step": 20482 + }, + { + "epoch": 2.6056481363694184, + "ewc_loss": 0.049961529672145844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4980765374493785e-05, + "grad_norm": 31.41497230529785, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8762321472167969, + "num_tokens": 781608987.0, + "step": 20483 + }, + { + "epoch": 2.6057753466480094, + "ewc_loss": 0.05002095550298691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5010478566400707e-05, + "grad_norm": 31.273611068725586, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.878319263458252, + "num_tokens": 781646185.0, + "step": 20484 + }, + { + "epoch": 2.6059025569265994, + "ewc_loss": 0.049857236444950104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4928618586272933e-05, + "grad_norm": 31.33070945739746, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8612162470817566, + "num_tokens": 781682999.0, + "step": 20485 + }, + { + "epoch": 2.6060297672051904, + "ewc_loss": 0.04996927082538605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498463618394453e-05, + "grad_norm": 31.342979431152344, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.8687844276428223, + "num_tokens": 781726052.0, + "step": 20486 + }, + { + "epoch": 2.6061569774837805, + "ewc_loss": 0.04988696053624153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4943479729699902e-05, + "grad_norm": 31.30643653869629, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8840252757072449, + "num_tokens": 781760562.0, + "step": 20487 + }, + { + "epoch": 2.6062841877623715, + "ewc_loss": 0.04983345419168472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4916727852541953e-05, + "grad_norm": 31.25956916809082, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.860214352607727, + "num_tokens": 781803765.0, + "step": 20488 + }, + { + "epoch": 2.6064113980409616, + "ewc_loss": 0.04997527226805687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4987635697470978e-05, + "grad_norm": 31.274850845336914, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.871817409992218, + "num_tokens": 781845419.0, + "step": 20489 + }, + { + "epoch": 2.606538608319552, + "ewc_loss": 0.049922920763492584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4961460439953953e-05, + "grad_norm": 31.3554744720459, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8793835043907166, + "num_tokens": 781883371.0, + "step": 20490 + }, + { + "epoch": 2.6066658185981426, + "ewc_loss": 0.049947574734687805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4973787731141783e-05, + "grad_norm": 31.245189666748047, + "learning_rate": 1e-06, + "loss": 0.4258, + 
"mean_token_accuracy": 0.8698474764823914, + "num_tokens": 781919669.0, + "step": 20491 + }, + { + "epoch": 2.606793028876733, + "ewc_loss": 0.0499085858464241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4954293621703982e-05, + "grad_norm": 31.329286575317383, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.8773109316825867, + "num_tokens": 781953212.0, + "step": 20492 + }, + { + "epoch": 2.6069202391553237, + "ewc_loss": 0.049949947744607925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4974973712232895e-05, + "grad_norm": 31.34925651550293, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8747666478157043, + "num_tokens": 781996824.0, + "step": 20493 + }, + { + "epoch": 2.607047449433914, + "ewc_loss": 0.049870528280735016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4935263354564086e-05, + "grad_norm": 31.108383178710938, + "learning_rate": 1e-06, + "loss": 0.3543, + "mean_token_accuracy": 0.8904911279678345, + "num_tokens": 782035350.0, + "step": 20494 + }, + { + "epoch": 2.6071746597125047, + "ewc_loss": 0.04988661780953407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.494330874469597e-05, + "grad_norm": 31.373559951782227, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8654518127441406, + "num_tokens": 782073217.0, + "step": 20495 + }, + { + "epoch": 2.6073018699910953, + "ewc_loss": 0.04993360862135887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496680463082157e-05, + "grad_norm": 31.32141876220703, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8641105890274048, + "num_tokens": 782110777.0, + "step": 20496 + }, + { + "epoch": 2.607429080269686, + "ewc_loss": 0.04981492832303047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4907463739509694e-05, + "grad_norm": 31.262588500976562, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8659642338752747, + "num_tokens": 782153300.0, + "step": 20497 + }, + { + "epoch": 
2.6075562905482763, + "ewc_loss": 0.04985206574201584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.492603198334109e-05, + "grad_norm": 31.37528419494629, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8765131831169128, + "num_tokens": 782191945.0, + "step": 20498 + }, + { + "epoch": 2.607683500826867, + "ewc_loss": 0.04995780810713768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4978904548333958e-05, + "grad_norm": 31.464101791381836, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8819307088851929, + "num_tokens": 782230472.0, + "step": 20499 + }, + { + "epoch": 2.6078107111054574, + "ewc_loss": 0.049818720668554306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4909360945457593e-05, + "grad_norm": 31.293458938598633, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8612930774688721, + "num_tokens": 782263868.0, + "step": 20500 + }, + { + "epoch": 2.607937921384048, + "ewc_loss": 0.049855854362249374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4927927370299585e-05, + "grad_norm": 31.360271453857422, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8736551403999329, + "num_tokens": 782299781.0, + "step": 20501 + }, + { + "epoch": 2.6080651316626384, + "ewc_loss": 0.04982242360711098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4911212676670402e-05, + "grad_norm": 31.38632583618164, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.86454176902771, + "num_tokens": 782343376.0, + "step": 20502 + }, + { + "epoch": 2.608192341941229, + "ewc_loss": 0.04977119714021683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4885599486879073e-05, + "grad_norm": 31.271728515625, + "learning_rate": 1e-06, + "loss": 0.3871, + "mean_token_accuracy": 0.8836397528648376, + "num_tokens": 782389894.0, + "step": 20503 + }, + { + "epoch": 2.6083195522198195, + "ewc_loss": 0.04981890693306923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4909453713917173e-05, + "grad_norm": 31.34010124206543, + "learning_rate": 1e-06, + "loss": 0.4068, + "mean_token_accuracy": 0.8750934600830078, + "num_tokens": 782430055.0, + "step": 20504 + }, + { + "epoch": 2.60844676249841, + "ewc_loss": 0.04990595579147339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.495297849236522e-05, + "grad_norm": 31.27728271484375, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8727914094924927, + "num_tokens": 782467211.0, + "step": 20505 + }, + { + "epoch": 2.6085739727770005, + "ewc_loss": 0.049746714532375336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.487335768819321e-05, + "grad_norm": 31.291799545288086, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.863449215888977, + "num_tokens": 782505913.0, + "step": 20506 + }, + { + "epoch": 2.608701183055591, + "ewc_loss": 0.049882229417562485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4941115043475293e-05, + "grad_norm": 31.212528228759766, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8706848621368408, + "num_tokens": 782541858.0, + "step": 20507 + }, + { + "epoch": 2.608828393334181, + "ewc_loss": 0.04984874650835991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4924373065005057e-05, + "grad_norm": 31.2797794342041, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8719748258590698, + "num_tokens": 782575854.0, + "step": 20508 + }, + { + "epoch": 2.608955603612772, + "ewc_loss": 0.049947112798690796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4973556719487533e-05, + "grad_norm": 31.32855224609375, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8731871843338013, + "num_tokens": 782614005.0, + "step": 20509 + }, + { + "epoch": 2.609082813891362, + "ewc_loss": 0.049838561564683914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4919279894675128e-05, + "grad_norm": 31.39154052734375, + "learning_rate": 1e-06, + "loss": 0.4028, + 
"mean_token_accuracy": 0.8791899681091309, + "num_tokens": 782650147.0, + "step": 20510 + }, + { + "epoch": 2.609210024169953, + "ewc_loss": 0.049865592271089554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4932796804932877e-05, + "grad_norm": 31.3027286529541, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8711559176445007, + "num_tokens": 782683215.0, + "step": 20511 + }, + { + "epoch": 2.6093372344485433, + "ewc_loss": 0.04990821331739426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4954106265795417e-05, + "grad_norm": 31.480716705322266, + "learning_rate": 1e-06, + "loss": 0.4046, + "mean_token_accuracy": 0.8772499561309814, + "num_tokens": 782721184.0, + "step": 20512 + }, + { + "epoch": 2.609464444727134, + "ewc_loss": 0.04990767315030098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4953837055363692e-05, + "grad_norm": 31.416547775268555, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8662674427032471, + "num_tokens": 782759891.0, + "step": 20513 + }, + { + "epoch": 2.6095916550057243, + "ewc_loss": 0.04985488951206207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4927445338107646e-05, + "grad_norm": 31.399490356445312, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8607760071754456, + "num_tokens": 782796398.0, + "step": 20514 + }, + { + "epoch": 2.609718865284315, + "ewc_loss": 0.04977065324783325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488532663846854e-05, + "grad_norm": 31.327802658081055, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8812614679336548, + "num_tokens": 782835968.0, + "step": 20515 + }, + { + "epoch": 2.6098460755629054, + "ewc_loss": 0.04977896809577942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4889484848245047e-05, + "grad_norm": 31.307193756103516, + "learning_rate": 1e-06, + "loss": 0.38, + "mean_token_accuracy": 0.8828853964805603, + "num_tokens": 782871367.0, + "step": 20516 + }, + { + "epoch": 
2.609973285841496, + "ewc_loss": 0.04985993728041649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4929968276410364e-05, + "grad_norm": 31.49583625793457, + "learning_rate": 1e-06, + "loss": 0.3636, + "mean_token_accuracy": 0.8922774791717529, + "num_tokens": 782905766.0, + "step": 20517 + }, + { + "epoch": 2.6101004961200864, + "ewc_loss": 0.049789849668741226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4894925445551053e-05, + "grad_norm": 31.17723846435547, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8762522339820862, + "num_tokens": 782942256.0, + "step": 20518 + }, + { + "epoch": 2.610227706398677, + "ewc_loss": 0.04973448067903519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4867240426829085e-05, + "grad_norm": 31.342178344726562, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8819378614425659, + "num_tokens": 782976156.0, + "step": 20519 + }, + { + "epoch": 2.6103549166772675, + "ewc_loss": 0.04991663992404938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4958320864243433e-05, + "grad_norm": 31.17205238342285, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8809150457382202, + "num_tokens": 783011450.0, + "step": 20520 + }, + { + "epoch": 2.610482126955858, + "ewc_loss": 0.049846578389406204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4923288947320543e-05, + "grad_norm": 31.414653778076172, + "learning_rate": 1e-06, + "loss": 0.3955, + "mean_token_accuracy": 0.8773996233940125, + "num_tokens": 783050355.0, + "step": 20521 + }, + { + "epoch": 2.6106093372344485, + "ewc_loss": 0.05002214014530182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.501106973795686e-05, + "grad_norm": 31.386425018310547, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.8818833827972412, + "num_tokens": 783091621.0, + "step": 20522 + }, + { + "epoch": 2.610736547513039, + "ewc_loss": 0.049748387187719345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.487419442331884e-05, + "grad_norm": 31.33160400390625, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8611030578613281, + "num_tokens": 783130471.0, + "step": 20523 + }, + { + "epoch": 2.6108637577916296, + "ewc_loss": 0.049911659210920334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4955828848760575e-05, + "grad_norm": 31.381010055541992, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8765534162521362, + "num_tokens": 783168340.0, + "step": 20524 + }, + { + "epoch": 2.61099096807022, + "ewc_loss": 0.04980586841702461, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4902934455894865e-05, + "grad_norm": 31.291934967041016, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8529634475708008, + "num_tokens": 783207279.0, + "step": 20525 + }, + { + "epoch": 2.6111181783488107, + "ewc_loss": 0.049771543592214584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488577229087241e-05, + "grad_norm": 31.239439010620117, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8644951581954956, + "num_tokens": 783243010.0, + "step": 20526 + }, + { + "epoch": 2.611245388627401, + "ewc_loss": 0.049922212958335876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496110573702026e-05, + "grad_norm": 31.2941837310791, + "learning_rate": 1e-06, + "loss": 0.4506, + "mean_token_accuracy": 0.8609124422073364, + "num_tokens": 783287548.0, + "step": 20527 + }, + { + "epoch": 2.6113725989059917, + "ewc_loss": 0.049987003207206726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4993501938297413e-05, + "grad_norm": 31.41123390197754, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8688824772834778, + "num_tokens": 783331549.0, + "step": 20528 + }, + { + "epoch": 2.6114998091845822, + "ewc_loss": 0.049990277737379074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4995139028760605e-05, + "grad_norm": 31.30232810974121, + "learning_rate": 1e-06, + "loss": 0.3927, + 
"mean_token_accuracy": 0.8818150162696838, + "num_tokens": 783369445.0, + "step": 20529 + }, + { + "epoch": 2.6116270194631728, + "ewc_loss": 0.049840252846479416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4920125724747777e-05, + "grad_norm": 31.310794830322266, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8672198057174683, + "num_tokens": 783413218.0, + "step": 20530 + }, + { + "epoch": 2.6117542297417633, + "ewc_loss": 0.049846578389406204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4923288947320543e-05, + "grad_norm": 31.31871795654297, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8707409501075745, + "num_tokens": 783449944.0, + "step": 20531 + }, + { + "epoch": 2.611881440020354, + "ewc_loss": 0.04984157532453537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4920787836890668e-05, + "grad_norm": 31.26693344116211, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8719279170036316, + "num_tokens": 783486142.0, + "step": 20532 + }, + { + "epoch": 2.612008650298944, + "ewc_loss": 0.049906644970178604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.495332228136249e-05, + "grad_norm": 31.39394760131836, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8772611618041992, + "num_tokens": 783521170.0, + "step": 20533 + }, + { + "epoch": 2.612135860577535, + "ewc_loss": 0.04987376183271408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4936880436143838e-05, + "grad_norm": 31.339200973510742, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8699769973754883, + "num_tokens": 783563902.0, + "step": 20534 + }, + { + "epoch": 2.612263070856125, + "ewc_loss": 0.049815189093351364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490759470674675e-05, + "grad_norm": 31.283369064331055, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.877055823802948, + "num_tokens": 783597655.0, + "step": 20535 + }, + { + "epoch": 
2.612390281134716, + "ewc_loss": 0.04983510822057724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4917553673731163e-05, + "grad_norm": 31.372711181640625, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8624442219734192, + "num_tokens": 783635233.0, + "step": 20536 + }, + { + "epoch": 2.612517491413306, + "ewc_loss": 0.04988710209727287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.494355067028664e-05, + "grad_norm": 31.326852798461914, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8727900385856628, + "num_tokens": 783679677.0, + "step": 20537 + }, + { + "epoch": 2.6126447016918966, + "ewc_loss": 0.0498797707259655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4939885406638496e-05, + "grad_norm": 31.33037567138672, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8792603015899658, + "num_tokens": 783720608.0, + "step": 20538 + }, + { + "epoch": 2.612771911970487, + "ewc_loss": 0.04989518225193024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4947590645751916e-05, + "grad_norm": 31.38343620300293, + "learning_rate": 1e-06, + "loss": 0.3875, + "mean_token_accuracy": 0.8818856477737427, + "num_tokens": 783753198.0, + "step": 20539 + }, + { + "epoch": 2.6128991222490776, + "ewc_loss": 0.04981909319758415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4909546482376754e-05, + "grad_norm": 31.33038330078125, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8708153963088989, + "num_tokens": 783791068.0, + "step": 20540 + }, + { + "epoch": 2.613026332527668, + "ewc_loss": 0.049878284335136414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4939141439972445e-05, + "grad_norm": 31.44661521911621, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8724439740180969, + "num_tokens": 783831319.0, + "step": 20541 + }, + { + "epoch": 2.6131535428062587, + "ewc_loss": 0.04989224672317505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4946122721303254e-05, + "grad_norm": 31.28351593017578, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8763175010681152, + "num_tokens": 783867537.0, + "step": 20542 + }, + { + "epoch": 2.613280753084849, + "ewc_loss": 0.04978935793042183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489467806299217e-05, + "grad_norm": 31.434892654418945, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8665512204170227, + "num_tokens": 783907787.0, + "step": 20543 + }, + { + "epoch": 2.6134079633634397, + "ewc_loss": 0.04988239333033562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4941196897998452e-05, + "grad_norm": 31.198280334472656, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.865455150604248, + "num_tokens": 783948558.0, + "step": 20544 + }, + { + "epoch": 2.6135351736420303, + "ewc_loss": 0.049798354506492615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489917642378714e-05, + "grad_norm": 31.42657470703125, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8893624544143677, + "num_tokens": 783986726.0, + "step": 20545 + }, + { + "epoch": 2.613662383920621, + "ewc_loss": 0.04992145672440529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4960729206213728e-05, + "grad_norm": 31.298141479492188, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8540730476379395, + "num_tokens": 784023449.0, + "step": 20546 + }, + { + "epoch": 2.6137895941992113, + "ewc_loss": 0.049823373556137085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4911687432904728e-05, + "grad_norm": 31.403526306152344, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8885079622268677, + "num_tokens": 784059529.0, + "step": 20547 + }, + { + "epoch": 2.613916804477802, + "ewc_loss": 0.04992271587252617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4961358576547354e-05, + "grad_norm": 31.209333419799805, + "learning_rate": 1e-06, + "loss": 0.4169, + 
"mean_token_accuracy": 0.8730301856994629, + "num_tokens": 784103530.0, + "step": 20548 + }, + { + "epoch": 2.6140440147563924, + "ewc_loss": 0.049861032515764236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.493051579222083e-05, + "grad_norm": 31.50396156311035, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8662209510803223, + "num_tokens": 784141721.0, + "step": 20549 + }, + { + "epoch": 2.614171225034983, + "ewc_loss": 0.04997199773788452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4985998607007787e-05, + "grad_norm": 31.4150447845459, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.872106671333313, + "num_tokens": 784175868.0, + "step": 20550 + }, + { + "epoch": 2.6142984353135734, + "ewc_loss": 0.049769215285778046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488460813765414e-05, + "grad_norm": 31.28236961364746, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8839847445487976, + "num_tokens": 784210646.0, + "step": 20551 + }, + { + "epoch": 2.614425645592164, + "ewc_loss": 0.04985430836677551, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4927154299803078e-05, + "grad_norm": 31.216047286987305, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8771795630455017, + "num_tokens": 784255588.0, + "step": 20552 + }, + { + "epoch": 2.6145528558707545, + "ewc_loss": 0.04980592802166939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4902963559725322e-05, + "grad_norm": 31.30398178100586, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8744467496871948, + "num_tokens": 784292877.0, + "step": 20553 + }, + { + "epoch": 2.614680066149345, + "ewc_loss": 0.04988371953368187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4941859010141343e-05, + "grad_norm": 31.234546661376953, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8755271434783936, + "num_tokens": 784336150.0, + "step": 20554 + }, + { + "epoch": 
2.6148072764279355, + "ewc_loss": 0.04987996444106102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4939981813076884e-05, + "grad_norm": 31.371417999267578, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8842039108276367, + "num_tokens": 784371547.0, + "step": 20555 + }, + { + "epoch": 2.6149344867065256, + "ewc_loss": 0.049871478229761124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4935739929787815e-05, + "grad_norm": 31.159067153930664, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8753154277801514, + "num_tokens": 784411473.0, + "step": 20556 + }, + { + "epoch": 2.6150616969851166, + "ewc_loss": 0.049971818923950195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4985909476527013e-05, + "grad_norm": 31.45639991760254, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8625183701515198, + "num_tokens": 784456713.0, + "step": 20557 + }, + { + "epoch": 2.6151889072637067, + "ewc_loss": 0.04995804652571678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4979022782645188e-05, + "grad_norm": 31.33882713317871, + "learning_rate": 1e-06, + "loss": 0.3721, + "mean_token_accuracy": 0.8869362473487854, + "num_tokens": 784493685.0, + "step": 20558 + }, + { + "epoch": 2.6153161175422976, + "ewc_loss": 0.04982718080282211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4913590095820837e-05, + "grad_norm": 31.47003173828125, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8835811614990234, + "num_tokens": 784534641.0, + "step": 20559 + }, + { + "epoch": 2.6154433278208877, + "ewc_loss": 0.04991136118769646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4955679691629484e-05, + "grad_norm": 31.325515747070312, + "learning_rate": 1e-06, + "loss": 0.4917, + "mean_token_accuracy": 0.8497496247291565, + "num_tokens": 784576184.0, + "step": 20560 + }, + { + "epoch": 2.6155705380994787, + "ewc_loss": 0.049711789935827255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.485589538991917e-05, + "grad_norm": 31.284574508666992, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8758790493011475, + "num_tokens": 784613851.0, + "step": 20561 + }, + { + "epoch": 2.615697748378069, + "ewc_loss": 0.0499543771147728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4977189241326414e-05, + "grad_norm": 31.41822052001953, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8753576278686523, + "num_tokens": 784654133.0, + "step": 20562 + }, + { + "epoch": 2.6158249586566593, + "ewc_loss": 0.04978533089160919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.489266626071185e-05, + "grad_norm": 31.34312629699707, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8741827011108398, + "num_tokens": 784690529.0, + "step": 20563 + }, + { + "epoch": 2.61595216893525, + "ewc_loss": 0.04982740432024002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4913702873163857e-05, + "grad_norm": 31.32984161376953, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8714939951896667, + "num_tokens": 784731972.0, + "step": 20564 + }, + { + "epoch": 2.6160793792138404, + "ewc_loss": 0.04981442913413048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490721453796141e-05, + "grad_norm": 31.52364158630371, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8686047792434692, + "num_tokens": 784773275.0, + "step": 20565 + }, + { + "epoch": 2.616206589492431, + "ewc_loss": 0.049907658249139786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4953829779406078e-05, + "grad_norm": 31.43699836730957, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8573973178863525, + "num_tokens": 784811468.0, + "step": 20566 + }, + { + "epoch": 2.6163337997710214, + "ewc_loss": 0.04975469410419464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4877346731955186e-05, + "grad_norm": 31.464305877685547, + "learning_rate": 1e-06, + "loss": 0.4091, + 
"mean_token_accuracy": 0.8743959665298462, + "num_tokens": 784850611.0, + "step": 20567 + }, + { + "epoch": 2.616461010049612, + "ewc_loss": 0.04981108382344246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4905541067710146e-05, + "grad_norm": 31.272777557373047, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8782984018325806, + "num_tokens": 784886502.0, + "step": 20568 + }, + { + "epoch": 2.6165882203282025, + "ewc_loss": 0.04974520206451416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4872600988601334e-05, + "grad_norm": 31.513809204101562, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8638677000999451, + "num_tokens": 784924489.0, + "step": 20569 + }, + { + "epoch": 2.616715430606793, + "ewc_loss": 0.0498327910900116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4916394977481104e-05, + "grad_norm": 31.42771339416504, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.877246618270874, + "num_tokens": 784957224.0, + "step": 20570 + }, + { + "epoch": 2.6168426408853835, + "ewc_loss": 0.049618933349847794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4809467504383065e-05, + "grad_norm": 31.31979751586914, + "learning_rate": 1e-06, + "loss": 0.4962, + "mean_token_accuracy": 0.8480921387672424, + "num_tokens": 784997754.0, + "step": 20571 + }, + { + "epoch": 2.616969851163974, + "ewc_loss": 0.0497315414249897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.486577068339102e-05, + "grad_norm": 31.27727508544922, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8578841686248779, + "num_tokens": 785033932.0, + "step": 20572 + }, + { + "epoch": 2.6170970614425646, + "ewc_loss": 0.049777351319789886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.488867539796047e-05, + "grad_norm": 31.45218849182129, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8690071702003479, + "num_tokens": 785073190.0, + "step": 20573 + }, + { + "epoch": 
2.617224271721155, + "ewc_loss": 0.0499090813100338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4954541004262865e-05, + "grad_norm": 31.367374420166016, + "learning_rate": 1e-06, + "loss": 0.3631, + "mean_token_accuracy": 0.8888915777206421, + "num_tokens": 785108397.0, + "step": 20574 + }, + { + "epoch": 2.6173514819997457, + "ewc_loss": 0.04975171014666557, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4875855160644278e-05, + "grad_norm": 31.334789276123047, + "learning_rate": 1e-06, + "loss": 0.3424, + "mean_token_accuracy": 0.8989391326904297, + "num_tokens": 785142671.0, + "step": 20575 + }, + { + "epoch": 2.617478692278336, + "ewc_loss": 0.049803294241428375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4901646611397155e-05, + "grad_norm": 31.319257736206055, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.871040403842926, + "num_tokens": 785183467.0, + "step": 20576 + }, + { + "epoch": 2.6176059025569267, + "ewc_loss": 0.04981386661529541, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.490693259460386e-05, + "grad_norm": 31.283220291137695, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.861781120300293, + "num_tokens": 785222072.0, + "step": 20577 + }, + { + "epoch": 2.6177331128355172, + "ewc_loss": 0.04973306134343147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4866531020961702e-05, + "grad_norm": 31.322572708129883, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8612686395645142, + "num_tokens": 785260458.0, + "step": 20578 + }, + { + "epoch": 2.6178603231141078, + "ewc_loss": 0.04990513622760773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4952567400760017e-05, + "grad_norm": 31.2110595703125, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8592525720596313, + "num_tokens": 785302434.0, + "step": 20579 + }, + { + "epoch": 2.6179875333926983, + "ewc_loss": 0.04981296882033348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4906485123210587e-05, + "grad_norm": 31.321510314941406, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8571350574493408, + "num_tokens": 785338251.0, + "step": 20580 + }, + { + "epoch": 2.6181147436712884, + "ewc_loss": 0.04992002621293068, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496001252438873e-05, + "grad_norm": 31.348268508911133, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8672398924827576, + "num_tokens": 785384789.0, + "step": 20581 + }, + { + "epoch": 2.6182419539498794, + "ewc_loss": 0.0499664843082428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4983242838061415e-05, + "grad_norm": 31.447736740112305, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8721592426300049, + "num_tokens": 785424978.0, + "step": 20582 + }, + { + "epoch": 2.6183691642284694, + "ewc_loss": 0.04990671947598457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4953360480139963e-05, + "grad_norm": 31.28996467590332, + "learning_rate": 1e-06, + "loss": 0.438, + "mean_token_accuracy": 0.8644865155220032, + "num_tokens": 785464421.0, + "step": 20583 + }, + { + "epoch": 2.6184963745070604, + "ewc_loss": 0.04985523968935013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4927619961090386e-05, + "grad_norm": 31.444355010986328, + "learning_rate": 1e-06, + "loss": 0.3525, + "mean_token_accuracy": 0.890887975692749, + "num_tokens": 785496166.0, + "step": 20584 + }, + { + "epoch": 2.6186235847856505, + "ewc_loss": 0.04990476369857788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4952381863840856e-05, + "grad_norm": 31.284008026123047, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8574063777923584, + "num_tokens": 785528504.0, + "step": 20585 + }, + { + "epoch": 2.6187507950642415, + "ewc_loss": 0.04995730519294739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.497865352779627e-05, + "grad_norm": 31.387161254882812, + "learning_rate": 1e-06, + "loss": 0.403, + 
"mean_token_accuracy": 0.8756828308105469, + "num_tokens": 785570896.0, + "step": 20586 + }, + { + "epoch": 2.6188780053428315, + "ewc_loss": 0.04993470013141632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4967350327642635e-05, + "grad_norm": 31.28329849243164, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8693022727966309, + "num_tokens": 785612464.0, + "step": 20587 + }, + { + "epoch": 2.619005215621422, + "ewc_loss": 0.04992751032114029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4963756004581228e-05, + "grad_norm": 31.397932052612305, + "learning_rate": 1e-06, + "loss": 0.4339, + "mean_token_accuracy": 0.8715469837188721, + "num_tokens": 785655500.0, + "step": 20588 + }, + { + "epoch": 2.6191324259000126, + "ewc_loss": 0.049929168075323105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496458364475984e-05, + "grad_norm": 31.410907745361328, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8741050958633423, + "num_tokens": 785691186.0, + "step": 20589 + }, + { + "epoch": 2.619259636178603, + "ewc_loss": 0.04986422136425972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.493211104592774e-05, + "grad_norm": 31.23284912109375, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8851306438446045, + "num_tokens": 785728575.0, + "step": 20590 + }, + { + "epoch": 2.6193868464571937, + "ewc_loss": 0.04993771016597748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4968854631879367e-05, + "grad_norm": 31.422962188720703, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8691464066505432, + "num_tokens": 785765782.0, + "step": 20591 + }, + { + "epoch": 2.619514056735784, + "ewc_loss": 0.04990129917860031, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.495065018592868e-05, + "grad_norm": 31.41789436340332, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8763562440872192, + "num_tokens": 785800130.0, + "step": 20592 + }, + { + "epoch": 
2.6196412670143747, + "ewc_loss": 0.049880363047122955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4940181901911274e-05, + "grad_norm": 31.298641204833984, + "learning_rate": 1e-06, + "loss": 0.331, + "mean_token_accuracy": 0.8997177481651306, + "num_tokens": 785836647.0, + "step": 20593 + }, + { + "epoch": 2.6197684772929652, + "ewc_loss": 0.049799591302871704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4899794880184345e-05, + "grad_norm": 31.293771743774414, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8658321499824524, + "num_tokens": 785873897.0, + "step": 20594 + }, + { + "epoch": 2.6198956875715558, + "ewc_loss": 0.04987288638949394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4936443878686987e-05, + "grad_norm": 31.344263076782227, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8680444359779358, + "num_tokens": 785914296.0, + "step": 20595 + }, + { + "epoch": 2.6200228978501463, + "ewc_loss": 0.04990723356604576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4953616957645863e-05, + "grad_norm": 31.466196060180664, + "learning_rate": 1e-06, + "loss": 0.4585, + "mean_token_accuracy": 0.8598965406417847, + "num_tokens": 785951426.0, + "step": 20596 + }, + { + "epoch": 2.620150108128737, + "ewc_loss": 0.049903180450201035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4951590603450313e-05, + "grad_norm": 31.214937210083008, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8745708465576172, + "num_tokens": 785989906.0, + "step": 20597 + }, + { + "epoch": 2.6202773184073274, + "ewc_loss": 0.049783192574977875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4891596694942564e-05, + "grad_norm": 31.340709686279297, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8671350479125977, + "num_tokens": 786024763.0, + "step": 20598 + }, + { + "epoch": 2.620404528685918, + "ewc_loss": 0.04991431161761284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.495715489203576e-05, + "grad_norm": 31.370990753173828, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8674942255020142, + "num_tokens": 786063055.0, + "step": 20599 + }, + { + "epoch": 2.6205317389645084, + "ewc_loss": 0.04980001971125603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4900009520933963e-05, + "grad_norm": 31.32743263244629, + "learning_rate": 1e-06, + "loss": 0.5169, + "mean_token_accuracy": 0.845887303352356, + "num_tokens": 786097675.0, + "step": 20600 + }, + { + "epoch": 2.620658949243099, + "ewc_loss": 0.04988699406385422, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4943497919593938e-05, + "grad_norm": 31.19196128845215, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8842246532440186, + "num_tokens": 786133721.0, + "step": 20601 + }, + { + "epoch": 2.6207861595216895, + "ewc_loss": 0.04989723861217499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4948620193754323e-05, + "grad_norm": 31.477340698242188, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8539613485336304, + "num_tokens": 786179615.0, + "step": 20602 + }, + { + "epoch": 2.62091336980028, + "ewc_loss": 0.05003780499100685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5018902306328528e-05, + "grad_norm": 31.327739715576172, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8861836194992065, + "num_tokens": 786214945.0, + "step": 20603 + }, + { + "epoch": 2.6210405800788705, + "ewc_loss": 0.049860358238220215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4930179279181175e-05, + "grad_norm": 31.256572723388672, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8733454346656799, + "num_tokens": 786250880.0, + "step": 20604 + }, + { + "epoch": 2.621167790357461, + "ewc_loss": 0.050093874335289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5046936571015976e-05, + "grad_norm": 31.446117401123047, + "learning_rate": 1e-06, + "loss": 0.4157, + 
"mean_token_accuracy": 0.87263023853302, + "num_tokens": 786286095.0, + "step": 20605 + }, + { + "epoch": 2.621295000636051, + "ewc_loss": 0.05007539689540863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5037697923835367e-05, + "grad_norm": 31.42039680480957, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8771759271621704, + "num_tokens": 786319547.0, + "step": 20606 + }, + { + "epoch": 2.621422210914642, + "ewc_loss": 0.049944281578063965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4972141545731574e-05, + "grad_norm": 31.20268440246582, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8714017271995544, + "num_tokens": 786358699.0, + "step": 20607 + }, + { + "epoch": 2.621549421193232, + "ewc_loss": 0.05011043697595596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505521842977032e-05, + "grad_norm": 31.46784782409668, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8752801418304443, + "num_tokens": 786399732.0, + "step": 20608 + }, + { + "epoch": 2.621676631471823, + "ewc_loss": 0.05014948546886444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5074743462027982e-05, + "grad_norm": 31.31230354309082, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8637287616729736, + "num_tokens": 786437375.0, + "step": 20609 + }, + { + "epoch": 2.6218038417504133, + "ewc_loss": 0.049973707646131516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4986853532027453e-05, + "grad_norm": 31.31760025024414, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8612936735153198, + "num_tokens": 786475538.0, + "step": 20610 + }, + { + "epoch": 2.621931052029004, + "ewc_loss": 0.050061553716659546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5030776669154875e-05, + "grad_norm": 31.228500366210938, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8741514682769775, + "num_tokens": 786516802.0, + "step": 20611 + }, + { + "epoch": 
2.6220582623075943, + "ewc_loss": 0.050029534846544266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5014767743414268e-05, + "grad_norm": 31.229713439941406, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8718409538269043, + "num_tokens": 786558470.0, + "step": 20612 + }, + { + "epoch": 2.622185472586185, + "ewc_loss": 0.050042856484651566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5021428882610053e-05, + "grad_norm": 31.346149444580078, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8668045401573181, + "num_tokens": 786601163.0, + "step": 20613 + }, + { + "epoch": 2.6223126828647754, + "ewc_loss": 0.05004018917679787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.502009374438785e-05, + "grad_norm": 31.239757537841797, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8792137503623962, + "num_tokens": 786636741.0, + "step": 20614 + }, + { + "epoch": 2.622439893143366, + "ewc_loss": 0.05004090443253517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.502045208530035e-05, + "grad_norm": 31.309104919433594, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8600978255271912, + "num_tokens": 786672931.0, + "step": 20615 + }, + { + "epoch": 2.6225671034219564, + "ewc_loss": 0.050039585679769516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5019793611136265e-05, + "grad_norm": 31.29212760925293, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8840693235397339, + "num_tokens": 786709310.0, + "step": 20616 + }, + { + "epoch": 2.622694313700547, + "ewc_loss": 0.05008586496114731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5042932975338772e-05, + "grad_norm": 31.41919708251953, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8780708909034729, + "num_tokens": 786747113.0, + "step": 20617 + }, + { + "epoch": 2.6228215239791375, + "ewc_loss": 0.05011247470974922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5056237063836306e-05, + "grad_norm": 31.271495819091797, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8704267740249634, + "num_tokens": 786782033.0, + "step": 20618 + }, + { + "epoch": 2.622948734257728, + "ewc_loss": 0.04998311400413513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4991557438625023e-05, + "grad_norm": 31.322044372558594, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8732761144638062, + "num_tokens": 786827229.0, + "step": 20619 + }, + { + "epoch": 2.6230759445363185, + "ewc_loss": 0.050051432102918625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.502571624063421e-05, + "grad_norm": 31.36887550354004, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.8793833255767822, + "num_tokens": 786861722.0, + "step": 20620 + }, + { + "epoch": 2.623203154814909, + "ewc_loss": 0.049964770674705505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4982386094052345e-05, + "grad_norm": 31.348403930664062, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8746710419654846, + "num_tokens": 786896202.0, + "step": 20621 + }, + { + "epoch": 2.6233303650934996, + "ewc_loss": 0.04997865855693817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498932917660568e-05, + "grad_norm": 31.325927734375, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8741745948791504, + "num_tokens": 786932154.0, + "step": 20622 + }, + { + "epoch": 2.62345757537209, + "ewc_loss": 0.05000506713986397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.500253322068602e-05, + "grad_norm": 31.334897994995117, + "learning_rate": 1e-06, + "loss": 0.4482, + "mean_token_accuracy": 0.8635348081588745, + "num_tokens": 786972184.0, + "step": 20623 + }, + { + "epoch": 2.6235847856506807, + "ewc_loss": 0.05006682872772217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5033414203790016e-05, + "grad_norm": 31.488697052001953, + "learning_rate": 1e-06, + "loss": 0.442, + 
"mean_token_accuracy": 0.8656305074691772, + "num_tokens": 787015514.0, + "step": 20624 + }, + { + "epoch": 2.623711995929271, + "ewc_loss": 0.04996446520090103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4982233298942447e-05, + "grad_norm": 31.296720504760742, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8717966079711914, + "num_tokens": 787054023.0, + "step": 20625 + }, + { + "epoch": 2.6238392062078617, + "ewc_loss": 0.04996556416153908, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4982782633742318e-05, + "grad_norm": 31.35033416748047, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8694770932197571, + "num_tokens": 787088858.0, + "step": 20626 + }, + { + "epoch": 2.6239664164864522, + "ewc_loss": 0.050013381987810135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5006691430462524e-05, + "grad_norm": 31.38648796081543, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8752139806747437, + "num_tokens": 787124805.0, + "step": 20627 + }, + { + "epoch": 2.6240936267650428, + "ewc_loss": 0.05005080625414848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.50254033744568e-05, + "grad_norm": 31.31218147277832, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8838026523590088, + "num_tokens": 787164922.0, + "step": 20628 + }, + { + "epoch": 2.6242208370436333, + "ewc_loss": 0.04994295910000801, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4971479433588684e-05, + "grad_norm": 31.347986221313477, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8837720155715942, + "num_tokens": 787204936.0, + "step": 20629 + }, + { + "epoch": 2.624348047322224, + "ewc_loss": 0.04998082295060158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499041147530079e-05, + "grad_norm": 31.247825622558594, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8839104771614075, + "num_tokens": 787238292.0, + "step": 20630 + }, + { + "epoch": 
2.624475257600814, + "ewc_loss": 0.04984607920050621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4923039745772257e-05, + "grad_norm": 31.306612014770508, + "learning_rate": 1e-06, + "loss": 0.3555, + "mean_token_accuracy": 0.8908146619796753, + "num_tokens": 787269054.0, + "step": 20631 + }, + { + "epoch": 2.624602467879405, + "ewc_loss": 0.05004756525158882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.502378265489824e-05, + "grad_norm": 31.37022590637207, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.87082439661026, + "num_tokens": 787310022.0, + "step": 20632 + }, + { + "epoch": 2.624729678157995, + "ewc_loss": 0.049885932356119156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4942966774688102e-05, + "grad_norm": 31.181161880493164, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8613642454147339, + "num_tokens": 787350917.0, + "step": 20633 + }, + { + "epoch": 2.624856888436586, + "ewc_loss": 0.04996437951922417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498218964319676e-05, + "grad_norm": 31.23265838623047, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8801848292350769, + "num_tokens": 787388187.0, + "step": 20634 + }, + { + "epoch": 2.624984098715176, + "ewc_loss": 0.05003898963332176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5019495296874084e-05, + "grad_norm": 31.334157943725586, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8777841925621033, + "num_tokens": 787424998.0, + "step": 20635 + }, + { + "epoch": 2.6251113089937665, + "ewc_loss": 0.049963582307100296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4981791284517385e-05, + "grad_norm": 31.257823944091797, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8846652507781982, + "num_tokens": 787465456.0, + "step": 20636 + }, + { + "epoch": 2.625238519272357, + "ewc_loss": 0.050053756684064865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5026878574863076e-05, + "grad_norm": 31.332969665527344, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8713077902793884, + "num_tokens": 787504797.0, + "step": 20637 + }, + { + "epoch": 2.6253657295509476, + "ewc_loss": 0.05010855197906494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5054276193259284e-05, + "grad_norm": 31.44758415222168, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8819471001625061, + "num_tokens": 787542911.0, + "step": 20638 + }, + { + "epoch": 2.625492939829538, + "ewc_loss": 0.04995670169591904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.497835157555528e-05, + "grad_norm": 31.314308166503906, + "learning_rate": 1e-06, + "loss": 0.4406, + "mean_token_accuracy": 0.862859845161438, + "num_tokens": 787576143.0, + "step": 20639 + }, + { + "epoch": 2.6256201501081287, + "ewc_loss": 0.04993712902069092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4968565412564203e-05, + "grad_norm": 31.38602066040039, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8744800686836243, + "num_tokens": 787619912.0, + "step": 20640 + }, + { + "epoch": 2.625747360386719, + "ewc_loss": 0.04999854415655136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499927177268546e-05, + "grad_norm": 31.231796264648438, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.8631765842437744, + "num_tokens": 787658181.0, + "step": 20641 + }, + { + "epoch": 2.6258745706653097, + "ewc_loss": 0.050045229494571686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5022614863701165e-05, + "grad_norm": 31.42742156982422, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8635324239730835, + "num_tokens": 787694889.0, + "step": 20642 + }, + { + "epoch": 2.6260017809439002, + "ewc_loss": 0.05008485168218613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5042425477295183e-05, + "grad_norm": 31.39251708984375, + "learning_rate": 1e-06, + "loss": 0.3748, + 
"mean_token_accuracy": 0.8860383033752441, + "num_tokens": 787737633.0, + "step": 20643 + }, + { + "epoch": 2.6261289912224908, + "ewc_loss": 0.04996228963136673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4981145543279126e-05, + "grad_norm": 31.48023796081543, + "learning_rate": 1e-06, + "loss": 0.4627, + "mean_token_accuracy": 0.8596454858779907, + "num_tokens": 787779069.0, + "step": 20644 + }, + { + "epoch": 2.6262562015010813, + "ewc_loss": 0.04997750744223595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498875437595416e-05, + "grad_norm": 31.24146842956543, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8602248430252075, + "num_tokens": 787815544.0, + "step": 20645 + }, + { + "epoch": 2.626383411779672, + "ewc_loss": 0.04992997646331787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496498746040743e-05, + "grad_norm": 31.37009048461914, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8666108846664429, + "num_tokens": 787852684.0, + "step": 20646 + }, + { + "epoch": 2.6265106220582624, + "ewc_loss": 0.049971744418144226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4985873096738942e-05, + "grad_norm": 31.254512786865234, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8683080077171326, + "num_tokens": 787893452.0, + "step": 20647 + }, + { + "epoch": 2.626637832336853, + "ewc_loss": 0.04996323958039284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4981620299513452e-05, + "grad_norm": 31.41795539855957, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8711732029914856, + "num_tokens": 787929430.0, + "step": 20648 + }, + { + "epoch": 2.6267650426154434, + "ewc_loss": 0.05004795640707016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.502397728676442e-05, + "grad_norm": 31.260974884033203, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8694154620170593, + "num_tokens": 787969768.0, + "step": 20649 + }, + { + "epoch": 
2.626892252894034, + "ewc_loss": 0.0499914325773716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499571564840153e-05, + "grad_norm": 31.472131729125977, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8720529079437256, + "num_tokens": 788009605.0, + "step": 20650 + }, + { + "epoch": 2.6270194631726245, + "ewc_loss": 0.05001642554998398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.500821210560389e-05, + "grad_norm": 31.31256866455078, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8842185735702515, + "num_tokens": 788044652.0, + "step": 20651 + }, + { + "epoch": 2.627146673451215, + "ewc_loss": 0.04990960285067558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.495480111974757e-05, + "grad_norm": 31.4886417388916, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8465428352355957, + "num_tokens": 788085981.0, + "step": 20652 + }, + { + "epoch": 2.6272738837298055, + "ewc_loss": 0.05002019181847572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5010096578625962e-05, + "grad_norm": 31.458881378173828, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8690289855003357, + "num_tokens": 788132196.0, + "step": 20653 + }, + { + "epoch": 2.6274010940083956, + "ewc_loss": 0.04993180185556412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.496590059308801e-05, + "grad_norm": 31.319110870361328, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8850753307342529, + "num_tokens": 788160099.0, + "step": 20654 + }, + { + "epoch": 2.6275283042869866, + "ewc_loss": 0.04994684085249901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4973420295282267e-05, + "grad_norm": 31.388765335083008, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8704718947410583, + "num_tokens": 788197264.0, + "step": 20655 + }, + { + "epoch": 2.6276555145655767, + "ewc_loss": 0.04998233541846275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4991168174892664e-05, + "grad_norm": 31.365480422973633, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8845402598381042, + "num_tokens": 788232900.0, + "step": 20656 + }, + { + "epoch": 2.6277827248441676, + "ewc_loss": 0.049954839050769806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4977420252980664e-05, + "grad_norm": 31.412391662597656, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8666781187057495, + "num_tokens": 788273720.0, + "step": 20657 + }, + { + "epoch": 2.6279099351227577, + "ewc_loss": 0.04987489804625511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4937449779827148e-05, + "grad_norm": 31.330595016479492, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8749783635139465, + "num_tokens": 788308425.0, + "step": 20658 + }, + { + "epoch": 2.6280371454013487, + "ewc_loss": 0.04994974657893181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.49748736678157e-05, + "grad_norm": 31.28536605834961, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8643490076065063, + "num_tokens": 788343701.0, + "step": 20659 + }, + { + "epoch": 2.628164355679939, + "ewc_loss": 0.0499846413731575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4992321414174512e-05, + "grad_norm": 31.405765533447266, + "learning_rate": 1e-06, + "loss": 0.484, + "mean_token_accuracy": 0.8565992116928101, + "num_tokens": 788372789.0, + "step": 20660 + }, + { + "epoch": 2.6282915659585293, + "ewc_loss": 0.05005564168095589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5027820811374113e-05, + "grad_norm": 31.3480281829834, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.876535177230835, + "num_tokens": 788408422.0, + "step": 20661 + }, + { + "epoch": 2.62841877623712, + "ewc_loss": 0.049987420439720154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499371112207882e-05, + "grad_norm": 31.37435531616211, + "learning_rate": 1e-06, + "loss": 0.413, + 
"mean_token_accuracy": 0.8742656707763672, + "num_tokens": 788446113.0, + "step": 20662 + }, + { + "epoch": 2.6285459865157104, + "ewc_loss": 0.050108667463064194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5054334400920197e-05, + "grad_norm": 31.355243682861328, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8659039735794067, + "num_tokens": 788485009.0, + "step": 20663 + }, + { + "epoch": 2.628673196794301, + "ewc_loss": 0.0500250868499279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.501254311937373e-05, + "grad_norm": 31.369915008544922, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8750369548797607, + "num_tokens": 788523446.0, + "step": 20664 + }, + { + "epoch": 2.6288004070728914, + "ewc_loss": 0.05005747079849243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5028735763044097e-05, + "grad_norm": 31.158695220947266, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8807862997055054, + "num_tokens": 788557423.0, + "step": 20665 + }, + { + "epoch": 2.628927617351482, + "ewc_loss": 0.050020452588796616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5010225726873614e-05, + "grad_norm": 31.479751586914062, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8693175911903381, + "num_tokens": 788588177.0, + "step": 20666 + }, + { + "epoch": 2.6290548276300725, + "ewc_loss": 0.05019634962081909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509817568352446e-05, + "grad_norm": 31.332950592041016, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8734058737754822, + "num_tokens": 788624877.0, + "step": 20667 + }, + { + "epoch": 2.629182037908663, + "ewc_loss": 0.05008365586400032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5041827029781416e-05, + "grad_norm": 31.32478904724121, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8821263313293457, + "num_tokens": 788662532.0, + "step": 20668 + }, + { + "epoch": 
2.6293092481872535, + "ewc_loss": 0.0501377172768116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.506885903130751e-05, + "grad_norm": 31.33829689025879, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8727041482925415, + "num_tokens": 788702734.0, + "step": 20669 + }, + { + "epoch": 2.629436458465844, + "ewc_loss": 0.050120819360017776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.506040982552804e-05, + "grad_norm": 31.464021682739258, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8723322153091431, + "num_tokens": 788738041.0, + "step": 20670 + }, + { + "epoch": 2.6295636687444346, + "ewc_loss": 0.050146739929914474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5073370125028305e-05, + "grad_norm": 31.387704849243164, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8729965090751648, + "num_tokens": 788772998.0, + "step": 20671 + }, + { + "epoch": 2.629690879023025, + "ewc_loss": 0.05001210421323776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.500605296518188e-05, + "grad_norm": 31.26780891418457, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8553246259689331, + "num_tokens": 788814310.0, + "step": 20672 + }, + { + "epoch": 2.6298180893016156, + "ewc_loss": 0.05013163387775421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5065817681024782e-05, + "grad_norm": 31.32332992553711, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8757501840591431, + "num_tokens": 788852811.0, + "step": 20673 + }, + { + "epoch": 2.629945299580206, + "ewc_loss": 0.05012461170554161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5062305212486535e-05, + "grad_norm": 31.2862491607666, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8807273507118225, + "num_tokens": 788888756.0, + "step": 20674 + }, + { + "epoch": 2.6300725098587967, + "ewc_loss": 0.0501844584941864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5092229407164268e-05, + "grad_norm": 31.52567481994629, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8764625787734985, + "num_tokens": 788926292.0, + "step": 20675 + }, + { + "epoch": 2.6301997201373872, + "ewc_loss": 0.05018821358680725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5094106604228728e-05, + "grad_norm": 31.392576217651367, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8811246752738953, + "num_tokens": 788965625.0, + "step": 20676 + }, + { + "epoch": 2.6303269304159778, + "ewc_loss": 0.050055358558893204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5027678930200636e-05, + "grad_norm": 31.391765594482422, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8766129612922668, + "num_tokens": 789007390.0, + "step": 20677 + }, + { + "epoch": 2.6304541406945683, + "ewc_loss": 0.05016734078526497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5083671062020585e-05, + "grad_norm": 31.45645523071289, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8761029839515686, + "num_tokens": 789047940.0, + "step": 20678 + }, + { + "epoch": 2.6305813509731584, + "ewc_loss": 0.05002109333872795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5010545869008638e-05, + "grad_norm": 31.327741622924805, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8719158172607422, + "num_tokens": 789086321.0, + "step": 20679 + }, + { + "epoch": 2.6307085612517493, + "ewc_loss": 0.05017973855137825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5089868358918466e-05, + "grad_norm": 31.511272430419922, + "learning_rate": 1e-06, + "loss": 0.3779, + "mean_token_accuracy": 0.8838785886764526, + "num_tokens": 789121757.0, + "step": 20680 + }, + { + "epoch": 2.6308357715303394, + "ewc_loss": 0.05014818534255028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5074092263821512e-05, + "grad_norm": 31.311243057250977, + "learning_rate": 1e-06, + "loss": 0.3753, + 
"mean_token_accuracy": 0.8855526447296143, + "num_tokens": 789160076.0, + "step": 20681 + }, + { + "epoch": 2.6309629818089304, + "ewc_loss": 0.04998768866062164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499384390830528e-05, + "grad_norm": 31.38672637939453, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8637180328369141, + "num_tokens": 789197080.0, + "step": 20682 + }, + { + "epoch": 2.6310901920875205, + "ewc_loss": 0.05015094205737114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.50754710577894e-05, + "grad_norm": 31.244827270507812, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8675229549407959, + "num_tokens": 789249876.0, + "step": 20683 + }, + { + "epoch": 2.6312174023661115, + "ewc_loss": 0.050050459802150726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5025230570463464e-05, + "grad_norm": 31.33654022216797, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8715094327926636, + "num_tokens": 789290146.0, + "step": 20684 + }, + { + "epoch": 2.6313446126447015, + "ewc_loss": 0.050081364810466766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5040682885446586e-05, + "grad_norm": 31.244709014892578, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8758320808410645, + "num_tokens": 789335610.0, + "step": 20685 + }, + { + "epoch": 2.631471822923292, + "ewc_loss": 0.05005127191543579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5025636205100454e-05, + "grad_norm": 31.495159149169922, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8678233623504639, + "num_tokens": 789371956.0, + "step": 20686 + }, + { + "epoch": 2.6315990332018826, + "ewc_loss": 0.050106581300497055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5053290301002562e-05, + "grad_norm": 31.290447235107422, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.868967592716217, + "num_tokens": 789415257.0, + "step": 20687 + }, + { + "epoch": 
2.631726243480473, + "ewc_loss": 0.0499461367726326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4973069230327383e-05, + "grad_norm": 31.3070068359375, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8729876279830933, + "num_tokens": 789452533.0, + "step": 20688 + }, + { + "epoch": 2.6318534537590637, + "ewc_loss": 0.05016947165131569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.508473517082166e-05, + "grad_norm": 31.313318252563477, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8696281909942627, + "num_tokens": 789488780.0, + "step": 20689 + }, + { + "epoch": 2.631980664037654, + "ewc_loss": 0.050029754638671875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.501487688277848e-05, + "grad_norm": 31.317951202392578, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8700233697891235, + "num_tokens": 789528169.0, + "step": 20690 + }, + { + "epoch": 2.6321078743162447, + "ewc_loss": 0.05008448287844658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5042241759365425e-05, + "grad_norm": 31.348054885864258, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.865170955657959, + "num_tokens": 789568382.0, + "step": 20691 + }, + { + "epoch": 2.6322350845948352, + "ewc_loss": 0.050001367926597595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5000683308462612e-05, + "grad_norm": 31.35861587524414, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8699181079864502, + "num_tokens": 789606895.0, + "step": 20692 + }, + { + "epoch": 2.6323622948734258, + "ewc_loss": 0.050130654126405716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5065326553885825e-05, + "grad_norm": 31.397836685180664, + "learning_rate": 1e-06, + "loss": 0.4673, + "mean_token_accuracy": 0.8568916320800781, + "num_tokens": 789649106.0, + "step": 20693 + }, + { + "epoch": 2.6324895051520163, + "ewc_loss": 0.05002773925662041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5013869162648916e-05, + "grad_norm": 31.260557174682617, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8763845562934875, + "num_tokens": 789689449.0, + "step": 20694 + }, + { + "epoch": 2.632616715430607, + "ewc_loss": 0.050111930817365646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5055966034415178e-05, + "grad_norm": 31.339130401611328, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8828601241111755, + "num_tokens": 789726179.0, + "step": 20695 + }, + { + "epoch": 2.6327439257091974, + "ewc_loss": 0.05002174153923988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5010871468111873e-05, + "grad_norm": 31.3236083984375, + "learning_rate": 1e-06, + "loss": 0.4641, + "mean_token_accuracy": 0.85489422082901, + "num_tokens": 789764858.0, + "step": 20696 + }, + { + "epoch": 2.632871135987788, + "ewc_loss": 0.05017562210559845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5087811081903055e-05, + "grad_norm": 31.39277458190918, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8591573238372803, + "num_tokens": 789799196.0, + "step": 20697 + }, + { + "epoch": 2.6329983462663784, + "ewc_loss": 0.05008319020271301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5041596018127166e-05, + "grad_norm": 31.32400131225586, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.870785653591156, + "num_tokens": 789840498.0, + "step": 20698 + }, + { + "epoch": 2.633125556544969, + "ewc_loss": 0.05006970092654228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5034851205418818e-05, + "grad_norm": 31.376237869262695, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8750247359275818, + "num_tokens": 789874269.0, + "step": 20699 + }, + { + "epoch": 2.6332527668235595, + "ewc_loss": 0.05004095286130905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5020475732162595e-05, + "grad_norm": 31.35121726989746, + "learning_rate": 1e-06, + "loss": 0.4144, + 
"mean_token_accuracy": 0.8733506202697754, + "num_tokens": 789914938.0, + "step": 20700 + }, + { + "epoch": 2.63337997710215, + "ewc_loss": 0.04994095861911774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4970478989416733e-05, + "grad_norm": 31.358539581298828, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8729230761528015, + "num_tokens": 789959219.0, + "step": 20701 + }, + { + "epoch": 2.6335071873807405, + "ewc_loss": 0.05004672706127167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5023364287335426e-05, + "grad_norm": 31.532323837280273, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8656490445137024, + "num_tokens": 789993707.0, + "step": 20702 + }, + { + "epoch": 2.633634397659331, + "ewc_loss": 0.05004599317908287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.502299685147591e-05, + "grad_norm": 31.511350631713867, + "learning_rate": 1e-06, + "loss": 0.4781, + "mean_token_accuracy": 0.8526803851127625, + "num_tokens": 790034285.0, + "step": 20703 + }, + { + "epoch": 2.633761607937921, + "ewc_loss": 0.05001196637749672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5005983843584545e-05, + "grad_norm": 31.446252822875977, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8747827410697937, + "num_tokens": 790069714.0, + "step": 20704 + }, + { + "epoch": 2.633888818216512, + "ewc_loss": 0.04991751164197922, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.495875560271088e-05, + "grad_norm": 31.231155395507812, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8709239959716797, + "num_tokens": 790110733.0, + "step": 20705 + }, + { + "epoch": 2.634016028495102, + "ewc_loss": 0.049969498068094254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498474896128755e-05, + "grad_norm": 31.40314292907715, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8758978247642517, + "num_tokens": 790144468.0, + "step": 20706 + }, + { + "epoch": 
2.634143238773693, + "ewc_loss": 0.04999814182519913, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499907168385107e-05, + "grad_norm": 31.36087417602539, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8696526885032654, + "num_tokens": 790186881.0, + "step": 20707 + }, + { + "epoch": 2.6342704490522832, + "ewc_loss": 0.04992617294192314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4963086616480723e-05, + "grad_norm": 31.31413459777832, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8788478374481201, + "num_tokens": 790225083.0, + "step": 20708 + }, + { + "epoch": 2.6343976593308738, + "ewc_loss": 0.05001528188586235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5007640942931175e-05, + "grad_norm": 31.362255096435547, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8719167709350586, + "num_tokens": 790266822.0, + "step": 20709 + }, + { + "epoch": 2.6345248696094643, + "ewc_loss": 0.04991193488240242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4955967091955245e-05, + "grad_norm": 31.275287628173828, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.869558572769165, + "num_tokens": 790305948.0, + "step": 20710 + }, + { + "epoch": 2.634652079888055, + "ewc_loss": 0.05005698651075363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5028493837453425e-05, + "grad_norm": 31.483675003051758, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8736035823822021, + "num_tokens": 790341639.0, + "step": 20711 + }, + { + "epoch": 2.6347792901666454, + "ewc_loss": 0.05003867670893669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.501933886378538e-05, + "grad_norm": 31.34550666809082, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8807863593101501, + "num_tokens": 790377442.0, + "step": 20712 + }, + { + "epoch": 2.634906500445236, + "ewc_loss": 0.04998013749718666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.4990069505292922e-05, + "grad_norm": 31.358129501342773, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8717570304870605, + "num_tokens": 790414911.0, + "step": 20713 + }, + { + "epoch": 2.6350337107238264, + "ewc_loss": 0.05000835284590721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.500417576811742e-05, + "grad_norm": 31.42652702331543, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.8707390427589417, + "num_tokens": 790453968.0, + "step": 20714 + }, + { + "epoch": 2.635160921002417, + "ewc_loss": 0.05002950504422188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.501475319149904e-05, + "grad_norm": 31.419536590576172, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8680814504623413, + "num_tokens": 790493216.0, + "step": 20715 + }, + { + "epoch": 2.6352881312810075, + "ewc_loss": 0.04999810457229614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4999051674967632e-05, + "grad_norm": 31.39472007751465, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8674060106277466, + "num_tokens": 790529380.0, + "step": 20716 + }, + { + "epoch": 2.635415341559598, + "ewc_loss": 0.05005497857928276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5027489755302668e-05, + "grad_norm": 31.435224533081055, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8674548864364624, + "num_tokens": 790569568.0, + "step": 20717 + }, + { + "epoch": 2.6355425518381885, + "ewc_loss": 0.04994957894086838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4974789994303137e-05, + "grad_norm": 31.356719970703125, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8780563473701477, + "num_tokens": 790602269.0, + "step": 20718 + }, + { + "epoch": 2.635669762116779, + "ewc_loss": 0.04999682679772377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4998413209686987e-05, + "grad_norm": 31.532012939453125, + "learning_rate": 1e-06, + "loss": 0.457, + 
"mean_token_accuracy": 0.8609719276428223, + "num_tokens": 790639723.0, + "step": 20719 + }, + { + "epoch": 2.6357969723953696, + "ewc_loss": 0.050065577030181885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5032788471435197e-05, + "grad_norm": 31.4390869140625, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8583664894104004, + "num_tokens": 790676103.0, + "step": 20720 + }, + { + "epoch": 2.63592418267396, + "ewc_loss": 0.04987804591655731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4939023205661215e-05, + "grad_norm": 31.47170066833496, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8744218945503235, + "num_tokens": 790712118.0, + "step": 20721 + }, + { + "epoch": 2.6360513929525506, + "ewc_loss": 0.05003177747130394, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5015888240886852e-05, + "grad_norm": 31.394336700439453, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8717121481895447, + "num_tokens": 790747079.0, + "step": 20722 + }, + { + "epoch": 2.636178603231141, + "ewc_loss": 0.04995543137192726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.497771492926404e-05, + "grad_norm": 31.362356185913086, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8619005680084229, + "num_tokens": 790781741.0, + "step": 20723 + }, + { + "epoch": 2.6363058135097317, + "ewc_loss": 0.049983371049165726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499168476788327e-05, + "grad_norm": 31.479616165161133, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8705289363861084, + "num_tokens": 790820243.0, + "step": 20724 + }, + { + "epoch": 2.6364330237883222, + "ewc_loss": 0.05010335147380829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5051675038412213e-05, + "grad_norm": 31.43091583251953, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8863999843597412, + "num_tokens": 790851492.0, + "step": 20725 + }, + { + "epoch": 
2.6365602340669128, + "ewc_loss": 0.049899544566869736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4949771614046767e-05, + "grad_norm": 31.444948196411133, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8769963979721069, + "num_tokens": 790882293.0, + "step": 20726 + }, + { + "epoch": 2.636687444345503, + "ewc_loss": 0.050034817308187485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5017408916028216e-05, + "grad_norm": 31.365808486938477, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8688328266143799, + "num_tokens": 790922188.0, + "step": 20727 + }, + { + "epoch": 2.636814654624094, + "ewc_loss": 0.049979355186223984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498967842257116e-05, + "grad_norm": 31.42669677734375, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8814042210578918, + "num_tokens": 790960675.0, + "step": 20728 + }, + { + "epoch": 2.636941864902684, + "ewc_loss": 0.050151851028203964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5075925805140287e-05, + "grad_norm": 31.47294807434082, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.880885124206543, + "num_tokens": 790999352.0, + "step": 20729 + }, + { + "epoch": 2.637069075181275, + "ewc_loss": 0.05011529102921486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505764496163465e-05, + "grad_norm": 31.544578552246094, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8731157183647156, + "num_tokens": 791035898.0, + "step": 20730 + }, + { + "epoch": 2.637196285459865, + "ewc_loss": 0.05006406083703041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5032029952853918e-05, + "grad_norm": 31.308692932128906, + "learning_rate": 1e-06, + "loss": 0.4915, + "mean_token_accuracy": 0.8480468988418579, + "num_tokens": 791079731.0, + "step": 20731 + }, + { + "epoch": 2.637323495738456, + "ewc_loss": 0.0500478632748127, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5023931812029332e-05, + "grad_norm": 31.416854858398438, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8851344585418701, + "num_tokens": 791118565.0, + "step": 20732 + }, + { + "epoch": 2.637450706017046, + "ewc_loss": 0.05011750012636185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5058750907192007e-05, + "grad_norm": 31.45265007019043, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8733551502227783, + "num_tokens": 791154876.0, + "step": 20733 + }, + { + "epoch": 2.6375779162956365, + "ewc_loss": 0.049985893070697784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4992947146529332e-05, + "grad_norm": 31.296005249023438, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8720468878746033, + "num_tokens": 791196003.0, + "step": 20734 + }, + { + "epoch": 2.637705126574227, + "ewc_loss": 0.050102997571229935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505149859644007e-05, + "grad_norm": 31.459339141845703, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8704713582992554, + "num_tokens": 791236462.0, + "step": 20735 + }, + { + "epoch": 2.6378323368528176, + "ewc_loss": 0.05019139498472214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509569821995683e-05, + "grad_norm": 31.544889450073242, + "learning_rate": 1e-06, + "loss": 0.4536, + "mean_token_accuracy": 0.858456015586853, + "num_tokens": 791271918.0, + "step": 20736 + }, + { + "epoch": 2.637959547131408, + "ewc_loss": 0.05002492293715477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5012461264850572e-05, + "grad_norm": 31.318925857543945, + "learning_rate": 1e-06, + "loss": 0.3776, + "mean_token_accuracy": 0.884218692779541, + "num_tokens": 791309813.0, + "step": 20737 + }, + { + "epoch": 2.6380867574099987, + "ewc_loss": 0.05014223977923393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507112003513612e-05, + "grad_norm": 31.413389205932617, + "learning_rate": 1e-06, + "loss": 0.404, + 
"mean_token_accuracy": 0.8804415464401245, + "num_tokens": 791338854.0, + "step": 20738 + }, + { + "epoch": 2.638213967688589, + "ewc_loss": 0.05011150240898132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505575139366556e-05, + "grad_norm": 31.34284782409668, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8769718408584595, + "num_tokens": 791376580.0, + "step": 20739 + }, + { + "epoch": 2.6383411779671797, + "ewc_loss": 0.050106801092624664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505340125935618e-05, + "grad_norm": 31.437835693359375, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8680739402770996, + "num_tokens": 791415931.0, + "step": 20740 + }, + { + "epoch": 2.6384683882457702, + "ewc_loss": 0.050163570791482925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5081784770009108e-05, + "grad_norm": 31.48789405822754, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8677542805671692, + "num_tokens": 791448201.0, + "step": 20741 + }, + { + "epoch": 2.6385955985243608, + "ewc_loss": 0.050101686269044876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5050843760254793e-05, + "grad_norm": 31.432823181152344, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8642998933792114, + "num_tokens": 791489183.0, + "step": 20742 + }, + { + "epoch": 2.6387228088029513, + "ewc_loss": 0.050072118639945984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5036059014382772e-05, + "grad_norm": 31.473400115966797, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8735131025314331, + "num_tokens": 791528414.0, + "step": 20743 + }, + { + "epoch": 2.638850019081542, + "ewc_loss": 0.05009998753666878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5049994292203337e-05, + "grad_norm": 31.302486419677734, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.8906360864639282, + "num_tokens": 791566379.0, + "step": 20744 + }, + { + "epoch": 
2.6389772293601323, + "ewc_loss": 0.05004329979419708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5021650799317285e-05, + "grad_norm": 31.427204132080078, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8755803108215332, + "num_tokens": 791599789.0, + "step": 20745 + }, + { + "epoch": 2.639104439638723, + "ewc_loss": 0.05020695552229881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5103478037635796e-05, + "grad_norm": 31.373628616333008, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8582271337509155, + "num_tokens": 791642333.0, + "step": 20746 + }, + { + "epoch": 2.6392316499173134, + "ewc_loss": 0.05010754615068436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5053772333194502e-05, + "grad_norm": 31.234844207763672, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8730573654174805, + "num_tokens": 791687190.0, + "step": 20747 + }, + { + "epoch": 2.639358860195904, + "ewc_loss": 0.050248462706804276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512423088774085e-05, + "grad_norm": 31.483558654785156, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8571844696998596, + "num_tokens": 791722806.0, + "step": 20748 + }, + { + "epoch": 2.6394860704744945, + "ewc_loss": 0.05015823617577553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507911813154351e-05, + "grad_norm": 31.32720947265625, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8574661016464233, + "num_tokens": 791757726.0, + "step": 20749 + }, + { + "epoch": 2.639613280753085, + "ewc_loss": 0.05015669763088226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507834869902581e-05, + "grad_norm": 31.42087173461914, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8794668316841125, + "num_tokens": 791795160.0, + "step": 20750 + }, + { + "epoch": 2.6397404910316755, + "ewc_loss": 0.05014345422387123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5071727577596903e-05, + "grad_norm": 31.292724609375, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8835371732711792, + "num_tokens": 791831682.0, + "step": 20751 + }, + { + "epoch": 2.6398677013102656, + "ewc_loss": 0.0501105897128582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505529482732527e-05, + "grad_norm": 31.43195915222168, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8816217184066772, + "num_tokens": 791863230.0, + "step": 20752 + }, + { + "epoch": 2.6399949115888566, + "ewc_loss": 0.050166331231594086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.50831653829664e-05, + "grad_norm": 31.304346084594727, + "learning_rate": 1e-06, + "loss": 0.3705, + "mean_token_accuracy": 0.8847556710243225, + "num_tokens": 791904323.0, + "step": 20753 + }, + { + "epoch": 2.6401221218674467, + "ewc_loss": 0.04997965693473816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498982757970225e-05, + "grad_norm": 31.33188247680664, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8650907278060913, + "num_tokens": 791951836.0, + "step": 20754 + }, + { + "epoch": 2.6402493321460376, + "ewc_loss": 0.05017821490764618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5089108021347784e-05, + "grad_norm": 31.371091842651367, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8745098114013672, + "num_tokens": 791989782.0, + "step": 20755 + }, + { + "epoch": 2.6403765424246277, + "ewc_loss": 0.05012812092900276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5064060537260957e-05, + "grad_norm": 31.347227096557617, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.874239444732666, + "num_tokens": 792024038.0, + "step": 20756 + }, + { + "epoch": 2.6405037527032187, + "ewc_loss": 0.05011608824133873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5058043320314027e-05, + "grad_norm": 31.360139846801758, + "learning_rate": 1e-06, + "loss": 0.4134, + 
"mean_token_accuracy": 0.8719987869262695, + "num_tokens": 792067893.0, + "step": 20757 + }, + { + "epoch": 2.6406309629818088, + "ewc_loss": 0.05015777051448822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5078885300899856e-05, + "grad_norm": 31.401830673217773, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.881530225276947, + "num_tokens": 792108593.0, + "step": 20758 + }, + { + "epoch": 2.6407581732603993, + "ewc_loss": 0.050225578248500824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5112789444392547e-05, + "grad_norm": 31.384246826171875, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.885291576385498, + "num_tokens": 792141201.0, + "step": 20759 + }, + { + "epoch": 2.64088538353899, + "ewc_loss": 0.05008998513221741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5044992071343586e-05, + "grad_norm": 31.30259895324707, + "learning_rate": 1e-06, + "loss": 0.3611, + "mean_token_accuracy": 0.891251266002655, + "num_tokens": 792182908.0, + "step": 20760 + }, + { + "epoch": 2.6410125938175804, + "ewc_loss": 0.05021870136260986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5109351554419845e-05, + "grad_norm": 31.381763458251953, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8728238344192505, + "num_tokens": 792225223.0, + "step": 20761 + }, + { + "epoch": 2.641139804096171, + "ewc_loss": 0.05017998069524765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5089990231208503e-05, + "grad_norm": 31.261795043945312, + "learning_rate": 1e-06, + "loss": 0.47, + "mean_token_accuracy": 0.8552960157394409, + "num_tokens": 792258131.0, + "step": 20762 + }, + { + "epoch": 2.6412670143747614, + "ewc_loss": 0.05012749135494232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5063745852094144e-05, + "grad_norm": 31.339616775512695, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8775427341461182, + "num_tokens": 792291429.0, + "step": 20763 + }, + { + "epoch": 
2.641394224653352, + "ewc_loss": 0.05020450800657272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.510225385776721e-05, + "grad_norm": 31.29582405090332, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.8677971959114075, + "num_tokens": 792327777.0, + "step": 20764 + }, + { + "epoch": 2.6415214349319425, + "ewc_loss": 0.05016287788748741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5081439162022434e-05, + "grad_norm": 31.38581657409668, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8747764825820923, + "num_tokens": 792367918.0, + "step": 20765 + }, + { + "epoch": 2.641648645210533, + "ewc_loss": 0.05029391497373581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5146957341348752e-05, + "grad_norm": 31.486658096313477, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8759661316871643, + "num_tokens": 792405550.0, + "step": 20766 + }, + { + "epoch": 2.6417758554891235, + "ewc_loss": 0.05010376125574112, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5051880584214814e-05, + "grad_norm": 31.35336685180664, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.874373197555542, + "num_tokens": 792440948.0, + "step": 20767 + }, + { + "epoch": 2.641903065767714, + "ewc_loss": 0.05013503134250641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.506751479813829e-05, + "grad_norm": 31.373645782470703, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8798509836196899, + "num_tokens": 792488065.0, + "step": 20768 + }, + { + "epoch": 2.6420302760463046, + "ewc_loss": 0.05011995509266853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5059976906049997e-05, + "grad_norm": 31.390270233154297, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8796838521957397, + "num_tokens": 792522845.0, + "step": 20769 + }, + { + "epoch": 2.642157486324895, + "ewc_loss": 0.05009660869836807, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5048304451047443e-05, + "grad_norm": 31.40967559814453, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8825992941856384, + "num_tokens": 792561118.0, + "step": 20770 + }, + { + "epoch": 2.6422846966034856, + "ewc_loss": 0.050111692398786545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5055845981114544e-05, + "grad_norm": 31.40547752380371, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8722786903381348, + "num_tokens": 792600125.0, + "step": 20771 + }, + { + "epoch": 2.642411906882076, + "ewc_loss": 0.05010540038347244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5052700948435813e-05, + "grad_norm": 31.38776206970215, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8778694272041321, + "num_tokens": 792633856.0, + "step": 20772 + }, + { + "epoch": 2.6425391171606667, + "ewc_loss": 0.05018620565533638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509310252207797e-05, + "grad_norm": 31.40420150756836, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8695027232170105, + "num_tokens": 792672427.0, + "step": 20773 + }, + { + "epoch": 2.6426663274392572, + "ewc_loss": 0.05012902244925499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5064511646633036e-05, + "grad_norm": 31.468141555786133, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8503572940826416, + "num_tokens": 792713856.0, + "step": 20774 + }, + { + "epoch": 2.6427935377178478, + "ewc_loss": 0.050143420696258545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507171120669227e-05, + "grad_norm": 31.41242218017578, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8655234575271606, + "num_tokens": 792757602.0, + "step": 20775 + }, + { + "epoch": 2.6429207479964383, + "ewc_loss": 0.050142910331487656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507145472918637e-05, + "grad_norm": 31.481287002563477, + "learning_rate": 1e-06, + "loss": 0.4522, + 
"mean_token_accuracy": 0.8630931377410889, + "num_tokens": 792797067.0, + "step": 20776 + }, + { + "epoch": 2.6430479582750284, + "ewc_loss": 0.050117988139390945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5058994651772082e-05, + "grad_norm": 31.32642936706543, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8764471411705017, + "num_tokens": 792835748.0, + "step": 20777 + }, + { + "epoch": 2.6431751685536193, + "ewc_loss": 0.05004052817821503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5020264729391783e-05, + "grad_norm": 31.363136291503906, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8736139535903931, + "num_tokens": 792875039.0, + "step": 20778 + }, + { + "epoch": 2.6433023788322094, + "ewc_loss": 0.050198014825582504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509900696168188e-05, + "grad_norm": 31.39478302001953, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8778277635574341, + "num_tokens": 792916580.0, + "step": 20779 + }, + { + "epoch": 2.6434295891108004, + "ewc_loss": 0.05017462745308876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5087314497795887e-05, + "grad_norm": 31.44510841369629, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8749192953109741, + "num_tokens": 792952507.0, + "step": 20780 + }, + { + "epoch": 2.6435567993893905, + "ewc_loss": 0.050123825669288635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.506191231077537e-05, + "grad_norm": 31.38176155090332, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8753918409347534, + "num_tokens": 792994046.0, + "step": 20781 + }, + { + "epoch": 2.6436840096679814, + "ewc_loss": 0.05009172484278679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.504586154827848e-05, + "grad_norm": 31.26445770263672, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8903754353523254, + "num_tokens": 793033855.0, + "step": 20782 + }, + { + "epoch": 
2.6438112199465715, + "ewc_loss": 0.050193216651678085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5096607714658603e-05, + "grad_norm": 31.34089469909668, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8757622241973877, + "num_tokens": 793072226.0, + "step": 20783 + }, + { + "epoch": 2.643938430225162, + "ewc_loss": 0.05014621466398239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5073108190554194e-05, + "grad_norm": 31.408323287963867, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8765931129455566, + "num_tokens": 793110884.0, + "step": 20784 + }, + { + "epoch": 2.6440656405037526, + "ewc_loss": 0.05013888329267502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5069441107916646e-05, + "grad_norm": 31.343578338623047, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8549063205718994, + "num_tokens": 793151885.0, + "step": 20785 + }, + { + "epoch": 2.644192850782343, + "ewc_loss": 0.05019132420420647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509566184016876e-05, + "grad_norm": 31.384279251098633, + "learning_rate": 1e-06, + "loss": 0.4804, + "mean_token_accuracy": 0.8510781526565552, + "num_tokens": 793187253.0, + "step": 20786 + }, + { + "epoch": 2.6443200610609336, + "ewc_loss": 0.05024130642414093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5120652935584076e-05, + "grad_norm": 31.54072380065918, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8772531151771545, + "num_tokens": 793231249.0, + "step": 20787 + }, + { + "epoch": 2.644447271339524, + "ewc_loss": 0.05018205568194389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5091027055168524e-05, + "grad_norm": 31.382966995239258, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.8843035697937012, + "num_tokens": 793269918.0, + "step": 20788 + }, + { + "epoch": 2.6445744816181147, + "ewc_loss": 0.05007036775350571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5035184080479667e-05, + "grad_norm": 31.407896041870117, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8819247484207153, + "num_tokens": 793306856.0, + "step": 20789 + }, + { + "epoch": 2.6447016918967052, + "ewc_loss": 0.05013830587267876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5069153707590885e-05, + "grad_norm": 31.40802764892578, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8637635111808777, + "num_tokens": 793347866.0, + "step": 20790 + }, + { + "epoch": 2.6448289021752958, + "ewc_loss": 0.050103235989809036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5051618649740703e-05, + "grad_norm": 31.39891242980957, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8705687522888184, + "num_tokens": 793391037.0, + "step": 20791 + }, + { + "epoch": 2.6449561124538863, + "ewc_loss": 0.05018815025687218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5094075681408867e-05, + "grad_norm": 31.43208885192871, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.859063982963562, + "num_tokens": 793423560.0, + "step": 20792 + }, + { + "epoch": 2.645083322732477, + "ewc_loss": 0.05006597936153412, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.503299037925899e-05, + "grad_norm": 31.364200592041016, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.878460168838501, + "num_tokens": 793460215.0, + "step": 20793 + }, + { + "epoch": 2.6452105330110673, + "ewc_loss": 0.0501248762011528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5062437998712994e-05, + "grad_norm": 31.478113174438477, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.859745979309082, + "num_tokens": 793492671.0, + "step": 20794 + }, + { + "epoch": 2.645337743289658, + "ewc_loss": 0.05016402527689934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.508201214368455e-05, + "grad_norm": 31.475881576538086, + "learning_rate": 1e-06, + "loss": 0.4143, + 
"mean_token_accuracy": 0.8710780143737793, + "num_tokens": 793529526.0, + "step": 20795 + }, + { + "epoch": 2.6454649535682484, + "ewc_loss": 0.05008867010474205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5044335416168906e-05, + "grad_norm": 31.383970260620117, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.880436360836029, + "num_tokens": 793566748.0, + "step": 20796 + }, + { + "epoch": 2.645592163846839, + "ewc_loss": 0.050142496824264526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507124918338377e-05, + "grad_norm": 31.60828971862793, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8868250846862793, + "num_tokens": 793600703.0, + "step": 20797 + }, + { + "epoch": 2.6457193741254295, + "ewc_loss": 0.050118736922740936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5059369363589212e-05, + "grad_norm": 31.405080795288086, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8716272711753845, + "num_tokens": 793639226.0, + "step": 20798 + }, + { + "epoch": 2.64584658440402, + "ewc_loss": 0.049965422600507736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.498271169315558e-05, + "grad_norm": 31.530954360961914, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8859548568725586, + "num_tokens": 793676147.0, + "step": 20799 + }, + { + "epoch": 2.6459737946826105, + "ewc_loss": 0.05019059777259827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509529804228805e-05, + "grad_norm": 31.420560836791992, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8823713660240173, + "num_tokens": 793715778.0, + "step": 20800 + }, + { + "epoch": 2.646101004961201, + "ewc_loss": 0.05010617524385452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505308839317877e-05, + "grad_norm": 31.520605087280273, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8885611891746521, + "num_tokens": 793752096.0, + "step": 20801 + }, + { + "epoch": 
2.646228215239791, + "ewc_loss": 0.05009680613875389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5048402676475234e-05, + "grad_norm": 31.47060203552246, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8678916692733765, + "num_tokens": 793789809.0, + "step": 20802 + }, + { + "epoch": 2.646355425518382, + "ewc_loss": 0.05008242279291153, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5041212211363018e-05, + "grad_norm": 31.3857421875, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8694280982017517, + "num_tokens": 793826505.0, + "step": 20803 + }, + { + "epoch": 2.646482635796972, + "ewc_loss": 0.05012645572423935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5063227440114133e-05, + "grad_norm": 31.538436889648438, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8720288276672363, + "num_tokens": 793868360.0, + "step": 20804 + }, + { + "epoch": 2.646609846075563, + "ewc_loss": 0.05013178661465645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.506589407857973e-05, + "grad_norm": 31.48501968383789, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8746293783187866, + "num_tokens": 793906566.0, + "step": 20805 + }, + { + "epoch": 2.6467370563541532, + "ewc_loss": 0.05003664270043373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5018322048708797e-05, + "grad_norm": 31.49738121032715, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8707389831542969, + "num_tokens": 793947814.0, + "step": 20806 + }, + { + "epoch": 2.6468642666327438, + "ewc_loss": 0.0499453991651535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4972699975478463e-05, + "grad_norm": 31.36068344116211, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8780609965324402, + "num_tokens": 793987391.0, + "step": 20807 + }, + { + "epoch": 2.6469914769113343, + "ewc_loss": 0.05006980523467064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5034902137122117e-05, 
+ "grad_norm": 31.484281539916992, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.8834009170532227, + "num_tokens": 794024045.0, + "step": 20808 + }, + { + "epoch": 2.647118687189925, + "ewc_loss": 0.05009781941771507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5048910174518824e-05, + "grad_norm": 31.415788650512695, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8770723342895508, + "num_tokens": 794063405.0, + "step": 20809 + }, + { + "epoch": 2.6472458974685154, + "ewc_loss": 0.04999890178442001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.4999450033647008e-05, + "grad_norm": 31.416772842407227, + "learning_rate": 1e-06, + "loss": 0.4843, + "mean_token_accuracy": 0.8487600684165955, + "num_tokens": 794099729.0, + "step": 20810 + }, + { + "epoch": 2.647373107747106, + "ewc_loss": 0.050093647092580795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5046823793672957e-05, + "grad_norm": 31.478242874145508, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8657582998275757, + "num_tokens": 794139669.0, + "step": 20811 + }, + { + "epoch": 2.6475003180256964, + "ewc_loss": 0.049995679408311844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.499784022802487e-05, + "grad_norm": 31.359033584594727, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8743473291397095, + "num_tokens": 794176906.0, + "step": 20812 + }, + { + "epoch": 2.647627528304287, + "ewc_loss": 0.05005228519439697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.502614188415464e-05, + "grad_norm": 31.438365936279297, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8756330609321594, + "num_tokens": 794216594.0, + "step": 20813 + }, + { + "epoch": 2.6477547385828775, + "ewc_loss": 0.05009060353040695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.504530129954219e-05, + "grad_norm": 31.4223690032959, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 
0.8763881921768188, + "num_tokens": 794248379.0, + "step": 20814 + }, + { + "epoch": 2.647881948861468, + "ewc_loss": 0.05008784681558609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.50439225055743e-05, + "grad_norm": 31.431419372558594, + "learning_rate": 1e-06, + "loss": 0.4793, + "mean_token_accuracy": 0.8495902419090271, + "num_tokens": 794293604.0, + "step": 20815 + }, + { + "epoch": 2.6480091591400585, + "ewc_loss": 0.05014948174357414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507474164303858e-05, + "grad_norm": 31.400897979736328, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8633060455322266, + "num_tokens": 794332084.0, + "step": 20816 + }, + { + "epoch": 2.648136369418649, + "ewc_loss": 0.050169602036476135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5084800654440187e-05, + "grad_norm": 31.45952606201172, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8887108564376831, + "num_tokens": 794369158.0, + "step": 20817 + }, + { + "epoch": 2.6482635796972396, + "ewc_loss": 0.0501541905105114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5077095415326767e-05, + "grad_norm": 31.352863311767578, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8770936131477356, + "num_tokens": 794410531.0, + "step": 20818 + }, + { + "epoch": 2.64839078997583, + "ewc_loss": 0.05018969997763634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5094850570894778e-05, + "grad_norm": 31.501419067382812, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8728920817375183, + "num_tokens": 794457556.0, + "step": 20819 + }, + { + "epoch": 2.6485180002544206, + "ewc_loss": 0.05022905021905899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.511452476028353e-05, + "grad_norm": 31.387161254882812, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.861162543296814, + "num_tokens": 794492737.0, + "step": 20820 + }, + { + "epoch": 2.648645210533011, + "ewc_loss": 
0.05018555000424385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5092775103985332e-05, + "grad_norm": 31.47819709777832, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8829805850982666, + "num_tokens": 794532589.0, + "step": 20821 + }, + { + "epoch": 2.6487724208116017, + "ewc_loss": 0.05016995966434479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5084980734391138e-05, + "grad_norm": 31.37111473083496, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.861885666847229, + "num_tokens": 794566982.0, + "step": 20822 + }, + { + "epoch": 2.648899631090192, + "ewc_loss": 0.05010904371738434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5054521756828763e-05, + "grad_norm": 31.407299041748047, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8562906980514526, + "num_tokens": 794604278.0, + "step": 20823 + }, + { + "epoch": 2.6490268413687827, + "ewc_loss": 0.0501844547688961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5092227588174865e-05, + "grad_norm": 31.419649124145508, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8817102313041687, + "num_tokens": 794643785.0, + "step": 20824 + }, + { + "epoch": 2.649154051647373, + "ewc_loss": 0.0501602403819561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5080120394704863e-05, + "grad_norm": 31.371601104736328, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8820698857307434, + "num_tokens": 794678302.0, + "step": 20825 + }, + { + "epoch": 2.649281261925964, + "ewc_loss": 0.05015261471271515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507630779291503e-05, + "grad_norm": 31.361967086791992, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8620543479919434, + "num_tokens": 794715732.0, + "step": 20826 + }, + { + "epoch": 2.649408472204554, + "ewc_loss": 0.050193946808576584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5096973331528716e-05, + "grad_norm": 
31.413312911987305, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.877190113067627, + "num_tokens": 794754359.0, + "step": 20827 + }, + { + "epoch": 2.649535682483145, + "ewc_loss": 0.050239913165569305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5119956262642518e-05, + "grad_norm": 31.484445571899414, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8696379661560059, + "num_tokens": 794795770.0, + "step": 20828 + }, + { + "epoch": 2.649662892761735, + "ewc_loss": 0.050184253603219986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509212754375767e-05, + "grad_norm": 31.353193283081055, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8651435375213623, + "num_tokens": 794835654.0, + "step": 20829 + }, + { + "epoch": 2.649790103040326, + "ewc_loss": 0.05014887824654579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507443969079759e-05, + "grad_norm": 31.38939666748047, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8748712539672852, + "num_tokens": 794872527.0, + "step": 20830 + }, + { + "epoch": 2.649917313318916, + "ewc_loss": 0.05024255812168121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5121278667938896e-05, + "grad_norm": 31.49248695373535, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8765140175819397, + "num_tokens": 794918577.0, + "step": 20831 + }, + { + "epoch": 2.6500445235975065, + "ewc_loss": 0.050125379115343094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5062689019250683e-05, + "grad_norm": 31.43994140625, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8824711441993713, + "num_tokens": 794953566.0, + "step": 20832 + }, + { + "epoch": 2.650171733876097, + "ewc_loss": 0.05013728141784668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5068640752579086e-05, + "grad_norm": 31.551176071166992, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8790953755378723, + 
"num_tokens": 794990547.0, + "step": 20833 + }, + { + "epoch": 2.6502989441546876, + "ewc_loss": 0.05010072886943817, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505036536604166e-05, + "grad_norm": 31.462976455688477, + "learning_rate": 1e-06, + "loss": 0.3643, + "mean_token_accuracy": 0.8894041180610657, + "num_tokens": 795028419.0, + "step": 20834 + }, + { + "epoch": 2.650426154433278, + "ewc_loss": 0.05010563135147095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5052815544768237e-05, + "grad_norm": 31.458166122436523, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8885334730148315, + "num_tokens": 795064303.0, + "step": 20835 + }, + { + "epoch": 2.6505533647118686, + "ewc_loss": 0.050147030502557755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507351564418059e-05, + "grad_norm": 31.42365837097168, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8514244556427002, + "num_tokens": 795104280.0, + "step": 20836 + }, + { + "epoch": 2.650680574990459, + "ewc_loss": 0.050146132707595825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5073066353797913e-05, + "grad_norm": 31.52216148376465, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8634558320045471, + "num_tokens": 795142547.0, + "step": 20837 + }, + { + "epoch": 2.6508077852690497, + "ewc_loss": 0.05011764168739319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505882002878934e-05, + "grad_norm": 31.26814079284668, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8702042698860168, + "num_tokens": 795186193.0, + "step": 20838 + }, + { + "epoch": 2.6509349955476402, + "ewc_loss": 0.050109609961509705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5054805519175716e-05, + "grad_norm": 31.486377716064453, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8897581100463867, + "num_tokens": 795215453.0, + "step": 20839 + }, + { + "epoch": 2.6510622058262308, + "ewc_loss": 
0.050181373953819275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509068690415006e-05, + "grad_norm": 31.316444396972656, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.86836838722229, + "num_tokens": 795252820.0, + "step": 20840 + }, + { + "epoch": 2.6511894161048213, + "ewc_loss": 0.050100844353437424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505042175471317e-05, + "grad_norm": 31.45037269592285, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8705264925956726, + "num_tokens": 795290000.0, + "step": 20841 + }, + { + "epoch": 2.651316626383412, + "ewc_loss": 0.05025206878781319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5126033506239764e-05, + "grad_norm": 31.391250610351562, + "learning_rate": 1e-06, + "loss": 0.3603, + "mean_token_accuracy": 0.8903736472129822, + "num_tokens": 795330389.0, + "step": 20842 + }, + { + "epoch": 2.6514438366620023, + "ewc_loss": 0.05012979730963707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5064899091375992e-05, + "grad_norm": 31.336992263793945, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8705482482910156, + "num_tokens": 795366693.0, + "step": 20843 + }, + { + "epoch": 2.651571046940593, + "ewc_loss": 0.05022381246089935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5111905415542424e-05, + "grad_norm": 31.419464111328125, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8750431537628174, + "num_tokens": 795405748.0, + "step": 20844 + }, + { + "epoch": 2.6516982572191834, + "ewc_loss": 0.050253260880708694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5126630134764127e-05, + "grad_norm": 31.35838508605957, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8837980031967163, + "num_tokens": 795446179.0, + "step": 20845 + }, + { + "epoch": 2.651825467497774, + "ewc_loss": 0.05017564818263054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.508782381482888e-05, + "grad_norm": 
31.50997543334961, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8683826923370361, + "num_tokens": 795488234.0, + "step": 20846 + }, + { + "epoch": 2.6519526777763645, + "ewc_loss": 0.05031462386250496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.515731284802314e-05, + "grad_norm": 31.504985809326172, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8698784708976746, + "num_tokens": 795526508.0, + "step": 20847 + }, + { + "epoch": 2.652079888054955, + "ewc_loss": 0.050096333026885986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5048166207852773e-05, + "grad_norm": 31.39767837524414, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.879524827003479, + "num_tokens": 795566299.0, + "step": 20848 + }, + { + "epoch": 2.6522070983335455, + "ewc_loss": 0.050214964896440506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5107481633313e-05, + "grad_norm": 31.470096588134766, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8693798780441284, + "num_tokens": 795603892.0, + "step": 20849 + }, + { + "epoch": 2.6523343086121356, + "ewc_loss": 0.05013090744614601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5065453883144073e-05, + "grad_norm": 31.417713165283203, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8807740211486816, + "num_tokens": 795639622.0, + "step": 20850 + }, + { + "epoch": 2.6524615188907266, + "ewc_loss": 0.05016645789146423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5083229047595523e-05, + "grad_norm": 31.511096954345703, + "learning_rate": 1e-06, + "loss": 0.3489, + "mean_token_accuracy": 0.8947426080703735, + "num_tokens": 795677299.0, + "step": 20851 + }, + { + "epoch": 2.6525887291693167, + "ewc_loss": 0.05015397444367409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5076988094951957e-05, + "grad_norm": 31.430011749267578, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8733441233634949, + 
"num_tokens": 795714304.0, + "step": 20852 + }, + { + "epoch": 2.6527159394479076, + "ewc_loss": 0.05013617128133774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5068085960811004e-05, + "grad_norm": 31.43749237060547, + "learning_rate": 1e-06, + "loss": 0.4291, + "mean_token_accuracy": 0.8690264225006104, + "num_tokens": 795752330.0, + "step": 20853 + }, + { + "epoch": 2.6528431497264977, + "ewc_loss": 0.05008382722735405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5041914341272786e-05, + "grad_norm": 31.362171173095703, + "learning_rate": 1e-06, + "loss": 0.4764, + "mean_token_accuracy": 0.8531132936477661, + "num_tokens": 795789503.0, + "step": 20854 + }, + { + "epoch": 2.6529703600050887, + "ewc_loss": 0.0501989983022213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509949990781024e-05, + "grad_norm": 31.461559295654297, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8718875050544739, + "num_tokens": 795828449.0, + "step": 20855 + }, + { + "epoch": 2.6530975702836788, + "ewc_loss": 0.05005243420600891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5026216462720186e-05, + "grad_norm": 31.278419494628906, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8735626935958862, + "num_tokens": 795868454.0, + "step": 20856 + }, + { + "epoch": 2.6532247805622693, + "ewc_loss": 0.05014225095510483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507112549210433e-05, + "grad_norm": 31.404630661010742, + "learning_rate": 1e-06, + "loss": 0.4753, + "mean_token_accuracy": 0.8593018054962158, + "num_tokens": 795904629.0, + "step": 20857 + }, + { + "epoch": 2.65335199084086, + "ewc_loss": 0.05020732432603836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5103661755565554e-05, + "grad_norm": 31.38144302368164, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.8587172627449036, + "num_tokens": 795943197.0, + "step": 20858 + }, + { + "epoch": 2.6534792011194503, + "ewc_loss": 
0.050109248608350754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.505462362023536e-05, + "grad_norm": 31.251699447631836, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8689332008361816, + "num_tokens": 795984672.0, + "step": 20859 + }, + { + "epoch": 2.653606411398041, + "ewc_loss": 0.05020815134048462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5104076485149562e-05, + "grad_norm": 31.418048858642578, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.8547458052635193, + "num_tokens": 796022599.0, + "step": 20860 + }, + { + "epoch": 2.6537336216766314, + "ewc_loss": 0.05016972869634628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.508486431906931e-05, + "grad_norm": 31.3096923828125, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8707146644592285, + "num_tokens": 796057798.0, + "step": 20861 + }, + { + "epoch": 2.653860831955222, + "ewc_loss": 0.050230830907821655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5115416065091267e-05, + "grad_norm": 31.410608291625977, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8722714185714722, + "num_tokens": 796094928.0, + "step": 20862 + }, + { + "epoch": 2.6539880422338125, + "ewc_loss": 0.050256144255399704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512807259336114e-05, + "grad_norm": 31.45694351196289, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8748408555984497, + "num_tokens": 796132614.0, + "step": 20863 + }, + { + "epoch": 2.654115252512403, + "ewc_loss": 0.05022765323519707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5113826268352568e-05, + "grad_norm": 31.33701515197754, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8713034391403198, + "num_tokens": 796171653.0, + "step": 20864 + }, + { + "epoch": 2.6542424627909935, + "ewc_loss": 0.050195470452308655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.50977354880888e-05, + "grad_norm": 
31.272436141967773, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8663744926452637, + "num_tokens": 796215218.0, + "step": 20865 + }, + { + "epoch": 2.654369673069584, + "ewc_loss": 0.05033615976572037, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5168079446302727e-05, + "grad_norm": 31.37749671936035, + "learning_rate": 1e-06, + "loss": 0.3792, + "mean_token_accuracy": 0.8841109275817871, + "num_tokens": 796249165.0, + "step": 20866 + }, + { + "epoch": 2.6544968833481746, + "ewc_loss": 0.05033393204212189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5166966224787757e-05, + "grad_norm": 31.505529403686523, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.870187520980835, + "num_tokens": 796283878.0, + "step": 20867 + }, + { + "epoch": 2.654624093626765, + "ewc_loss": 0.05025184527039528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5125922547886148e-05, + "grad_norm": 31.27828598022461, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8604159355163574, + "num_tokens": 796330760.0, + "step": 20868 + }, + { + "epoch": 2.6547513039053556, + "ewc_loss": 0.050308097153902054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5154047762043774e-05, + "grad_norm": 31.549116134643555, + "learning_rate": 1e-06, + "loss": 0.4676, + "mean_token_accuracy": 0.8617953658103943, + "num_tokens": 796371547.0, + "step": 20869 + }, + { + "epoch": 2.654878514183946, + "ewc_loss": 0.050376325845718384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5188162908307277e-05, + "grad_norm": 31.444398880004883, + "learning_rate": 1e-06, + "loss": 0.4609, + "mean_token_accuracy": 0.8568146228790283, + "num_tokens": 796409893.0, + "step": 20870 + }, + { + "epoch": 2.6550057244625367, + "ewc_loss": 0.05019502341747284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5097511752392165e-05, + "grad_norm": 31.492431640625, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8573362827301025, + 
"num_tokens": 796442493.0, + "step": 20871 + }, + { + "epoch": 2.655132934741127, + "ewc_loss": 0.05028210207819939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.514105108275544e-05, + "grad_norm": 31.50602912902832, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8746834397315979, + "num_tokens": 796480853.0, + "step": 20872 + }, + { + "epoch": 2.6552601450197177, + "ewc_loss": 0.05011353641748428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5056768208742142e-05, + "grad_norm": 31.413429260253906, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8786909580230713, + "num_tokens": 796520057.0, + "step": 20873 + }, + { + "epoch": 2.6553873552983083, + "ewc_loss": 0.050230562686920166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5115281459875405e-05, + "grad_norm": 31.533544540405273, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8768531680107117, + "num_tokens": 796549779.0, + "step": 20874 + }, + { + "epoch": 2.6555145655768984, + "ewc_loss": 0.050167638808488846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5083820219151676e-05, + "grad_norm": 31.41057014465332, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8779686093330383, + "num_tokens": 796590923.0, + "step": 20875 + }, + { + "epoch": 2.6556417758554893, + "ewc_loss": 0.05022997409105301, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.511498678359203e-05, + "grad_norm": 31.403318405151367, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8680704832077026, + "num_tokens": 796629455.0, + "step": 20876 + }, + { + "epoch": 2.6557689861340794, + "ewc_loss": 0.05028187483549118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5140936486423016e-05, + "grad_norm": 31.431194305419922, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8692127466201782, + "num_tokens": 796669950.0, + "step": 20877 + }, + { + "epoch": 2.6558961964126704, + "ewc_loss": 
0.050244931131601334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512246646801941e-05, + "grad_norm": 31.333524703979492, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8621379733085632, + "num_tokens": 796712263.0, + "step": 20878 + }, + { + "epoch": 2.6560234066912605, + "ewc_loss": 0.050352804362773895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517640132282395e-05, + "grad_norm": 31.497745513916016, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8716592788696289, + "num_tokens": 796753025.0, + "step": 20879 + }, + { + "epoch": 2.6561506169698514, + "ewc_loss": 0.050217483192682266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5108742192969657e-05, + "grad_norm": 31.375347137451172, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8843636512756348, + "num_tokens": 796788810.0, + "step": 20880 + }, + { + "epoch": 2.6562778272484415, + "ewc_loss": 0.05019862577319145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5099312551901676e-05, + "grad_norm": 31.415802001953125, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8749682307243347, + "num_tokens": 796826275.0, + "step": 20881 + }, + { + "epoch": 2.656405037527032, + "ewc_loss": 0.050258707255125046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5129353161901236e-05, + "grad_norm": 31.3449764251709, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8663249611854553, + "num_tokens": 796859587.0, + "step": 20882 + }, + { + "epoch": 2.6565322478056226, + "ewc_loss": 0.05025187507271767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5125937099801376e-05, + "grad_norm": 31.378440856933594, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8779595494270325, + "num_tokens": 796899440.0, + "step": 20883 + }, + { + "epoch": 2.656659458084213, + "ewc_loss": 0.05030251666903496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5151257432298735e-05, + "grad_norm": 
31.463472366333008, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8612661361694336, + "num_tokens": 796938703.0, + "step": 20884 + }, + { + "epoch": 2.6567866683628036, + "ewc_loss": 0.050268493592739105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5134246243396774e-05, + "grad_norm": 31.42865753173828, + "learning_rate": 1e-06, + "loss": 0.4623, + "mean_token_accuracy": 0.8585240244865417, + "num_tokens": 796982287.0, + "step": 20885 + }, + { + "epoch": 2.656913878641394, + "ewc_loss": 0.05021039769053459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.510519880161155e-05, + "grad_norm": 31.411039352416992, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8741500377655029, + "num_tokens": 797017640.0, + "step": 20886 + }, + { + "epoch": 2.6570410889199847, + "ewc_loss": 0.050404030829668045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520201451261528e-05, + "grad_norm": 31.498743057250977, + "learning_rate": 1e-06, + "loss": 0.4626, + "mean_token_accuracy": 0.8566094040870667, + "num_tokens": 797054050.0, + "step": 20887 + }, + { + "epoch": 2.6571682991985752, + "ewc_loss": 0.050173353403806686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5086676032515243e-05, + "grad_norm": 31.323972702026367, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8759588599205017, + "num_tokens": 797092429.0, + "step": 20888 + }, + { + "epoch": 2.6572955094771658, + "ewc_loss": 0.050167303532361984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5083651053137146e-05, + "grad_norm": 31.414566040039062, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8688594102859497, + "num_tokens": 797125646.0, + "step": 20889 + }, + { + "epoch": 2.6574227197557563, + "ewc_loss": 0.05031687766313553, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5158438802463934e-05, + "grad_norm": 31.476274490356445, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8746860027313232, + 
"num_tokens": 797164560.0, + "step": 20890 + }, + { + "epoch": 2.657549930034347, + "ewc_loss": 0.0502348467707634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5117424229392782e-05, + "grad_norm": 31.38505744934082, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8711277842521667, + "num_tokens": 797194168.0, + "step": 20891 + }, + { + "epoch": 2.6576771403129373, + "ewc_loss": 0.0503346212208271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5167310013785027e-05, + "grad_norm": 31.479881286621094, + "learning_rate": 1e-06, + "loss": 0.3624, + "mean_token_accuracy": 0.8913859724998474, + "num_tokens": 797227045.0, + "step": 20892 + }, + { + "epoch": 2.657804350591528, + "ewc_loss": 0.05032196640968323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5160983568639494e-05, + "grad_norm": 31.4601993560791, + "learning_rate": 1e-06, + "loss": 0.3759, + "mean_token_accuracy": 0.8861956596374512, + "num_tokens": 797267607.0, + "step": 20893 + }, + { + "epoch": 2.6579315608701184, + "ewc_loss": 0.05024467781186104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5122339138761163e-05, + "grad_norm": 31.372365951538086, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8612246513366699, + "num_tokens": 797309320.0, + "step": 20894 + }, + { + "epoch": 2.658058771148709, + "ewc_loss": 0.05022019147872925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5110095521085896e-05, + "grad_norm": 31.395614624023438, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8695218563079834, + "num_tokens": 797348258.0, + "step": 20895 + }, + { + "epoch": 2.6581859814272994, + "ewc_loss": 0.05034893751144409, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5174469556077383e-05, + "grad_norm": 31.46168327331543, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.872210681438446, + "num_tokens": 797391563.0, + "step": 20896 + }, + { + "epoch": 2.65831319170589, + "ewc_loss": 0.05034227296710014, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5171137167490087e-05, + "grad_norm": 31.476974487304688, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8804595470428467, + "num_tokens": 797428314.0, + "step": 20897 + }, + { + "epoch": 2.6584404019844805, + "ewc_loss": 0.050345566123723984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5172783352900296e-05, + "grad_norm": 31.44141387939453, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8520221710205078, + "num_tokens": 797464480.0, + "step": 20898 + }, + { + "epoch": 2.658567612263071, + "ewc_loss": 0.05026994273066521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.513497202016879e-05, + "grad_norm": 31.472673416137695, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8731330037117004, + "num_tokens": 797504135.0, + "step": 20899 + }, + { + "epoch": 2.658694822541661, + "ewc_loss": 0.05028389021754265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.514194420655258e-05, + "grad_norm": 31.34290313720703, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.874925971031189, + "num_tokens": 797544222.0, + "step": 20900 + }, + { + "epoch": 2.658822032820252, + "ewc_loss": 0.05023534223437309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5117671611951664e-05, + "grad_norm": 31.53076934814453, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8781472444534302, + "num_tokens": 797583695.0, + "step": 20901 + }, + { + "epoch": 2.658949243098842, + "ewc_loss": 0.05025361850857735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5126808395725675e-05, + "grad_norm": 31.369199752807617, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8590586185455322, + "num_tokens": 797625660.0, + "step": 20902 + }, + { + "epoch": 2.659076453377433, + "ewc_loss": 0.05022663623094559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.511331877030898e-05, + "grad_norm": 31.460834503173828, + 
"learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8707892894744873, + "num_tokens": 797665807.0, + "step": 20903 + }, + { + "epoch": 2.6592036636560232, + "ewc_loss": 0.05022924393415451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5114621166721918e-05, + "grad_norm": 31.4475154876709, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8552417159080505, + "num_tokens": 797707935.0, + "step": 20904 + }, + { + "epoch": 2.6593308739346138, + "ewc_loss": 0.050176072865724564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5088036636589095e-05, + "grad_norm": 31.360668182373047, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8651294708251953, + "num_tokens": 797745502.0, + "step": 20905 + }, + { + "epoch": 2.6594580842132043, + "ewc_loss": 0.05024661496281624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512330684112385e-05, + "grad_norm": 31.51803970336914, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8701364398002625, + "num_tokens": 797779412.0, + "step": 20906 + }, + { + "epoch": 2.659585294491795, + "ewc_loss": 0.05024757236242294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5123787054326385e-05, + "grad_norm": 31.458946228027344, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.875514566898346, + "num_tokens": 797816677.0, + "step": 20907 + }, + { + "epoch": 2.6597125047703853, + "ewc_loss": 0.0501532219350338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5076611564145423e-05, + "grad_norm": 31.557050704956055, + "learning_rate": 1e-06, + "loss": 0.4677, + "mean_token_accuracy": 0.8540834188461304, + "num_tokens": 797854260.0, + "step": 20908 + }, + { + "epoch": 2.659839715048976, + "ewc_loss": 0.05021152272820473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5105760869337246e-05, + "grad_norm": 31.330209732055664, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8591100573539734, + "num_tokens": 797892349.0, 
+ "step": 20909 + }, + { + "epoch": 2.6599669253275664, + "ewc_loss": 0.05014067515730858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5070337869692594e-05, + "grad_norm": 31.447710037231445, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8753868341445923, + "num_tokens": 797928028.0, + "step": 20910 + }, + { + "epoch": 2.660094135606157, + "ewc_loss": 0.050291579216718674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5145789550151676e-05, + "grad_norm": 31.4229736328125, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.8576204776763916, + "num_tokens": 797963885.0, + "step": 20911 + }, + { + "epoch": 2.6602213458847475, + "ewc_loss": 0.05021503567695618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.510751801310107e-05, + "grad_norm": 31.41449546813965, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.859004020690918, + "num_tokens": 798009842.0, + "step": 20912 + }, + { + "epoch": 2.660348556163338, + "ewc_loss": 0.05031635984778404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.515818050596863e-05, + "grad_norm": 31.563295364379883, + "learning_rate": 1e-06, + "loss": 0.3741, + "mean_token_accuracy": 0.8842199444770813, + "num_tokens": 798043207.0, + "step": 20913 + }, + { + "epoch": 2.6604757664419285, + "ewc_loss": 0.05020721256732941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5103605366894044e-05, + "grad_norm": 31.537508010864258, + "learning_rate": 1e-06, + "loss": 0.3858, + "mean_token_accuracy": 0.8879917860031128, + "num_tokens": 798080887.0, + "step": 20914 + }, + { + "epoch": 2.660602976720519, + "ewc_loss": 0.050214752554893494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5107376131927595e-05, + "grad_norm": 31.552074432373047, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8812812566757202, + "num_tokens": 798126393.0, + "step": 20915 + }, + { + "epoch": 2.6607301869991096, + "ewc_loss": 0.050195664167404175, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.509783189452719e-05, + "grad_norm": 31.49627685546875, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8875577449798584, + "num_tokens": 798166099.0, + "step": 20916 + }, + { + "epoch": 2.6608573972777, + "ewc_loss": 0.05023803934454918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.511901948309969e-05, + "grad_norm": 31.59688949584961, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8654881119728088, + "num_tokens": 798202285.0, + "step": 20917 + }, + { + "epoch": 2.6609846075562906, + "ewc_loss": 0.05012520030140877, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.506259988876991e-05, + "grad_norm": 31.443941116333008, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8795287609100342, + "num_tokens": 798243463.0, + "step": 20918 + }, + { + "epoch": 2.661111817834881, + "ewc_loss": 0.05014997348189354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5074987206608057e-05, + "grad_norm": 31.631134033203125, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.873855710029602, + "num_tokens": 798275507.0, + "step": 20919 + }, + { + "epoch": 2.6612390281134717, + "ewc_loss": 0.05012139305472374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.50606972258538e-05, + "grad_norm": 31.466283798217773, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8906359672546387, + "num_tokens": 798310536.0, + "step": 20920 + }, + { + "epoch": 2.661366238392062, + "ewc_loss": 0.05011797323822975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5058987375814468e-05, + "grad_norm": 31.443103790283203, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8673732876777649, + "num_tokens": 798347874.0, + "step": 20921 + }, + { + "epoch": 2.6614934486706527, + "ewc_loss": 0.050164844840765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5082423235289752e-05, + "grad_norm": 31.442493438720703, + "learning_rate": 1e-06, + "loss": 0.3441, 
+ "mean_token_accuracy": 0.8959415555000305, + "num_tokens": 798381865.0, + "step": 20922 + }, + { + "epoch": 2.661620658949243, + "ewc_loss": 0.050101350992918015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5050674594240263e-05, + "grad_norm": 31.378551483154297, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8702613115310669, + "num_tokens": 798423378.0, + "step": 20923 + }, + { + "epoch": 2.661747869227834, + "ewc_loss": 0.05023924261331558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5119621568592265e-05, + "grad_norm": 31.54750633239746, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8789447546005249, + "num_tokens": 798460278.0, + "step": 20924 + }, + { + "epoch": 2.661875079506424, + "ewc_loss": 0.050201285630464554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5100642233155668e-05, + "grad_norm": 31.360063552856445, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8731903433799744, + "num_tokens": 798496622.0, + "step": 20925 + }, + { + "epoch": 2.662002289785015, + "ewc_loss": 0.05009864643216133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.504932308511343e-05, + "grad_norm": 31.418960571289062, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8676484823226929, + "num_tokens": 798533061.0, + "step": 20926 + }, + { + "epoch": 2.662129500063605, + "ewc_loss": 0.050272662192583084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5136330805253237e-05, + "grad_norm": 31.622220993041992, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8640313148498535, + "num_tokens": 798573689.0, + "step": 20927 + }, + { + "epoch": 2.662256710342196, + "ewc_loss": 0.05014955997467041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5074779841816053e-05, + "grad_norm": 31.294132232666016, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8820160627365112, + "num_tokens": 798614058.0, + "step": 20928 + }, + { + "epoch": 
2.662383920620786, + "ewc_loss": 0.050126947462558746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5063473003683612e-05, + "grad_norm": 31.570695877075195, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8612996935844421, + "num_tokens": 798657769.0, + "step": 20929 + }, + { + "epoch": 2.6625111308993765, + "ewc_loss": 0.050201088190078735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5100544007727876e-05, + "grad_norm": 31.41854476928711, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8752850294113159, + "num_tokens": 798701084.0, + "step": 20930 + }, + { + "epoch": 2.662638341177967, + "ewc_loss": 0.05004233494400978, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5021166948135942e-05, + "grad_norm": 31.40443992614746, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8772962093353271, + "num_tokens": 798738029.0, + "step": 20931 + }, + { + "epoch": 2.6627655514565576, + "ewc_loss": 0.050149258226156235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.507462886569556e-05, + "grad_norm": 31.455522537231445, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8673454523086548, + "num_tokens": 798773537.0, + "step": 20932 + }, + { + "epoch": 2.662892761735148, + "ewc_loss": 0.05017728731036186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.508864417904988e-05, + "grad_norm": 31.54882049560547, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8692269325256348, + "num_tokens": 798810397.0, + "step": 20933 + }, + { + "epoch": 2.6630199720137386, + "ewc_loss": 0.05020277947187424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5101389837800525e-05, + "grad_norm": 31.51321029663086, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8678448796272278, + "num_tokens": 798850796.0, + "step": 20934 + }, + { + "epoch": 2.663147182292329, + "ewc_loss": 0.05012933164834976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5064666260732338e-05, + "grad_norm": 31.498517990112305, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8708879351615906, + "num_tokens": 798892379.0, + "step": 20935 + }, + { + "epoch": 2.6632743925709197, + "ewc_loss": 0.05013584718108177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.506792407075409e-05, + "grad_norm": 31.434612274169922, + "learning_rate": 1e-06, + "loss": 0.4508, + "mean_token_accuracy": 0.8606258630752563, + "num_tokens": 798929444.0, + "step": 20936 + }, + { + "epoch": 2.66340160284951, + "ewc_loss": 0.05014825984835625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5074130462598987e-05, + "grad_norm": 31.489173889160156, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8784440159797668, + "num_tokens": 798964086.0, + "step": 20937 + }, + { + "epoch": 2.6635288131281007, + "ewc_loss": 0.05013883113861084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5069415642064996e-05, + "grad_norm": 31.48862648010254, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8804535865783691, + "num_tokens": 799007408.0, + "step": 20938 + }, + { + "epoch": 2.6636560234066913, + "ewc_loss": 0.05012185126543045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5060926418518648e-05, + "grad_norm": 31.36301040649414, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8759040832519531, + "num_tokens": 799045930.0, + "step": 20939 + }, + { + "epoch": 2.663783233685282, + "ewc_loss": 0.05010166019201279, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5050831027328968e-05, + "grad_norm": 31.494075775146484, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8826335668563843, + "num_tokens": 799083568.0, + "step": 20940 + }, + { + "epoch": 2.6639104439638723, + "ewc_loss": 0.05016540363430977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5082701540668495e-05, + "grad_norm": 31.431047439575195, + "learning_rate": 1e-06, + "loss": 0.428, + 
"mean_token_accuracy": 0.8689360618591309, + "num_tokens": 799123681.0, + "step": 20941 + }, + { + "epoch": 2.664037654242463, + "ewc_loss": 0.050085727125406265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5042863853741437e-05, + "grad_norm": 31.470666885375977, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8672232031822205, + "num_tokens": 799164623.0, + "step": 20942 + }, + { + "epoch": 2.6641648645210534, + "ewc_loss": 0.050166964530944824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5083481887122616e-05, + "grad_norm": 31.402755737304688, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.871092677116394, + "num_tokens": 799200408.0, + "step": 20943 + }, + { + "epoch": 2.664292074799644, + "ewc_loss": 0.05020883306860924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5104416636168025e-05, + "grad_norm": 31.496244430541992, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8608428239822388, + "num_tokens": 799245106.0, + "step": 20944 + }, + { + "epoch": 2.6644192850782344, + "ewc_loss": 0.05020211264491081, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5101056962739676e-05, + "grad_norm": 31.396800994873047, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8788198232650757, + "num_tokens": 799290429.0, + "step": 20945 + }, + { + "epoch": 2.664546495356825, + "ewc_loss": 0.050150178372859955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5075089070014656e-05, + "grad_norm": 31.395139694213867, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8704409599304199, + "num_tokens": 799322951.0, + "step": 20946 + }, + { + "epoch": 2.6646737056354155, + "ewc_loss": 0.0501818023622036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509090154489968e-05, + "grad_norm": 31.392669677734375, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.868333101272583, + "num_tokens": 799365498.0, + "step": 20947 + }, + { + "epoch": 
2.6648009159140056, + "ewc_loss": 0.05028444528579712, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5142222511931323e-05, + "grad_norm": 31.484437942504883, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8659106492996216, + "num_tokens": 799405894.0, + "step": 20948 + }, + { + "epoch": 2.6649281261925966, + "ewc_loss": 0.05018395185470581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5091976567637175e-05, + "grad_norm": 31.446508407592773, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8718290328979492, + "num_tokens": 799439750.0, + "step": 20949 + }, + { + "epoch": 2.6650553364711866, + "ewc_loss": 0.05027832090854645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5139161152765155e-05, + "grad_norm": 31.595565795898438, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8634514808654785, + "num_tokens": 799475170.0, + "step": 20950 + }, + { + "epoch": 2.6651825467497776, + "ewc_loss": 0.05015650391578674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5078252292587422e-05, + "grad_norm": 31.40914535522461, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.8583285212516785, + "num_tokens": 799514231.0, + "step": 20951 + }, + { + "epoch": 2.6653097570283677, + "ewc_loss": 0.05017868056893349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5089340851991437e-05, + "grad_norm": 31.498567581176758, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.871721625328064, + "num_tokens": 799549512.0, + "step": 20952 + }, + { + "epoch": 2.6654369673069587, + "ewc_loss": 0.050293322652578354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5146660846075974e-05, + "grad_norm": 31.510608673095703, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8675752878189087, + "num_tokens": 799592144.0, + "step": 20953 + }, + { + "epoch": 2.6655641775855488, + "ewc_loss": 0.05020073428750038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5100367565755732e-05, + "grad_norm": 31.430912017822266, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8572481870651245, + "num_tokens": 799634779.0, + "step": 20954 + }, + { + "epoch": 2.6656913878641393, + "ewc_loss": 0.05022966116666794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5114830350503325e-05, + "grad_norm": 31.39520263671875, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8831794261932373, + "num_tokens": 799677548.0, + "step": 20955 + }, + { + "epoch": 2.66581859814273, + "ewc_loss": 0.05031416192650795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.515708183636889e-05, + "grad_norm": 31.517507553100586, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8541395664215088, + "num_tokens": 799715679.0, + "step": 20956 + }, + { + "epoch": 2.6659458084213203, + "ewc_loss": 0.05025885999202728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5129429559456185e-05, + "grad_norm": 31.38041877746582, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8642029762268066, + "num_tokens": 799755701.0, + "step": 20957 + }, + { + "epoch": 2.666073018699911, + "ewc_loss": 0.050201985985040665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5100993298110552e-05, + "grad_norm": 31.356996536254883, + "learning_rate": 1e-06, + "loss": 0.4444, + "mean_token_accuracy": 0.8663013577461243, + "num_tokens": 799795151.0, + "step": 20958 + }, + { + "epoch": 2.6662002289785014, + "ewc_loss": 0.050283078104257584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.514153857191559e-05, + "grad_norm": 31.48222541809082, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.877469539642334, + "num_tokens": 799832249.0, + "step": 20959 + }, + { + "epoch": 2.666327439257092, + "ewc_loss": 0.05023573711514473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5117868062807247e-05, + "grad_norm": 31.282493591308594, + "learning_rate": 1e-06, + "loss": 0.4677, + 
"mean_token_accuracy": 0.8509408831596375, + "num_tokens": 799867762.0, + "step": 20960 + }, + { + "epoch": 2.6664546495356825, + "ewc_loss": 0.0502629280090332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.513146318960935e-05, + "grad_norm": 31.62220001220703, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8684385418891907, + "num_tokens": 799909012.0, + "step": 20961 + }, + { + "epoch": 2.666581859814273, + "ewc_loss": 0.05038328841328621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5191644454025663e-05, + "grad_norm": 31.28791046142578, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.8676706552505493, + "num_tokens": 799950316.0, + "step": 20962 + }, + { + "epoch": 2.6667090700928635, + "ewc_loss": 0.05008730664849281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5043653295142576e-05, + "grad_norm": 31.467971801757812, + "learning_rate": 1e-06, + "loss": 0.3787, + "mean_token_accuracy": 0.8859556317329407, + "num_tokens": 799990037.0, + "step": 20963 + }, + { + "epoch": 2.666836280371454, + "ewc_loss": 0.050325196236371994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5162598831229843e-05, + "grad_norm": 31.257308959960938, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.880292534828186, + "num_tokens": 800023935.0, + "step": 20964 + }, + { + "epoch": 2.6669634906500446, + "ewc_loss": 0.05027657374739647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.513828621886205e-05, + "grad_norm": 31.5502986907959, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8770221471786499, + "num_tokens": 800066357.0, + "step": 20965 + }, + { + "epoch": 2.667090700928635, + "ewc_loss": 0.05052090063691139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.526044954720419e-05, + "grad_norm": 31.43506622314453, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8576907515525818, + "num_tokens": 800100667.0, + "step": 20966 + }, + { + "epoch": 
2.6672179112072256, + "ewc_loss": 0.05024672672152519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512336322979536e-05, + "grad_norm": 31.416854858398438, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8758652806282043, + "num_tokens": 800135101.0, + "step": 20967 + }, + { + "epoch": 2.667345121485816, + "ewc_loss": 0.05040670186281204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5203351469826885e-05, + "grad_norm": 31.446691513061523, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8863791227340698, + "num_tokens": 800176308.0, + "step": 20968 + }, + { + "epoch": 2.6674723317644067, + "ewc_loss": 0.05027631297707558, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5138157070614398e-05, + "grad_norm": 31.429372787475586, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8826565742492676, + "num_tokens": 800205585.0, + "step": 20969 + }, + { + "epoch": 2.667599542042997, + "ewc_loss": 0.050374358892440796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518717883503996e-05, + "grad_norm": 31.477867126464844, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8710639476776123, + "num_tokens": 800241400.0, + "step": 20970 + }, + { + "epoch": 2.6677267523215877, + "ewc_loss": 0.05041283741593361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5206418285961263e-05, + "grad_norm": 31.54486083984375, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8818141222000122, + "num_tokens": 800279060.0, + "step": 20971 + }, + { + "epoch": 2.6678539626001783, + "ewc_loss": 0.050325971096754074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.51629862759728e-05, + "grad_norm": 31.437286376953125, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8804422616958618, + "num_tokens": 800318606.0, + "step": 20972 + }, + { + "epoch": 2.6679811728787683, + "ewc_loss": 0.050295062363147736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.514753032301087e-05, + "grad_norm": 31.435535430908203, + "learning_rate": 1e-06, + "loss": 0.4223, + "mean_token_accuracy": 0.8713544607162476, + "num_tokens": 800356795.0, + "step": 20973 + }, + { + "epoch": 2.6681083831573593, + "ewc_loss": 0.05035121366381645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.51756064244546e-05, + "grad_norm": 31.40479278564453, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8792130351066589, + "num_tokens": 800393786.0, + "step": 20974 + }, + { + "epoch": 2.6682355934359494, + "ewc_loss": 0.050330132246017456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.516506538086105e-05, + "grad_norm": 31.452184677124023, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8704562783241272, + "num_tokens": 800440153.0, + "step": 20975 + }, + { + "epoch": 2.6683628037145404, + "ewc_loss": 0.05031251534819603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5156257834169082e-05, + "grad_norm": 31.40871238708496, + "learning_rate": 1e-06, + "loss": 0.48, + "mean_token_accuracy": 0.8532772064208984, + "num_tokens": 800476343.0, + "step": 20976 + }, + { + "epoch": 2.6684900139931305, + "ewc_loss": 0.050401750952005386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520087582524866e-05, + "grad_norm": 31.41135025024414, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.884406328201294, + "num_tokens": 800514227.0, + "step": 20977 + }, + { + "epoch": 2.668617224271721, + "ewc_loss": 0.0502704419195652, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5135221221717075e-05, + "grad_norm": 31.451242446899414, + "learning_rate": 1e-06, + "loss": 0.4255, + "mean_token_accuracy": 0.8704240322113037, + "num_tokens": 800552357.0, + "step": 20978 + }, + { + "epoch": 2.6687444345503115, + "ewc_loss": 0.050433263182640076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5216631911462173e-05, + "grad_norm": 31.415287017822266, + "learning_rate": 1e-06, + "loss": 0.3895, + 
"mean_token_accuracy": 0.8804265260696411, + "num_tokens": 800587624.0, + "step": 20979 + }, + { + "epoch": 2.668871644828902, + "ewc_loss": 0.050377052277326584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5188526706187986e-05, + "grad_norm": 31.547000885009766, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8657165765762329, + "num_tokens": 800631183.0, + "step": 20980 + }, + { + "epoch": 2.6689988551074926, + "ewc_loss": 0.05034882202744484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517441134841647e-05, + "grad_norm": 31.47357940673828, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.876084566116333, + "num_tokens": 800670334.0, + "step": 20981 + }, + { + "epoch": 2.669126065386083, + "ewc_loss": 0.05026771500706673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5133856979664415e-05, + "grad_norm": 31.427331924438477, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8681442737579346, + "num_tokens": 800703500.0, + "step": 20982 + }, + { + "epoch": 2.6692532756646736, + "ewc_loss": 0.05029760301113129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5148801796603948e-05, + "grad_norm": 31.372835159301758, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8648009896278381, + "num_tokens": 800742904.0, + "step": 20983 + }, + { + "epoch": 2.669380485943264, + "ewc_loss": 0.05037881061434746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.51894052780699e-05, + "grad_norm": 31.51205825805664, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8686436414718628, + "num_tokens": 800777556.0, + "step": 20984 + }, + { + "epoch": 2.6695076962218547, + "ewc_loss": 0.050382357090711594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5191178792738356e-05, + "grad_norm": 31.417469024658203, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8624700903892517, + "num_tokens": 800817532.0, + "step": 20985 + }, + { + "epoch": 
2.669634906500445, + "ewc_loss": 0.05023142695426941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.511571437935345e-05, + "grad_norm": 31.407381057739258, + "learning_rate": 1e-06, + "loss": 0.3861, + "mean_token_accuracy": 0.882124125957489, + "num_tokens": 800856567.0, + "step": 20986 + }, + { + "epoch": 2.6697621167790357, + "ewc_loss": 0.05036107823252678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5180539523717016e-05, + "grad_norm": 31.434560775756836, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8495625257492065, + "num_tokens": 800901234.0, + "step": 20987 + }, + { + "epoch": 2.6698893270576263, + "ewc_loss": 0.050297196954488754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.514859806979075e-05, + "grad_norm": 31.49286651611328, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8693939447402954, + "num_tokens": 800942647.0, + "step": 20988 + }, + { + "epoch": 2.670016537336217, + "ewc_loss": 0.050285886973142624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5142942831735127e-05, + "grad_norm": 31.266639709472656, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8795426487922668, + "num_tokens": 800980194.0, + "step": 20989 + }, + { + "epoch": 2.6701437476148073, + "ewc_loss": 0.050326526165008545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5163262762362137e-05, + "grad_norm": 31.506385803222656, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8763632774353027, + "num_tokens": 801016978.0, + "step": 20990 + }, + { + "epoch": 2.670270957893398, + "ewc_loss": 0.050411663949489594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520583257137332e-05, + "grad_norm": 31.380098342895508, + "learning_rate": 1e-06, + "loss": 0.4515, + "mean_token_accuracy": 0.8595962524414062, + "num_tokens": 801051921.0, + "step": 20991 + }, + { + "epoch": 2.6703981681719884, + "ewc_loss": 0.05030020326375961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5150102374027483e-05, + "grad_norm": 31.468202590942383, + "learning_rate": 1e-06, + "loss": 0.4863, + "mean_token_accuracy": 0.8466165065765381, + "num_tokens": 801087189.0, + "step": 20992 + }, + { + "epoch": 2.670525378450579, + "ewc_loss": 0.050397638231515884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5198818548233248e-05, + "grad_norm": 31.43231201171875, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8758894801139832, + "num_tokens": 801121574.0, + "step": 20993 + }, + { + "epoch": 2.6706525887291694, + "ewc_loss": 0.05029679089784622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5148396161966957e-05, + "grad_norm": 31.546295166015625, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8764246702194214, + "num_tokens": 801160497.0, + "step": 20994 + }, + { + "epoch": 2.67077979900776, + "ewc_loss": 0.050362683832645416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518134169804398e-05, + "grad_norm": 31.418596267700195, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8725816011428833, + "num_tokens": 801198556.0, + "step": 20995 + }, + { + "epoch": 2.6709070092863505, + "ewc_loss": 0.05035417899489403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517708890081849e-05, + "grad_norm": 31.455839157104492, + "learning_rate": 1e-06, + "loss": 0.3376, + "mean_token_accuracy": 0.8970109224319458, + "num_tokens": 801230616.0, + "step": 20996 + }, + { + "epoch": 2.671034219564941, + "ewc_loss": 0.050331078469753265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5165540137095377e-05, + "grad_norm": 31.367488861083984, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8703376054763794, + "num_tokens": 801266582.0, + "step": 20997 + }, + { + "epoch": 2.671161429843531, + "ewc_loss": 0.05045975744724274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5229877792298794e-05, + "grad_norm": 31.507368087768555, + "learning_rate": 1e-06, + "loss": 0.3903, + 
"mean_token_accuracy": 0.8838365077972412, + "num_tokens": 801305127.0, + "step": 20998 + }, + { + "epoch": 2.671288640122122, + "ewc_loss": 0.050476327538490295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5238163289031945e-05, + "grad_norm": 31.395893096923828, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8758949637413025, + "num_tokens": 801348284.0, + "step": 20999 + }, + { + "epoch": 2.671415850400712, + "ewc_loss": 0.05037323758006096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5186618586303666e-05, + "grad_norm": 31.399736404418945, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.8910820484161377, + "num_tokens": 801385298.0, + "step": 21000 + }, + { + "epoch": 2.671543060679303, + "ewc_loss": 0.05034668743610382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517334360163659e-05, + "grad_norm": 31.43819808959961, + "learning_rate": 1e-06, + "loss": 0.4462, + "mean_token_accuracy": 0.8638506531715393, + "num_tokens": 801417020.0, + "step": 21001 + }, + { + "epoch": 2.6716702709578932, + "ewc_loss": 0.05036163330078125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5180816010106355e-05, + "grad_norm": 31.289226531982422, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8648982048034668, + "num_tokens": 801455328.0, + "step": 21002 + }, + { + "epoch": 2.6717974812364838, + "ewc_loss": 0.05045892670750618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5229463062714785e-05, + "grad_norm": 31.48360824584961, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8812881708145142, + "num_tokens": 801494342.0, + "step": 21003 + }, + { + "epoch": 2.6719246915150743, + "ewc_loss": 0.05044485628604889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522242721170187e-05, + "grad_norm": 31.425081253051758, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.873425304889679, + "num_tokens": 801528182.0, + "step": 21004 + }, + { + "epoch": 
2.672051901793665, + "ewc_loss": 0.050411157310009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5205577912856825e-05, + "grad_norm": 31.53494644165039, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8719311356544495, + "num_tokens": 801569424.0, + "step": 21005 + }, + { + "epoch": 2.6721791120722553, + "ewc_loss": 0.05029307305812836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5146537154796533e-05, + "grad_norm": 31.35307502746582, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8697602152824402, + "num_tokens": 801608809.0, + "step": 21006 + }, + { + "epoch": 2.672306322350846, + "ewc_loss": 0.05030667036771774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5153334718197584e-05, + "grad_norm": 31.391483306884766, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8633447885513306, + "num_tokens": 801651910.0, + "step": 21007 + }, + { + "epoch": 2.6724335326294364, + "ewc_loss": 0.05037207156419754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518603650969453e-05, + "grad_norm": 31.433393478393555, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8703036904335022, + "num_tokens": 801692961.0, + "step": 21008 + }, + { + "epoch": 2.672560742908027, + "ewc_loss": 0.05036722496151924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5183611796819605e-05, + "grad_norm": 31.4062557220459, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8598232269287109, + "num_tokens": 801731278.0, + "step": 21009 + }, + { + "epoch": 2.6726879531866174, + "ewc_loss": 0.05035960674285889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517980283300858e-05, + "grad_norm": 31.50636100769043, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8678778409957886, + "num_tokens": 801769562.0, + "step": 21010 + }, + { + "epoch": 2.672815163465208, + "ewc_loss": 0.05033500865101814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5167504645651206e-05, + "grad_norm": 31.451513290405273, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8682839870452881, + "num_tokens": 801811046.0, + "step": 21011 + }, + { + "epoch": 2.6729423737437985, + "ewc_loss": 0.05035192146897316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517596112738829e-05, + "grad_norm": 31.544267654418945, + "learning_rate": 1e-06, + "loss": 0.4856, + "mean_token_accuracy": 0.8497694134712219, + "num_tokens": 801847833.0, + "step": 21012 + }, + { + "epoch": 2.673069584022389, + "ewc_loss": 0.0502745546400547, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.513727667974308e-05, + "grad_norm": 31.545656204223633, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8658978939056396, + "num_tokens": 801887819.0, + "step": 21013 + }, + { + "epoch": 2.6731967943009796, + "ewc_loss": 0.050398532301187515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519926601962652e-05, + "grad_norm": 31.667524337768555, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8829799294471741, + "num_tokens": 801922535.0, + "step": 21014 + }, + { + "epoch": 2.67332400457957, + "ewc_loss": 0.05020933225750923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.510466583771631e-05, + "grad_norm": 31.45592498779297, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8762902021408081, + "num_tokens": 801958025.0, + "step": 21015 + }, + { + "epoch": 2.6734512148581606, + "ewc_loss": 0.0502513088285923, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5125655156443827e-05, + "grad_norm": 31.506683349609375, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8760803937911987, + "num_tokens": 801991291.0, + "step": 21016 + }, + { + "epoch": 2.673578425136751, + "ewc_loss": 0.05021507292985916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5107536202995107e-05, + "grad_norm": 31.482999801635742, + "learning_rate": 1e-06, + "loss": 0.4142, + 
"mean_token_accuracy": 0.8733212947845459, + "num_tokens": 802024952.0, + "step": 21017 + }, + { + "epoch": 2.6737056354153417, + "ewc_loss": 0.05023382604122162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5116913093370385e-05, + "grad_norm": 31.479555130004883, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8708889484405518, + "num_tokens": 802065741.0, + "step": 21018 + }, + { + "epoch": 2.673832845693932, + "ewc_loss": 0.05033072456717491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.516536187613383e-05, + "grad_norm": 31.65757942199707, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8702341318130493, + "num_tokens": 802107153.0, + "step": 21019 + }, + { + "epoch": 2.6739600559725227, + "ewc_loss": 0.05024740844964981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5123703380813822e-05, + "grad_norm": 31.57103729248047, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.876293957233429, + "num_tokens": 802148177.0, + "step": 21020 + }, + { + "epoch": 2.674087266251113, + "ewc_loss": 0.0502450093626976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5122504666796885e-05, + "grad_norm": 31.474811553955078, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8729519248008728, + "num_tokens": 802186144.0, + "step": 21021 + }, + { + "epoch": 2.674214476529704, + "ewc_loss": 0.05019281432032585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5096407625824213e-05, + "grad_norm": 31.546417236328125, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8847546577453613, + "num_tokens": 802224054.0, + "step": 21022 + }, + { + "epoch": 2.674341686808294, + "ewc_loss": 0.050196509808301926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5098255719058216e-05, + "grad_norm": 31.412954330444336, + "learning_rate": 1e-06, + "loss": 0.4758, + "mean_token_accuracy": 0.8554543256759644, + "num_tokens": 802264483.0, + "step": 21023 + }, + { + "epoch": 
2.674468897086885, + "ewc_loss": 0.05019071698188782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5095358068938367e-05, + "grad_norm": 31.51158332824707, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8793320655822754, + "num_tokens": 802302933.0, + "step": 21024 + }, + { + "epoch": 2.674596107365475, + "ewc_loss": 0.05023355409502983, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.511677666916512e-05, + "grad_norm": 31.36156463623047, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8652477860450745, + "num_tokens": 802347354.0, + "step": 21025 + }, + { + "epoch": 2.674723317644066, + "ewc_loss": 0.05026153102517128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5130764697678387e-05, + "grad_norm": 31.521251678466797, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8804357647895813, + "num_tokens": 802392159.0, + "step": 21026 + }, + { + "epoch": 2.674850527922656, + "ewc_loss": 0.05024971812963486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5124858439085074e-05, + "grad_norm": 31.52992820739746, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8752087950706482, + "num_tokens": 802432629.0, + "step": 21027 + }, + { + "epoch": 2.6749777382012465, + "ewc_loss": 0.05010146275162697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5050730982911773e-05, + "grad_norm": 31.392322540283203, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8801612257957458, + "num_tokens": 802471776.0, + "step": 21028 + }, + { + "epoch": 2.675104948479837, + "ewc_loss": 0.05016375705599785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5081879357458092e-05, + "grad_norm": 31.5295352935791, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8650498390197754, + "num_tokens": 802507314.0, + "step": 21029 + }, + { + "epoch": 2.6752321587584276, + "ewc_loss": 0.05025672912597656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.512836545065511e-05, + "grad_norm": 31.630979537963867, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8793485760688782, + "num_tokens": 802550587.0, + "step": 21030 + }, + { + "epoch": 2.675359369037018, + "ewc_loss": 0.050114311277866364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5057155653485097e-05, + "grad_norm": 31.296287536621094, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8719715476036072, + "num_tokens": 802591544.0, + "step": 21031 + }, + { + "epoch": 2.6754865793156086, + "ewc_loss": 0.050138890743255615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5069444745895453e-05, + "grad_norm": 31.454526901245117, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8792821168899536, + "num_tokens": 802627336.0, + "step": 21032 + }, + { + "epoch": 2.675613789594199, + "ewc_loss": 0.05033149570226669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.516574750188738e-05, + "grad_norm": 31.487783432006836, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8702875375747681, + "num_tokens": 802668358.0, + "step": 21033 + }, + { + "epoch": 2.6757409998727897, + "ewc_loss": 0.050095848739147186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5047924282262102e-05, + "grad_norm": 31.37347984313965, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8791090846061707, + "num_tokens": 802708023.0, + "step": 21034 + }, + { + "epoch": 2.67586821015138, + "ewc_loss": 0.05013573169708252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5067865863093175e-05, + "grad_norm": 31.29815673828125, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8932744264602661, + "num_tokens": 802745856.0, + "step": 21035 + }, + { + "epoch": 2.6759954204299707, + "ewc_loss": 0.05020364001393318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5101819119299762e-05, + "grad_norm": 31.39272117614746, + "learning_rate": 1e-06, + "loss": 0.3895, + 
"mean_token_accuracy": 0.8838480710983276, + "num_tokens": 802779032.0, + "step": 21036 + }, + { + "epoch": 2.6761226307085613, + "ewc_loss": 0.050169337540864944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5084667868213728e-05, + "grad_norm": 31.29739761352539, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8724570274353027, + "num_tokens": 802815736.0, + "step": 21037 + }, + { + "epoch": 2.676249840987152, + "ewc_loss": 0.050367388874292374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5183693651342764e-05, + "grad_norm": 31.61839485168457, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8680679798126221, + "num_tokens": 802854133.0, + "step": 21038 + }, + { + "epoch": 2.6763770512657423, + "ewc_loss": 0.050284188240766525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.514209336368367e-05, + "grad_norm": 31.30472755432129, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8789076209068298, + "num_tokens": 802892969.0, + "step": 21039 + }, + { + "epoch": 2.676504261544333, + "ewc_loss": 0.05019502714276314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509751357138157e-05, + "grad_norm": 31.479923248291016, + "learning_rate": 1e-06, + "loss": 0.3744, + "mean_token_accuracy": 0.8864232301712036, + "num_tokens": 802936998.0, + "step": 21040 + }, + { + "epoch": 2.6766314718229234, + "ewc_loss": 0.050326988101005554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5163493774016388e-05, + "grad_norm": 31.476299285888672, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8806793689727783, + "num_tokens": 802979040.0, + "step": 21041 + }, + { + "epoch": 2.676758682101514, + "ewc_loss": 0.05031518265604973, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.515759115340188e-05, + "grad_norm": 31.578317642211914, + "learning_rate": 1e-06, + "loss": 0.3856, + "mean_token_accuracy": 0.8824417591094971, + "num_tokens": 803016516.0, + "step": 21042 + }, + { + "epoch": 
2.6768858923801044, + "ewc_loss": 0.05030638724565506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5153192837024108e-05, + "grad_norm": 31.458356857299805, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8704733848571777, + "num_tokens": 803055566.0, + "step": 21043 + }, + { + "epoch": 2.677013102658695, + "ewc_loss": 0.05017625540494919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5088127586059272e-05, + "grad_norm": 31.30849838256836, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8741921186447144, + "num_tokens": 803093683.0, + "step": 21044 + }, + { + "epoch": 2.6771403129372855, + "ewc_loss": 0.05025377869606018, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512688843125943e-05, + "grad_norm": 31.480607986450195, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8813360929489136, + "num_tokens": 803133691.0, + "step": 21045 + }, + { + "epoch": 2.6772675232158756, + "ewc_loss": 0.05027075111865997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5135375835816376e-05, + "grad_norm": 31.44950294494629, + "learning_rate": 1e-06, + "loss": 0.3795, + "mean_token_accuracy": 0.8807607293128967, + "num_tokens": 803169870.0, + "step": 21046 + }, + { + "epoch": 2.6773947334944665, + "ewc_loss": 0.050304003059864044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5152001398964785e-05, + "grad_norm": 31.449981689453125, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8651230335235596, + "num_tokens": 803209038.0, + "step": 21047 + }, + { + "epoch": 2.6775219437730566, + "ewc_loss": 0.05018530413508415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.509265141270589e-05, + "grad_norm": 31.416074752807617, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8581699728965759, + "num_tokens": 803239725.0, + "step": 21048 + }, + { + "epoch": 2.6776491540516476, + "ewc_loss": 0.050334978848695755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5167490093735978e-05, + "grad_norm": 31.402217864990234, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8715841174125671, + "num_tokens": 803273580.0, + "step": 21049 + }, + { + "epoch": 2.6777763643302377, + "ewc_loss": 0.05033605918288231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.516803033358883e-05, + "grad_norm": 31.476879119873047, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8779808282852173, + "num_tokens": 803313583.0, + "step": 21050 + }, + { + "epoch": 2.6779035746088287, + "ewc_loss": 0.05040259659290314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520129783079028e-05, + "grad_norm": 31.515522003173828, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8795731067657471, + "num_tokens": 803347517.0, + "step": 21051 + }, + { + "epoch": 2.6780307848874187, + "ewc_loss": 0.05030939728021622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5154698960250244e-05, + "grad_norm": 31.403383255004883, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8594779372215271, + "num_tokens": 803388183.0, + "step": 21052 + }, + { + "epoch": 2.6781579951660093, + "ewc_loss": 0.05027176812291145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5135883333859965e-05, + "grad_norm": 31.42304801940918, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.873214840888977, + "num_tokens": 803428377.0, + "step": 21053 + }, + { + "epoch": 2.6782852054446, + "ewc_loss": 0.05039718374609947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5198591174557805e-05, + "grad_norm": 31.47150421142578, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8809728026390076, + "num_tokens": 803460930.0, + "step": 21054 + }, + { + "epoch": 2.6784124157231903, + "ewc_loss": 0.05039786174893379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5198931325576268e-05, + "grad_norm": 31.51106834411621, + "learning_rate": 1e-06, + "loss": 0.3927, + 
"mean_token_accuracy": 0.8810580968856812, + "num_tokens": 803497971.0, + "step": 21055 + }, + { + "epoch": 2.678539626001781, + "ewc_loss": 0.05038604885339737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5193025066982955e-05, + "grad_norm": 31.462427139282227, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.887359619140625, + "num_tokens": 803535514.0, + "step": 21056 + }, + { + "epoch": 2.6786668362803714, + "ewc_loss": 0.05040387809276581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5201939934049733e-05, + "grad_norm": 31.466327667236328, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8693322539329529, + "num_tokens": 803580767.0, + "step": 21057 + }, + { + "epoch": 2.678794046558962, + "ewc_loss": 0.050386060029268265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5193030523951165e-05, + "grad_norm": 31.482772827148438, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8755114078521729, + "num_tokens": 803619168.0, + "step": 21058 + }, + { + "epoch": 2.6789212568375524, + "ewc_loss": 0.05043372884392738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5216864742105827e-05, + "grad_norm": 31.364967346191406, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8589937090873718, + "num_tokens": 803654785.0, + "step": 21059 + }, + { + "epoch": 2.679048467116143, + "ewc_loss": 0.05034099146723747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5170495064230636e-05, + "grad_norm": 31.3770694732666, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.881095290184021, + "num_tokens": 803692505.0, + "step": 21060 + }, + { + "epoch": 2.6791756773947335, + "ewc_loss": 0.05049033463001251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5245166398235597e-05, + "grad_norm": 31.462158203125, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8802606463432312, + "num_tokens": 803732592.0, + "step": 21061 + }, + { + "epoch": 
2.679302887673324, + "ewc_loss": 0.05048521235585213, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5242605261155404e-05, + "grad_norm": 31.539199829101562, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8609435558319092, + "num_tokens": 803767601.0, + "step": 21062 + }, + { + "epoch": 2.6794300979519146, + "ewc_loss": 0.050388023257255554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5194010959239677e-05, + "grad_norm": 31.454519271850586, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8662095069885254, + "num_tokens": 803804329.0, + "step": 21063 + }, + { + "epoch": 2.679557308230505, + "ewc_loss": 0.050340041518211365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517002030799631e-05, + "grad_norm": 31.47542953491211, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8681367635726929, + "num_tokens": 803846182.0, + "step": 21064 + }, + { + "epoch": 2.6796845185090956, + "ewc_loss": 0.050447724759578705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5223862394341268e-05, + "grad_norm": 31.553091049194336, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8683579564094543, + "num_tokens": 803883561.0, + "step": 21065 + }, + { + "epoch": 2.679811728787686, + "ewc_loss": 0.05050624907016754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.525312447687611e-05, + "grad_norm": 31.603702545166016, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8908213376998901, + "num_tokens": 803916860.0, + "step": 21066 + }, + { + "epoch": 2.6799389390662767, + "ewc_loss": 0.050328634679317474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5164317776216194e-05, + "grad_norm": 31.565696716308594, + "learning_rate": 1e-06, + "loss": 0.4106, + "mean_token_accuracy": 0.87246173620224, + "num_tokens": 803951949.0, + "step": 21067 + }, + { + "epoch": 2.680066149344867, + "ewc_loss": 0.05023902654647827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5119512429228052e-05, + "grad_norm": 31.2834415435791, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8688897490501404, + "num_tokens": 803992094.0, + "step": 21068 + }, + { + "epoch": 2.6801933596234577, + "ewc_loss": 0.05031759664416313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5158798962365836e-05, + "grad_norm": 31.461029052734375, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8731077313423157, + "num_tokens": 804034058.0, + "step": 21069 + }, + { + "epoch": 2.6803205699020483, + "ewc_loss": 0.05050253868103027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5251269107684493e-05, + "grad_norm": 31.523408889770508, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8650370836257935, + "num_tokens": 804072722.0, + "step": 21070 + }, + { + "epoch": 2.6804477801806383, + "ewc_loss": 0.050247922539711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5123961677309126e-05, + "grad_norm": 31.23728370666504, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8796481490135193, + "num_tokens": 804114253.0, + "step": 21071 + }, + { + "epoch": 2.6805749904592293, + "ewc_loss": 0.050402261316776276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5201130483765155e-05, + "grad_norm": 31.602243423461914, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8749308586120605, + "num_tokens": 804154877.0, + "step": 21072 + }, + { + "epoch": 2.6807022007378194, + "ewc_loss": 0.05046933889389038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5234669010387734e-05, + "grad_norm": 31.4943790435791, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8674705624580383, + "num_tokens": 804191060.0, + "step": 21073 + }, + { + "epoch": 2.6808294110164104, + "ewc_loss": 0.050375837832689285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.51879191637272e-05, + "grad_norm": 31.551610946655273, + "learning_rate": 1e-06, + "loss": 0.4327, + 
"mean_token_accuracy": 0.8651187419891357, + "num_tokens": 804231574.0, + "step": 21074 + }, + { + "epoch": 2.6809566212950005, + "ewc_loss": 0.050378162413835526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5189081497956067e-05, + "grad_norm": 31.37930679321289, + "learning_rate": 1e-06, + "loss": 0.4797, + "mean_token_accuracy": 0.850711464881897, + "num_tokens": 804264027.0, + "step": 21075 + }, + { + "epoch": 2.681083831573591, + "ewc_loss": 0.050403743982315063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5201872631441802e-05, + "grad_norm": 31.591936111450195, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8735719323158264, + "num_tokens": 804302906.0, + "step": 21076 + }, + { + "epoch": 2.6812110418521815, + "ewc_loss": 0.050423119217157364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5211558750015683e-05, + "grad_norm": 31.345857620239258, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.869029700756073, + "num_tokens": 804339107.0, + "step": 21077 + }, + { + "epoch": 2.681338252130772, + "ewc_loss": 0.050307996571063995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.515399864932988e-05, + "grad_norm": 31.450302124023438, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.877680778503418, + "num_tokens": 804374292.0, + "step": 21078 + }, + { + "epoch": 2.6814654624093626, + "ewc_loss": 0.050396714359521866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5198356524924748e-05, + "grad_norm": 31.43136215209961, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8742491006851196, + "num_tokens": 804420287.0, + "step": 21079 + }, + { + "epoch": 2.681592672687953, + "ewc_loss": 0.05040217190980911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520108682801947e-05, + "grad_norm": 31.477624893188477, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8781906366348267, + "num_tokens": 804453977.0, + "step": 21080 + }, + { + "epoch": 
2.6817198829665436, + "ewc_loss": 0.05039304122328758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519652116461657e-05, + "grad_norm": 31.420053482055664, + "learning_rate": 1e-06, + "loss": 0.4794, + "mean_token_accuracy": 0.8566942811012268, + "num_tokens": 804494958.0, + "step": 21081 + }, + { + "epoch": 2.681847093245134, + "ewc_loss": 0.05039219185709953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519609552109614e-05, + "grad_norm": 31.468191146850586, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8535556197166443, + "num_tokens": 804532209.0, + "step": 21082 + }, + { + "epoch": 2.6819743035237247, + "ewc_loss": 0.050410859286785126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5205428755725734e-05, + "grad_norm": 31.49751091003418, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8749561905860901, + "num_tokens": 804572937.0, + "step": 21083 + }, + { + "epoch": 2.682101513802315, + "ewc_loss": 0.05041785165667534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5208926672348753e-05, + "grad_norm": 31.61477279663086, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.8723390698432922, + "num_tokens": 804610263.0, + "step": 21084 + }, + { + "epoch": 2.6822287240809057, + "ewc_loss": 0.05030360072851181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.515179949114099e-05, + "grad_norm": 31.355186462402344, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8758261203765869, + "num_tokens": 804645516.0, + "step": 21085 + }, + { + "epoch": 2.6823559343594963, + "ewc_loss": 0.05036187544465065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5180937882396393e-05, + "grad_norm": 31.444875717163086, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8832300901412964, + "num_tokens": 804684540.0, + "step": 21086 + }, + { + "epoch": 2.682483144638087, + "ewc_loss": 0.050433337688446045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5216668291250244e-05, + "grad_norm": 31.560712814331055, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8742408752441406, + "num_tokens": 804723030.0, + "step": 21087 + }, + { + "epoch": 2.6826103549166773, + "ewc_loss": 0.05039685219526291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5198425646522082e-05, + "grad_norm": 31.545522689819336, + "learning_rate": 1e-06, + "loss": 0.3845, + "mean_token_accuracy": 0.8813697099685669, + "num_tokens": 804759202.0, + "step": 21088 + }, + { + "epoch": 2.682737565195268, + "ewc_loss": 0.05036330223083496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5181650926242582e-05, + "grad_norm": 31.485742568969727, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.868506908416748, + "num_tokens": 804803199.0, + "step": 21089 + }, + { + "epoch": 2.6828647754738584, + "ewc_loss": 0.050330501049757004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5165250917780213e-05, + "grad_norm": 31.371339797973633, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8707005977630615, + "num_tokens": 804835675.0, + "step": 21090 + }, + { + "epoch": 2.682991985752449, + "ewc_loss": 0.050287600606679916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5143799575744197e-05, + "grad_norm": 31.49772834777832, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8592652082443237, + "num_tokens": 804876583.0, + "step": 21091 + }, + { + "epoch": 2.6831191960310394, + "ewc_loss": 0.05041957274079323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520978705433663e-05, + "grad_norm": 31.524425506591797, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8631545305252075, + "num_tokens": 804906110.0, + "step": 21092 + }, + { + "epoch": 2.68324640630963, + "ewc_loss": 0.050446365028619766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522318209230434e-05, + "grad_norm": 31.59125328063965, + "learning_rate": 1e-06, + "loss": 0.4428, + 
"mean_token_accuracy": 0.8631709814071655, + "num_tokens": 804942196.0, + "step": 21093 + }, + { + "epoch": 2.6833736165882205, + "ewc_loss": 0.05033417046070099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5167084459098987e-05, + "grad_norm": 31.490188598632812, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8728317022323608, + "num_tokens": 804978760.0, + "step": 21094 + }, + { + "epoch": 2.683500826866811, + "ewc_loss": 0.050335247069597244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5167622879962437e-05, + "grad_norm": 31.57559585571289, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8644466400146484, + "num_tokens": 805014212.0, + "step": 21095 + }, + { + "epoch": 2.683628037145401, + "ewc_loss": 0.05039871111512184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5199355150107294e-05, + "grad_norm": 31.493431091308594, + "learning_rate": 1e-06, + "loss": 0.3462, + "mean_token_accuracy": 0.8943867683410645, + "num_tokens": 805050528.0, + "step": 21096 + }, + { + "epoch": 2.683755247423992, + "ewc_loss": 0.05031634494662285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5158173230011016e-05, + "grad_norm": 31.470029830932617, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8724241256713867, + "num_tokens": 805087355.0, + "step": 21097 + }, + { + "epoch": 2.683882457702582, + "ewc_loss": 0.0503896027803421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519480221963022e-05, + "grad_norm": 31.429115295410156, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8582120537757874, + "num_tokens": 805117837.0, + "step": 21098 + }, + { + "epoch": 2.684009667981173, + "ewc_loss": 0.05034232139587402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5171160814352334e-05, + "grad_norm": 31.420381546020508, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8715344071388245, + "num_tokens": 805157734.0, + "step": 21099 + }, + { + "epoch": 
2.684136878259763, + "ewc_loss": 0.05045773833990097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522887007216923e-05, + "grad_norm": 31.725343704223633, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8794006109237671, + "num_tokens": 805190657.0, + "step": 21100 + }, + { + "epoch": 2.6842640885383537, + "ewc_loss": 0.05043819546699524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.521909846109338e-05, + "grad_norm": 31.465383529663086, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.8657118082046509, + "num_tokens": 805229654.0, + "step": 21101 + }, + { + "epoch": 2.6843912988169443, + "ewc_loss": 0.0503813698887825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519068402762059e-05, + "grad_norm": 31.544239044189453, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8721704483032227, + "num_tokens": 805269199.0, + "step": 21102 + }, + { + "epoch": 2.684518509095535, + "ewc_loss": 0.05047666281461716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523833063605707e-05, + "grad_norm": 31.557113647460938, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8670026063919067, + "num_tokens": 805308918.0, + "step": 21103 + }, + { + "epoch": 2.6846457193741253, + "ewc_loss": 0.05041184276342392, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5205921701854095e-05, + "grad_norm": 31.561885833740234, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8680473566055298, + "num_tokens": 805345014.0, + "step": 21104 + }, + { + "epoch": 2.684772929652716, + "ewc_loss": 0.050470516085624695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5235258362954482e-05, + "grad_norm": 31.56987190246582, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.8845129609107971, + "num_tokens": 805383674.0, + "step": 21105 + }, + { + "epoch": 2.6849001399313064, + "ewc_loss": 0.0503510944545269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5175546397804283e-05, + "grad_norm": 31.522008895874023, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8758944272994995, + "num_tokens": 805419765.0, + "step": 21106 + }, + { + "epoch": 2.685027350209897, + "ewc_loss": 0.05037249997258186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5186249331454746e-05, + "grad_norm": 31.39507293701172, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8728475570678711, + "num_tokens": 805461346.0, + "step": 21107 + }, + { + "epoch": 2.6851545604884874, + "ewc_loss": 0.050420306622982025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5210152671206743e-05, + "grad_norm": 31.523956298828125, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8788130879402161, + "num_tokens": 805494519.0, + "step": 21108 + }, + { + "epoch": 2.685281770767078, + "ewc_loss": 0.05044330283999443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5221650503226556e-05, + "grad_norm": 31.40426254272461, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8622239828109741, + "num_tokens": 805530542.0, + "step": 21109 + }, + { + "epoch": 2.6854089810456685, + "ewc_loss": 0.05040983483195305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5204917619703338e-05, + "grad_norm": 31.48837661743164, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8703738451004028, + "num_tokens": 805564531.0, + "step": 21110 + }, + { + "epoch": 2.685536191324259, + "ewc_loss": 0.050551559776067734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5275779989897273e-05, + "grad_norm": 31.58774757385254, + "learning_rate": 1e-06, + "loss": 0.4296, + "mean_token_accuracy": 0.868874192237854, + "num_tokens": 805606011.0, + "step": 21111 + }, + { + "epoch": 2.6856634016028496, + "ewc_loss": 0.05038928613066673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519464396755211e-05, + "grad_norm": 31.39358901977539, + "learning_rate": 1e-06, + "loss": 0.4436, + 
"mean_token_accuracy": 0.8661277294158936, + "num_tokens": 805643078.0, + "step": 21112 + }, + { + "epoch": 2.68579061188144, + "ewc_loss": 0.05044521763920784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5222609110642225e-05, + "grad_norm": 31.58190155029297, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8747401237487793, + "num_tokens": 805678554.0, + "step": 21113 + }, + { + "epoch": 2.6859178221600306, + "ewc_loss": 0.050511155277490616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5255578293581493e-05, + "grad_norm": 31.3837947845459, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8819149732589722, + "num_tokens": 805713736.0, + "step": 21114 + }, + { + "epoch": 2.686045032438621, + "ewc_loss": 0.050459492951631546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522974682506174e-05, + "grad_norm": 31.673879623413086, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8665857315063477, + "num_tokens": 805751798.0, + "step": 21115 + }, + { + "epoch": 2.6861722427172117, + "ewc_loss": 0.050529707223176956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5264853320550174e-05, + "grad_norm": 31.446128845214844, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8789184093475342, + "num_tokens": 805789205.0, + "step": 21116 + }, + { + "epoch": 2.686299452995802, + "ewc_loss": 0.05038721486926079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519360714359209e-05, + "grad_norm": 31.489206314086914, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8753160834312439, + "num_tokens": 805824910.0, + "step": 21117 + }, + { + "epoch": 2.6864266632743927, + "ewc_loss": 0.050468385219573975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5234192435164005e-05, + "grad_norm": 31.53834342956543, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8695437908172607, + "num_tokens": 805861335.0, + "step": 21118 + }, + { + "epoch": 
2.686553873552983, + "ewc_loss": 0.05047384649515152, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5236922738258727e-05, + "grad_norm": 31.59406089782715, + "learning_rate": 1e-06, + "loss": 0.4254, + "mean_token_accuracy": 0.868338942527771, + "num_tokens": 805901488.0, + "step": 21119 + }, + { + "epoch": 2.686681083831574, + "ewc_loss": 0.050484102219343185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5242050469387323e-05, + "grad_norm": 31.541181564331055, + "learning_rate": 1e-06, + "loss": 0.5285, + "mean_token_accuracy": 0.835667073726654, + "num_tokens": 805941336.0, + "step": 21120 + }, + { + "epoch": 2.686808294110164, + "ewc_loss": 0.05041662976145744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520831549190916e-05, + "grad_norm": 31.599546432495117, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8796303272247314, + "num_tokens": 805974222.0, + "step": 21121 + }, + { + "epoch": 2.686935504388755, + "ewc_loss": 0.050427258014678955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.521362875995692e-05, + "grad_norm": 31.44902229309082, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8675094842910767, + "num_tokens": 806014942.0, + "step": 21122 + }, + { + "epoch": 2.687062714667345, + "ewc_loss": 0.05033479630947113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5167397325276397e-05, + "grad_norm": 31.47974395751953, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8822516202926636, + "num_tokens": 806048894.0, + "step": 21123 + }, + { + "epoch": 2.687189924945936, + "ewc_loss": 0.05040600895881653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5203004042850807e-05, + "grad_norm": 31.496837615966797, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8702471256256104, + "num_tokens": 806089714.0, + "step": 21124 + }, + { + "epoch": 2.687317135224526, + "ewc_loss": 0.050416234880685806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5208117222064175e-05, + "grad_norm": 31.506343841552734, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.860310971736908, + "num_tokens": 806128974.0, + "step": 21125 + }, + { + "epoch": 2.6874443455031165, + "ewc_loss": 0.050454091280698776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5227045625797473e-05, + "grad_norm": 31.45079231262207, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8712129592895508, + "num_tokens": 806169857.0, + "step": 21126 + }, + { + "epoch": 2.687571555781707, + "ewc_loss": 0.0504862517118454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5243125492124818e-05, + "grad_norm": 31.52886199951172, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8732261657714844, + "num_tokens": 806202826.0, + "step": 21127 + }, + { + "epoch": 2.6876987660602976, + "ewc_loss": 0.05049172043800354, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5245859433198348e-05, + "grad_norm": 31.493223190307617, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.871042013168335, + "num_tokens": 806242850.0, + "step": 21128 + }, + { + "epoch": 2.687825976338888, + "ewc_loss": 0.05045641213655472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5228206141036935e-05, + "grad_norm": 31.515933990478516, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8737105131149292, + "num_tokens": 806279024.0, + "step": 21129 + }, + { + "epoch": 2.6879531866174786, + "ewc_loss": 0.05052754655480385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5263772840844467e-05, + "grad_norm": 31.672101974487305, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.8575985431671143, + "num_tokens": 806313669.0, + "step": 21130 + }, + { + "epoch": 2.688080396896069, + "ewc_loss": 0.05036134645342827, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518067412893288e-05, + "grad_norm": 31.518613815307617, + "learning_rate": 1e-06, + "loss": 0.403, + 
"mean_token_accuracy": 0.8746767044067383, + "num_tokens": 806353128.0, + "step": 21131 + }, + { + "epoch": 2.6882076071746597, + "ewc_loss": 0.05036327615380287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5181638193316758e-05, + "grad_norm": 31.568462371826172, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8735512495040894, + "num_tokens": 806385044.0, + "step": 21132 + }, + { + "epoch": 2.68833481745325, + "ewc_loss": 0.05032264441251755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5161321900668554e-05, + "grad_norm": 31.461759567260742, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8638060688972473, + "num_tokens": 806422157.0, + "step": 21133 + }, + { + "epoch": 2.6884620277318407, + "ewc_loss": 0.05030255764722824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5151279260171577e-05, + "grad_norm": 31.54947280883789, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8711543083190918, + "num_tokens": 806463180.0, + "step": 21134 + }, + { + "epoch": 2.6885892380104313, + "ewc_loss": 0.05036894977092743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518447581678629e-05, + "grad_norm": 31.334856033325195, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8675088882446289, + "num_tokens": 806502031.0, + "step": 21135 + }, + { + "epoch": 2.688716448289022, + "ewc_loss": 0.050364360213279724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5182180252159014e-05, + "grad_norm": 31.49152374267578, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8734016418457031, + "num_tokens": 806539585.0, + "step": 21136 + }, + { + "epoch": 2.6888436585676123, + "ewc_loss": 0.05054326355457306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5271630875067785e-05, + "grad_norm": 31.53359603881836, + "learning_rate": 1e-06, + "loss": 0.358, + "mean_token_accuracy": 0.8920268416404724, + "num_tokens": 806573147.0, + "step": 21137 + }, + { + "epoch": 
2.688970868846203, + "ewc_loss": 0.05046885088086128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523442526580766e-05, + "grad_norm": 31.497068405151367, + "learning_rate": 1e-06, + "loss": 0.3811, + "mean_token_accuracy": 0.884026050567627, + "num_tokens": 806609785.0, + "step": 21138 + }, + { + "epoch": 2.6890980791247934, + "ewc_loss": 0.05045324191451073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5226621801266447e-05, + "grad_norm": 31.48756980895996, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8789281249046326, + "num_tokens": 806647709.0, + "step": 21139 + }, + { + "epoch": 2.689225289403384, + "ewc_loss": 0.05042049288749695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5210247258655727e-05, + "grad_norm": 31.551233291625977, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8761151432991028, + "num_tokens": 806684448.0, + "step": 21140 + }, + { + "epoch": 2.6893524996819744, + "ewc_loss": 0.05040150135755539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5200750314979814e-05, + "grad_norm": 31.475343704223633, + "learning_rate": 1e-06, + "loss": 0.4017, + "mean_token_accuracy": 0.8758876323699951, + "num_tokens": 806725326.0, + "step": 21141 + }, + { + "epoch": 2.689479709960565, + "ewc_loss": 0.05040736496448517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.520368252589833e-05, + "grad_norm": 31.520366668701172, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8721212148666382, + "num_tokens": 806759797.0, + "step": 21142 + }, + { + "epoch": 2.6896069202391555, + "ewc_loss": 0.0504138246178627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5206913051079027e-05, + "grad_norm": 31.512615203857422, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8798681497573853, + "num_tokens": 806799986.0, + "step": 21143 + }, + { + "epoch": 2.6897341305177456, + "ewc_loss": 0.050419118255376816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5209559680661187e-05, + "grad_norm": 31.481847763061523, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8661767840385437, + "num_tokens": 806843055.0, + "step": 21144 + }, + { + "epoch": 2.6898613407963365, + "ewc_loss": 0.05037863925099373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5189319785567932e-05, + "grad_norm": 31.49903678894043, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8863356113433838, + "num_tokens": 806875897.0, + "step": 21145 + }, + { + "epoch": 2.6899885510749266, + "ewc_loss": 0.05044837296009064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.52241861744551e-05, + "grad_norm": 31.54378318786621, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8799244165420532, + "num_tokens": 806915583.0, + "step": 21146 + }, + { + "epoch": 2.6901157613535176, + "ewc_loss": 0.050354085862636566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.51770434260834e-05, + "grad_norm": 31.538576126098633, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8749121427536011, + "num_tokens": 806953851.0, + "step": 21147 + }, + { + "epoch": 2.6902429716321077, + "ewc_loss": 0.05046682432293892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5233412088709883e-05, + "grad_norm": 31.628578186035156, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8692421913146973, + "num_tokens": 806988519.0, + "step": 21148 + }, + { + "epoch": 2.6903701819106987, + "ewc_loss": 0.0503731444478035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5186571292579174e-05, + "grad_norm": 31.51491928100586, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8760184049606323, + "num_tokens": 807029612.0, + "step": 21149 + }, + { + "epoch": 2.6904973921892887, + "ewc_loss": 0.05029446259140968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5147232008748688e-05, + "grad_norm": 31.55284309387207, + "learning_rate": 1e-06, + "loss": 0.4643, + 
"mean_token_accuracy": 0.8572787046432495, + "num_tokens": 807064009.0, + "step": 21150 + }, + { + "epoch": 2.6906246024678793, + "ewc_loss": 0.05044391378760338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522195609344635e-05, + "grad_norm": 31.529577255249023, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.869066596031189, + "num_tokens": 807104694.0, + "step": 21151 + }, + { + "epoch": 2.69075181274647, + "ewc_loss": 0.05038825422525406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5194127374561504e-05, + "grad_norm": 31.500011444091797, + "learning_rate": 1e-06, + "loss": 0.4919, + "mean_token_accuracy": 0.8503798842430115, + "num_tokens": 807142500.0, + "step": 21152 + }, + { + "epoch": 2.6908790230250603, + "ewc_loss": 0.050394896417856216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5197448849212378e-05, + "grad_norm": 31.58184051513672, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8692030906677246, + "num_tokens": 807172123.0, + "step": 21153 + }, + { + "epoch": 2.691006233303651, + "ewc_loss": 0.05047578737139702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523789407860022e-05, + "grad_norm": 31.55948257446289, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8740731477737427, + "num_tokens": 807206422.0, + "step": 21154 + }, + { + "epoch": 2.6911334435822414, + "ewc_loss": 0.05038463696837425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519231929909438e-05, + "grad_norm": 31.571224212646484, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8578970432281494, + "num_tokens": 807240867.0, + "step": 21155 + }, + { + "epoch": 2.691260653860832, + "ewc_loss": 0.050412584096193314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5206292775692418e-05, + "grad_norm": 31.52226448059082, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.88185054063797, + "num_tokens": 807275838.0, + "step": 21156 + }, + { + "epoch": 
2.6913878641394224, + "ewc_loss": 0.05037778243422508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5188890504068695e-05, + "grad_norm": 31.584941864013672, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8807134032249451, + "num_tokens": 807315164.0, + "step": 21157 + }, + { + "epoch": 2.691515074418013, + "ewc_loss": 0.05041714012622833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5208570150425658e-05, + "grad_norm": 31.53073501586914, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8631618022918701, + "num_tokens": 807358016.0, + "step": 21158 + }, + { + "epoch": 2.6916422846966035, + "ewc_loss": 0.05035824328660965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517912071198225e-05, + "grad_norm": 31.532285690307617, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8709614276885986, + "num_tokens": 807402751.0, + "step": 21159 + }, + { + "epoch": 2.691769494975194, + "ewc_loss": 0.05039915442466736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5199577066814527e-05, + "grad_norm": 31.448183059692383, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8762896060943604, + "num_tokens": 807442761.0, + "step": 21160 + }, + { + "epoch": 2.6918967052537845, + "ewc_loss": 0.050394777208566666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519738882256206e-05, + "grad_norm": 31.483186721801758, + "learning_rate": 1e-06, + "loss": 0.3898, + "mean_token_accuracy": 0.8793765902519226, + "num_tokens": 807478550.0, + "step": 21161 + }, + { + "epoch": 2.692023915532375, + "ewc_loss": 0.050493016839027405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5246508812415414e-05, + "grad_norm": 31.616662979125977, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8809950947761536, + "num_tokens": 807522756.0, + "step": 21162 + }, + { + "epoch": 2.6921511258109656, + "ewc_loss": 0.0504336878657341, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5216844733222388e-05, + "grad_norm": 31.45256805419922, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8777620792388916, + "num_tokens": 807561701.0, + "step": 21163 + }, + { + "epoch": 2.692278336089556, + "ewc_loss": 0.05048718303442001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5243591153412126e-05, + "grad_norm": 31.699186325073242, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.870726466178894, + "num_tokens": 807604058.0, + "step": 21164 + }, + { + "epoch": 2.6924055463681467, + "ewc_loss": 0.05036646127700806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518322980904486e-05, + "grad_norm": 31.471595764160156, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8783011436462402, + "num_tokens": 807637344.0, + "step": 21165 + }, + { + "epoch": 2.692532756646737, + "ewc_loss": 0.05023977905511856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5119888960034586e-05, + "grad_norm": 31.484704971313477, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8649849891662598, + "num_tokens": 807676867.0, + "step": 21166 + }, + { + "epoch": 2.6926599669253277, + "ewc_loss": 0.05048481747508049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524240881029982e-05, + "grad_norm": 31.622386932373047, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8751262426376343, + "num_tokens": 807707183.0, + "step": 21167 + }, + { + "epoch": 2.6927871772039182, + "ewc_loss": 0.05035167559981346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517583743610885e-05, + "grad_norm": 31.464683532714844, + "learning_rate": 1e-06, + "loss": 0.4411, + "mean_token_accuracy": 0.8621631860733032, + "num_tokens": 807737617.0, + "step": 21168 + }, + { + "epoch": 2.6929143874825083, + "ewc_loss": 0.050362665206193924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5181332603096962e-05, + "grad_norm": 31.6103458404541, + "learning_rate": 1e-06, + "loss": 0.4399, + 
"mean_token_accuracy": 0.8655350208282471, + "num_tokens": 807774141.0, + "step": 21169 + }, + { + "epoch": 2.6930415977610993, + "ewc_loss": 0.05036609619855881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5183047910104506e-05, + "grad_norm": 31.469512939453125, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.876792311668396, + "num_tokens": 807813375.0, + "step": 21170 + }, + { + "epoch": 2.6931688080396894, + "ewc_loss": 0.05040958523750305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5204792109434493e-05, + "grad_norm": 31.525409698486328, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.8589833378791809, + "num_tokens": 807852317.0, + "step": 21171 + }, + { + "epoch": 2.6932960183182804, + "ewc_loss": 0.05048464983701706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5242325136787258e-05, + "grad_norm": 31.468809127807617, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8739182353019714, + "num_tokens": 807895266.0, + "step": 21172 + }, + { + "epoch": 2.6934232285968704, + "ewc_loss": 0.05041271820664406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5206358259310946e-05, + "grad_norm": 31.486177444458008, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.874068558216095, + "num_tokens": 807938600.0, + "step": 21173 + }, + { + "epoch": 2.693550438875461, + "ewc_loss": 0.05049717426300049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5248587917303666e-05, + "grad_norm": 31.50991439819336, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8919420838356018, + "num_tokens": 807974329.0, + "step": 21174 + }, + { + "epoch": 2.6936776491540515, + "ewc_loss": 0.050326183438301086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5163091777358204e-05, + "grad_norm": 31.462440490722656, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8836599588394165, + "num_tokens": 808015666.0, + "step": 21175 + }, + { + "epoch": 
2.693804859432642, + "ewc_loss": 0.050491683185100555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5245841243304312e-05, + "grad_norm": 31.577823638916016, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8829687833786011, + "num_tokens": 808050176.0, + "step": 21176 + }, + { + "epoch": 2.6939320697112326, + "ewc_loss": 0.05047755315899849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523877628846094e-05, + "grad_norm": 31.42392349243164, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8648957014083862, + "num_tokens": 808091910.0, + "step": 21177 + }, + { + "epoch": 2.694059279989823, + "ewc_loss": 0.050488781183958054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5244391508749686e-05, + "grad_norm": 31.562551498413086, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8680713176727295, + "num_tokens": 808128646.0, + "step": 21178 + }, + { + "epoch": 2.6941864902684136, + "ewc_loss": 0.050502289086580276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5251145416405052e-05, + "grad_norm": 31.398086547851562, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8768651485443115, + "num_tokens": 808166544.0, + "step": 21179 + }, + { + "epoch": 2.694313700547004, + "ewc_loss": 0.050491780042648315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5245890356018208e-05, + "grad_norm": 31.633955001831055, + "learning_rate": 1e-06, + "loss": 0.4523, + "mean_token_accuracy": 0.8623757362365723, + "num_tokens": 808200051.0, + "step": 21180 + }, + { + "epoch": 2.6944409108255947, + "ewc_loss": 0.05048085376620293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5240426111849956e-05, + "grad_norm": 31.43059539794922, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.874126672744751, + "num_tokens": 808239251.0, + "step": 21181 + }, + { + "epoch": 2.694568121104185, + "ewc_loss": 0.05042276903986931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5211384127032943e-05, + "grad_norm": 31.560131072998047, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8672748804092407, + "num_tokens": 808280639.0, + "step": 21182 + }, + { + "epoch": 2.6946953313827757, + "ewc_loss": 0.050583366304636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5291683414252475e-05, + "grad_norm": 31.546716690063477, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8792372941970825, + "num_tokens": 808316141.0, + "step": 21183 + }, + { + "epoch": 2.6948225416613663, + "ewc_loss": 0.05045080929994583, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5225404897355475e-05, + "grad_norm": 31.61347770690918, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8713603019714355, + "num_tokens": 808359172.0, + "step": 21184 + }, + { + "epoch": 2.694949751939957, + "ewc_loss": 0.05039997026324272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519998452044092e-05, + "grad_norm": 31.488189697265625, + "learning_rate": 1e-06, + "loss": 0.512, + "mean_token_accuracy": 0.8452756404876709, + "num_tokens": 808397823.0, + "step": 21185 + }, + { + "epoch": 2.6950769622185473, + "ewc_loss": 0.05039357766509056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519678855605889e-05, + "grad_norm": 31.48155975341797, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.882233738899231, + "num_tokens": 808435355.0, + "step": 21186 + }, + { + "epoch": 2.695204172497138, + "ewc_loss": 0.05047796666622162, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5238983653252944e-05, + "grad_norm": 31.508241653442383, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8786723017692566, + "num_tokens": 808470792.0, + "step": 21187 + }, + { + "epoch": 2.6953313827757284, + "ewc_loss": 0.050451308488845825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522565409890376e-05, + "grad_norm": 31.48207664489746, + "learning_rate": 1e-06, + "loss": 0.3636, + 
"mean_token_accuracy": 0.8896284103393555, + "num_tokens": 808504591.0, + "step": 21188 + }, + { + "epoch": 2.695458593054319, + "ewc_loss": 0.0503428652882576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5171431843773462e-05, + "grad_norm": 31.337217330932617, + "learning_rate": 1e-06, + "loss": 0.3725, + "mean_token_accuracy": 0.8837388157844543, + "num_tokens": 808537167.0, + "step": 21189 + }, + { + "epoch": 2.6955858033329094, + "ewc_loss": 0.05051856487989426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5259281756007113e-05, + "grad_norm": 31.479507446289062, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8876971006393433, + "num_tokens": 808572055.0, + "step": 21190 + }, + { + "epoch": 2.6957130136115, + "ewc_loss": 0.05050847306847572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5254235879401676e-05, + "grad_norm": 31.511377334594727, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.8841507434844971, + "num_tokens": 808605239.0, + "step": 21191 + }, + { + "epoch": 2.6958402238900905, + "ewc_loss": 0.0505179800093174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5258990717702545e-05, + "grad_norm": 31.373924255371094, + "learning_rate": 1e-06, + "loss": 0.4845, + "mean_token_accuracy": 0.8509074449539185, + "num_tokens": 808643227.0, + "step": 21192 + }, + { + "epoch": 2.695967434168681, + "ewc_loss": 0.05053576081991196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5267880118917674e-05, + "grad_norm": 31.547082901000977, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8662427663803101, + "num_tokens": 808683855.0, + "step": 21193 + }, + { + "epoch": 2.696094644447271, + "ewc_loss": 0.05058113858103752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.52905683737481e-05, + "grad_norm": 31.54421043395996, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8784962892532349, + "num_tokens": 808723467.0, + "step": 21194 + }, + { + "epoch": 
2.696221854725862, + "ewc_loss": 0.05049840360879898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5249202735722065e-05, + "grad_norm": 31.60055923461914, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8613331317901611, + "num_tokens": 808759805.0, + "step": 21195 + }, + { + "epoch": 2.696349065004452, + "ewc_loss": 0.0505116805434227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5255840228055604e-05, + "grad_norm": 31.595651626586914, + "learning_rate": 1e-06, + "loss": 0.4728, + "mean_token_accuracy": 0.8553636074066162, + "num_tokens": 808802024.0, + "step": 21196 + }, + { + "epoch": 2.696476275283043, + "ewc_loss": 0.050387442111968994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5193721739924513e-05, + "grad_norm": 31.49711799621582, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8638818264007568, + "num_tokens": 808837086.0, + "step": 21197 + }, + { + "epoch": 2.696603485561633, + "ewc_loss": 0.05048972740769386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5244864445994608e-05, + "grad_norm": 31.669925689697266, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8715757131576538, + "num_tokens": 808875261.0, + "step": 21198 + }, + { + "epoch": 2.6967306958402237, + "ewc_loss": 0.05046680197119713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5233401174773462e-05, + "grad_norm": 31.555269241333008, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8784692287445068, + "num_tokens": 808913044.0, + "step": 21199 + }, + { + "epoch": 2.6968579061188143, + "ewc_loss": 0.05044078826904297, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5220393581548706e-05, + "grad_norm": 31.571395874023438, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.88057941198349, + "num_tokens": 808952374.0, + "step": 21200 + }, + { + "epoch": 2.696985116397405, + "ewc_loss": 0.05055563524365425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5277817258029245e-05, + "grad_norm": 31.625545501708984, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8740653991699219, + "num_tokens": 808994397.0, + "step": 21201 + }, + { + "epoch": 2.6971123266759953, + "ewc_loss": 0.050425637513399124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.521281930967234e-05, + "grad_norm": 31.688182830810547, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8762215375900269, + "num_tokens": 809036401.0, + "step": 21202 + }, + { + "epoch": 2.697239536954586, + "ewc_loss": 0.05037730932235718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5188654035446234e-05, + "grad_norm": 31.5380916595459, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.877827525138855, + "num_tokens": 809073073.0, + "step": 21203 + }, + { + "epoch": 2.6973667472331764, + "ewc_loss": 0.05037442967295647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518721521482803e-05, + "grad_norm": 31.505578994750977, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8726468086242676, + "num_tokens": 809117348.0, + "step": 21204 + }, + { + "epoch": 2.697493957511767, + "ewc_loss": 0.05038686841726303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5193434339598753e-05, + "grad_norm": 31.499300003051758, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8847330808639526, + "num_tokens": 809151357.0, + "step": 21205 + }, + { + "epoch": 2.6976211677903574, + "ewc_loss": 0.05039961636066437, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5199808078468777e-05, + "grad_norm": 31.577180862426758, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8666361570358276, + "num_tokens": 809189000.0, + "step": 21206 + }, + { + "epoch": 2.697748378068948, + "ewc_loss": 0.05044398456811905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5221992473234423e-05, + "grad_norm": 31.617202758789062, + "learning_rate": 1e-06, + "loss": 0.4043, + 
"mean_token_accuracy": 0.8761759996414185, + "num_tokens": 809227380.0, + "step": 21207 + }, + { + "epoch": 2.6978755883475385, + "ewc_loss": 0.05040806531906128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5204033590853214e-05, + "grad_norm": 31.45041275024414, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8720924854278564, + "num_tokens": 809265131.0, + "step": 21208 + }, + { + "epoch": 2.698002798626129, + "ewc_loss": 0.05039147287607193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519573718018364e-05, + "grad_norm": 31.56618309020996, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8713164329528809, + "num_tokens": 809297190.0, + "step": 21209 + }, + { + "epoch": 2.6981300089047195, + "ewc_loss": 0.050494469702243805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524723458918743e-05, + "grad_norm": 31.490676879882812, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8813925981521606, + "num_tokens": 809336774.0, + "step": 21210 + }, + { + "epoch": 2.69825721918331, + "ewc_loss": 0.050450317561626434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5225159333785996e-05, + "grad_norm": 31.59908103942871, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8558574318885803, + "num_tokens": 809376612.0, + "step": 21211 + }, + { + "epoch": 2.6983844294619006, + "ewc_loss": 0.05040517821907997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.52025893132668e-05, + "grad_norm": 31.470003128051758, + "learning_rate": 1e-06, + "loss": 0.3412, + "mean_token_accuracy": 0.8942022323608398, + "num_tokens": 809416058.0, + "step": 21212 + }, + { + "epoch": 2.698511639740491, + "ewc_loss": 0.050322290509939194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.516114545869641e-05, + "grad_norm": 31.48052215576172, + "learning_rate": 1e-06, + "loss": 0.3789, + "mean_token_accuracy": 0.8826009035110474, + "num_tokens": 809449580.0, + "step": 21213 + }, + { + "epoch": 
2.6986388500190817, + "ewc_loss": 0.050474926829338074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523746297811158e-05, + "grad_norm": 31.578487396240234, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8753403425216675, + "num_tokens": 809484983.0, + "step": 21214 + }, + { + "epoch": 2.698766060297672, + "ewc_loss": 0.05041596665978432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5207982616848312e-05, + "grad_norm": 31.462181091308594, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8760939836502075, + "num_tokens": 809521086.0, + "step": 21215 + }, + { + "epoch": 2.6988932705762627, + "ewc_loss": 0.050369229167699814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518461405998096e-05, + "grad_norm": 31.581117630004883, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.870853066444397, + "num_tokens": 809562475.0, + "step": 21216 + }, + { + "epoch": 2.699020480854853, + "ewc_loss": 0.05048637092113495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5243185518775135e-05, + "grad_norm": 31.494216918945312, + "learning_rate": 1e-06, + "loss": 0.4591, + "mean_token_accuracy": 0.859474778175354, + "num_tokens": 809596686.0, + "step": 21217 + }, + { + "epoch": 2.6991476911334438, + "ewc_loss": 0.05032213032245636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5161065423162654e-05, + "grad_norm": 31.50023078918457, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8743424415588379, + "num_tokens": 809636614.0, + "step": 21218 + }, + { + "epoch": 2.699274901412034, + "ewc_loss": 0.050453003495931625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5226501747965813e-05, + "grad_norm": 31.543703079223633, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8609664440155029, + "num_tokens": 809674542.0, + "step": 21219 + }, + { + "epoch": 2.699402111690625, + "ewc_loss": 0.05046229064464569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5231145627913065e-05, + "grad_norm": 31.449182510375977, + "learning_rate": 1e-06, + "loss": 0.3727, + "mean_token_accuracy": 0.8881433010101318, + "num_tokens": 809716726.0, + "step": 21220 + }, + { + "epoch": 2.699529321969215, + "ewc_loss": 0.05032249167561531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5161245503113605e-05, + "grad_norm": 31.61668586730957, + "learning_rate": 1e-06, + "loss": 0.3877, + "mean_token_accuracy": 0.8819035887718201, + "num_tokens": 809756790.0, + "step": 21221 + }, + { + "epoch": 2.699656532247806, + "ewc_loss": 0.05045871064066887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5229355742339976e-05, + "grad_norm": 31.444664001464844, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.869836688041687, + "num_tokens": 809794616.0, + "step": 21222 + }, + { + "epoch": 2.699783742526396, + "ewc_loss": 0.05033685266971588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5168426873278804e-05, + "grad_norm": 31.558698654174805, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8783517479896545, + "num_tokens": 809836662.0, + "step": 21223 + }, + { + "epoch": 2.6999109528049865, + "ewc_loss": 0.05046410113573074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523205148463603e-05, + "grad_norm": 31.518770217895508, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8700869083404541, + "num_tokens": 809873812.0, + "step": 21224 + }, + { + "epoch": 2.700038163083577, + "ewc_loss": 0.05038627237081528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.519313602533657e-05, + "grad_norm": 31.496936798095703, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8721510171890259, + "num_tokens": 809913139.0, + "step": 21225 + }, + { + "epoch": 2.7001653733621676, + "ewc_loss": 0.05039666220545769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5198331059073098e-05, + "grad_norm": 31.63915252685547, + "learning_rate": 1e-06, + "loss": 0.4122, + 
"mean_token_accuracy": 0.874430775642395, + "num_tokens": 809947741.0, + "step": 21226 + }, + { + "epoch": 2.700292583640758, + "ewc_loss": 0.05055161565542221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5275807274738327e-05, + "grad_norm": 31.673351287841797, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8656210899353027, + "num_tokens": 809987120.0, + "step": 21227 + }, + { + "epoch": 2.7004197939193486, + "ewc_loss": 0.05037776753306389, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.518888322811108e-05, + "grad_norm": 31.46680450439453, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8673583269119263, + "num_tokens": 810026972.0, + "step": 21228 + }, + { + "epoch": 2.700547004197939, + "ewc_loss": 0.050296712666749954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.514835614420008e-05, + "grad_norm": 31.597881317138672, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8721654415130615, + "num_tokens": 810062767.0, + "step": 21229 + }, + { + "epoch": 2.7006742144765297, + "ewc_loss": 0.050494734197854996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5247367375413887e-05, + "grad_norm": 31.41944122314453, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8803211450576782, + "num_tokens": 810096930.0, + "step": 21230 + }, + { + "epoch": 2.70080142475512, + "ewc_loss": 0.0503743439912796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5187171559082344e-05, + "grad_norm": 31.588680267333984, + "learning_rate": 1e-06, + "loss": 0.4599, + "mean_token_accuracy": 0.859851598739624, + "num_tokens": 810134600.0, + "step": 21231 + }, + { + "epoch": 2.7009286350337107, + "ewc_loss": 0.050426170229911804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.521308488212526e-05, + "grad_norm": 31.41495132446289, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8746516704559326, + "num_tokens": 810169535.0, + "step": 21232 + }, + { + "epoch": 
2.7010558453123013, + "ewc_loss": 0.050450775772333145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5225388526450843e-05, + "grad_norm": 31.593233108520508, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.862377405166626, + "num_tokens": 810205288.0, + "step": 21233 + }, + { + "epoch": 2.701183055590892, + "ewc_loss": 0.050417836755514145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5208917577401735e-05, + "grad_norm": 31.438129425048828, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8698980808258057, + "num_tokens": 810242868.0, + "step": 21234 + }, + { + "epoch": 2.7013102658694823, + "ewc_loss": 0.05049031972885132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5245159122277983e-05, + "grad_norm": 31.608015060424805, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8607116341590881, + "num_tokens": 810286539.0, + "step": 21235 + }, + { + "epoch": 2.701437476148073, + "ewc_loss": 0.050509143620729446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5254572392441332e-05, + "grad_norm": 31.564132690429688, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8765504360198975, + "num_tokens": 810331598.0, + "step": 21236 + }, + { + "epoch": 2.7015646864266634, + "ewc_loss": 0.05046622082591057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5233110136468895e-05, + "grad_norm": 31.480012893676758, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8807565569877625, + "num_tokens": 810369501.0, + "step": 21237 + }, + { + "epoch": 2.701691896705254, + "ewc_loss": 0.05046713724732399, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523356852179859e-05, + "grad_norm": 31.543052673339844, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8696869611740112, + "num_tokens": 810410727.0, + "step": 21238 + }, + { + "epoch": 2.7018191069838444, + "ewc_loss": 0.0504574179649353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5228708182112314e-05, + "grad_norm": 31.49241828918457, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8634931445121765, + "num_tokens": 810450315.0, + "step": 21239 + }, + { + "epoch": 2.701946317262435, + "ewc_loss": 0.05048849433660507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5244247808586806e-05, + "grad_norm": 31.6063175201416, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8635932803153992, + "num_tokens": 810493972.0, + "step": 21240 + }, + { + "epoch": 2.7020735275410255, + "ewc_loss": 0.050422944128513336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5211471438524313e-05, + "grad_norm": 31.490840911865234, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8827548027038574, + "num_tokens": 810526038.0, + "step": 21241 + }, + { + "epoch": 2.7022007378196156, + "ewc_loss": 0.05043156072497368, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5215780624421313e-05, + "grad_norm": 31.748226165771484, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8767995238304138, + "num_tokens": 810560772.0, + "step": 21242 + }, + { + "epoch": 2.7023279480982065, + "ewc_loss": 0.05045522376894951, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5227611331501976e-05, + "grad_norm": 31.4984130859375, + "learning_rate": 1e-06, + "loss": 0.3835, + "mean_token_accuracy": 0.8806764483451843, + "num_tokens": 810598322.0, + "step": 21243 + }, + { + "epoch": 2.7024551583767966, + "ewc_loss": 0.050402190536260605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5201095922966488e-05, + "grad_norm": 31.673322677612305, + "learning_rate": 1e-06, + "loss": 0.3617, + "mean_token_accuracy": 0.8923165798187256, + "num_tokens": 810638652.0, + "step": 21244 + }, + { + "epoch": 2.7025823686553876, + "ewc_loss": 0.05048777535557747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5243887648684904e-05, + "grad_norm": 31.525007247924805, + "learning_rate": 1e-06, + "loss": 0.4163, + 
"mean_token_accuracy": 0.8713783621788025, + "num_tokens": 810680318.0, + "step": 21245 + }, + { + "epoch": 2.7027095789339777, + "ewc_loss": 0.0503230057656765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5161501980619505e-05, + "grad_norm": 31.631967544555664, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8771905899047852, + "num_tokens": 810715001.0, + "step": 21246 + }, + { + "epoch": 2.7028367892125686, + "ewc_loss": 0.05045737698674202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5228688173228875e-05, + "grad_norm": 31.521703720092773, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8641173243522644, + "num_tokens": 810748897.0, + "step": 21247 + }, + { + "epoch": 2.7029639994911587, + "ewc_loss": 0.05034753680229187, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5173769245157018e-05, + "grad_norm": 31.60275650024414, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8666982650756836, + "num_tokens": 810788954.0, + "step": 21248 + }, + { + "epoch": 2.7030912097697493, + "ewc_loss": 0.05047912895679474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5239563910872675e-05, + "grad_norm": 31.65294647216797, + "learning_rate": 1e-06, + "loss": 0.4011, + "mean_token_accuracy": 0.8752106428146362, + "num_tokens": 810828897.0, + "step": 21249 + }, + { + "epoch": 2.70321842004834, + "ewc_loss": 0.050429798662662506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5214900233549997e-05, + "grad_norm": 31.60856056213379, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8890206813812256, + "num_tokens": 810866698.0, + "step": 21250 + }, + { + "epoch": 2.7033456303269303, + "ewc_loss": 0.05037543177604675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5187715436914004e-05, + "grad_norm": 31.723838806152344, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8657401204109192, + "num_tokens": 810903140.0, + "step": 21251 + }, + { + "epoch": 
2.703472840605521, + "ewc_loss": 0.050331491976976395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5165745682897978e-05, + "grad_norm": 31.47835350036621, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.87714684009552, + "num_tokens": 810940664.0, + "step": 21252 + }, + { + "epoch": 2.7036000508841114, + "ewc_loss": 0.05025855824351311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.512927858333569e-05, + "grad_norm": 31.56238555908203, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8704332113265991, + "num_tokens": 810981818.0, + "step": 21253 + }, + { + "epoch": 2.703727261162702, + "ewc_loss": 0.05051049590110779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5255247237510048e-05, + "grad_norm": 31.58298683166504, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8718762397766113, + "num_tokens": 811019897.0, + "step": 21254 + }, + { + "epoch": 2.7038544714412924, + "ewc_loss": 0.05031057819724083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5155288312816992e-05, + "grad_norm": 31.49066162109375, + "learning_rate": 1e-06, + "loss": 0.4153, + "mean_token_accuracy": 0.875542402267456, + "num_tokens": 811056310.0, + "step": 21255 + }, + { + "epoch": 2.703981681719883, + "ewc_loss": 0.05044930800795555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522465365473181e-05, + "grad_norm": 31.567405700683594, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.862214982509613, + "num_tokens": 811091505.0, + "step": 21256 + }, + { + "epoch": 2.7041088919984735, + "ewc_loss": 0.05042409524321556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5212048058165237e-05, + "grad_norm": 31.513263702392578, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.88347327709198, + "num_tokens": 811134861.0, + "step": 21257 + }, + { + "epoch": 2.704236102277064, + "ewc_loss": 0.05041829124093056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5209144951077178e-05, + "grad_norm": 31.59736442565918, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8636265993118286, + "num_tokens": 811170001.0, + "step": 21258 + }, + { + "epoch": 2.7043633125556545, + "ewc_loss": 0.05044814571738243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522407339711208e-05, + "grad_norm": 31.509769439697266, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8775973320007324, + "num_tokens": 811204203.0, + "step": 21259 + }, + { + "epoch": 2.704490522834245, + "ewc_loss": 0.050338342785835266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5169170839944854e-05, + "grad_norm": 31.367502212524414, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8761488199234009, + "num_tokens": 811244598.0, + "step": 21260 + }, + { + "epoch": 2.7046177331128356, + "ewc_loss": 0.05050257220864296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5251285478589125e-05, + "grad_norm": 31.545042037963867, + "learning_rate": 1e-06, + "loss": 0.3862, + "mean_token_accuracy": 0.8835078477859497, + "num_tokens": 811283352.0, + "step": 21261 + }, + { + "epoch": 2.704744943391426, + "ewc_loss": 0.050529707223176956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5264853320550174e-05, + "grad_norm": 31.549015045166016, + "learning_rate": 1e-06, + "loss": 0.4511, + "mean_token_accuracy": 0.8613306283950806, + "num_tokens": 811317605.0, + "step": 21262 + }, + { + "epoch": 2.7048721536700167, + "ewc_loss": 0.05048847198486328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524423507566098e-05, + "grad_norm": 31.465085983276367, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8885977268218994, + "num_tokens": 811352117.0, + "step": 21263 + }, + { + "epoch": 2.704999363948607, + "ewc_loss": 0.05048203468322754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524101728340611e-05, + "grad_norm": 31.645099639892578, + "learning_rate": 1e-06, + "loss": 0.4347, + 
"mean_token_accuracy": 0.8648035526275635, + "num_tokens": 811386118.0, + "step": 21264 + }, + { + "epoch": 2.7051265742271977, + "ewc_loss": 0.0505865141749382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5293256840086542e-05, + "grad_norm": 31.58736228942871, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8827882409095764, + "num_tokens": 811424479.0, + "step": 21265 + }, + { + "epoch": 2.7052537845057882, + "ewc_loss": 0.050452835857868195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522641807445325e-05, + "grad_norm": 31.522619247436523, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8591728210449219, + "num_tokens": 811464683.0, + "step": 21266 + }, + { + "epoch": 2.7053809947843783, + "ewc_loss": 0.050525858998298645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5262928829761222e-05, + "grad_norm": 31.584070205688477, + "learning_rate": 1e-06, + "loss": 0.3734, + "mean_token_accuracy": 0.888083279132843, + "num_tokens": 811504801.0, + "step": 21267 + }, + { + "epoch": 2.7055082050629693, + "ewc_loss": 0.0505511574447155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.527557808207348e-05, + "grad_norm": 31.583293914794922, + "learning_rate": 1e-06, + "loss": 0.4427, + "mean_token_accuracy": 0.8648018836975098, + "num_tokens": 811546278.0, + "step": 21268 + }, + { + "epoch": 2.7056354153415594, + "ewc_loss": 0.05040032044053078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5200160962413065e-05, + "grad_norm": 31.514623641967773, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.85905921459198, + "num_tokens": 811587019.0, + "step": 21269 + }, + { + "epoch": 2.7057626256201504, + "ewc_loss": 0.05047539621591568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5237697627744637e-05, + "grad_norm": 31.578908920288086, + "learning_rate": 1e-06, + "loss": 0.486, + "mean_token_accuracy": 0.8484429121017456, + "num_tokens": 811619843.0, + "step": 21270 + }, + { + "epoch": 
2.7058898358987404, + "ewc_loss": 0.050456345081329346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522817339922767e-05, + "grad_norm": 31.48877716064453, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8782671093940735, + "num_tokens": 811655797.0, + "step": 21271 + }, + { + "epoch": 2.706017046177331, + "ewc_loss": 0.05052092298865318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5260462280130014e-05, + "grad_norm": 31.63787269592285, + "learning_rate": 1e-06, + "loss": 0.4398, + "mean_token_accuracy": 0.8634564876556396, + "num_tokens": 811691626.0, + "step": 21272 + }, + { + "epoch": 2.7061442564559215, + "ewc_loss": 0.050516434013843536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5258217647206038e-05, + "grad_norm": 31.49336051940918, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8741899728775024, + "num_tokens": 811726046.0, + "step": 21273 + }, + { + "epoch": 2.706271466734512, + "ewc_loss": 0.050490278750658035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5245139113394544e-05, + "grad_norm": 31.476226806640625, + "learning_rate": 1e-06, + "loss": 0.4519, + "mean_token_accuracy": 0.8626970052719116, + "num_tokens": 811765686.0, + "step": 21274 + }, + { + "epoch": 2.7063986770131025, + "ewc_loss": 0.05055554211139679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5277771783294156e-05, + "grad_norm": 31.579265594482422, + "learning_rate": 1e-06, + "loss": 0.392, + "mean_token_accuracy": 0.8793022632598877, + "num_tokens": 811801143.0, + "step": 21275 + }, + { + "epoch": 2.706525887291693, + "ewc_loss": 0.05052361264824867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5261806513299234e-05, + "grad_norm": 31.52794647216797, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.880081832408905, + "num_tokens": 811834807.0, + "step": 21276 + }, + { + "epoch": 2.7066530975702836, + "ewc_loss": 0.05060533434152603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5302666472271085e-05, + "grad_norm": 31.571794509887695, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8909812569618225, + "num_tokens": 811865676.0, + "step": 21277 + }, + { + "epoch": 2.706780307848874, + "ewc_loss": 0.05059686675667763, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5298433683929034e-05, + "grad_norm": 31.5827693939209, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8709031343460083, + "num_tokens": 811901259.0, + "step": 21278 + }, + { + "epoch": 2.7069075181274647, + "ewc_loss": 0.05067453160881996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533726546971593e-05, + "grad_norm": 31.634815216064453, + "learning_rate": 1e-06, + "loss": 0.4698, + "mean_token_accuracy": 0.8587491512298584, + "num_tokens": 811937768.0, + "step": 21279 + }, + { + "epoch": 2.707034728406055, + "ewc_loss": 0.0505758561193943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5287927201134153e-05, + "grad_norm": 31.504892349243164, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8728886246681213, + "num_tokens": 811967805.0, + "step": 21280 + }, + { + "epoch": 2.7071619386846457, + "ewc_loss": 0.05057196319103241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.528598088247236e-05, + "grad_norm": 31.556015014648438, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8688105344772339, + "num_tokens": 812007468.0, + "step": 21281 + }, + { + "epoch": 2.7072891489632362, + "ewc_loss": 0.050686679780483246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534333907533437e-05, + "grad_norm": 31.647045135498047, + "learning_rate": 1e-06, + "loss": 0.4085, + "mean_token_accuracy": 0.8745881915092468, + "num_tokens": 812048127.0, + "step": 21282 + }, + { + "epoch": 2.7074163592418268, + "ewc_loss": 0.05052139237523079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.526069692976307e-05, + "grad_norm": 31.441720962524414, + "learning_rate": 1e-06, + "loss": 0.4062, + 
"mean_token_accuracy": 0.8727797269821167, + "num_tokens": 812083904.0, + "step": 21283 + }, + { + "epoch": 2.7075435695204173, + "ewc_loss": 0.05056692287325859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5283461582148448e-05, + "grad_norm": 31.609012603759766, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8775919675827026, + "num_tokens": 812119643.0, + "step": 21284 + }, + { + "epoch": 2.707670779799008, + "ewc_loss": 0.050637297332286835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531864811317064e-05, + "grad_norm": 31.647647857666016, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.870681881904602, + "num_tokens": 812155338.0, + "step": 21285 + }, + { + "epoch": 2.7077979900775984, + "ewc_loss": 0.05047275125980377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523637522244826e-05, + "grad_norm": 31.561235427856445, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.87269526720047, + "num_tokens": 812193075.0, + "step": 21286 + }, + { + "epoch": 2.707925200356189, + "ewc_loss": 0.050486352294683456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5243176423828118e-05, + "grad_norm": 31.5302677154541, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8810766339302063, + "num_tokens": 812228163.0, + "step": 21287 + }, + { + "epoch": 2.7080524106347794, + "ewc_loss": 0.05050201714038849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5251008992199786e-05, + "grad_norm": 31.60979461669922, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8673001527786255, + "num_tokens": 812268565.0, + "step": 21288 + }, + { + "epoch": 2.70817962091337, + "ewc_loss": 0.050499770790338516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5249884856748395e-05, + "grad_norm": 31.51641845703125, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8780362606048584, + "num_tokens": 812307292.0, + "step": 21289 + }, + { + "epoch": 
2.7083068311919605, + "ewc_loss": 0.050465356558561325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5232679035980254e-05, + "grad_norm": 31.456302642822266, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8819693326950073, + "num_tokens": 812346180.0, + "step": 21290 + }, + { + "epoch": 2.708434041470551, + "ewc_loss": 0.05057544633746147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5287723474320956e-05, + "grad_norm": 31.618391036987305, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8700777292251587, + "num_tokens": 812376033.0, + "step": 21291 + }, + { + "epoch": 2.708561251749141, + "ewc_loss": 0.050544917583465576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.52724585152464e-05, + "grad_norm": 31.53714370727539, + "learning_rate": 1e-06, + "loss": 0.3652, + "mean_token_accuracy": 0.8903655409812927, + "num_tokens": 812407925.0, + "step": 21292 + }, + { + "epoch": 2.708688462027732, + "ewc_loss": 0.05054456740617752, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.527228389226366e-05, + "grad_norm": 31.602563858032227, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.8781577944755554, + "num_tokens": 812452354.0, + "step": 21293 + }, + { + "epoch": 2.708815672306322, + "ewc_loss": 0.05054397135972977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5271985578001477e-05, + "grad_norm": 31.490571975708008, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8721156716346741, + "num_tokens": 812490015.0, + "step": 21294 + }, + { + "epoch": 2.708942882584913, + "ewc_loss": 0.050569966435432434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5284984076279216e-05, + "grad_norm": 31.703001022338867, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8693873286247253, + "num_tokens": 812529919.0, + "step": 21295 + }, + { + "epoch": 2.709070092863503, + "ewc_loss": 0.0505349338054657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.526746720832307e-05, + "grad_norm": 31.4876708984375, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8851470947265625, + "num_tokens": 812568541.0, + "step": 21296 + }, + { + "epoch": 2.7091973031420937, + "ewc_loss": 0.05042010173201561, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5210050807800144e-05, + "grad_norm": 31.505531311035156, + "learning_rate": 1e-06, + "loss": 0.4366, + "mean_token_accuracy": 0.8643506765365601, + "num_tokens": 812612098.0, + "step": 21297 + }, + { + "epoch": 2.7093245134206843, + "ewc_loss": 0.05058775097131729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5293875296483748e-05, + "grad_norm": 31.60969352722168, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8690671920776367, + "num_tokens": 812648843.0, + "step": 21298 + }, + { + "epoch": 2.709451723699275, + "ewc_loss": 0.05049419403076172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524709634599276e-05, + "grad_norm": 31.52361297607422, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8725160956382751, + "num_tokens": 812692080.0, + "step": 21299 + }, + { + "epoch": 2.7095789339778653, + "ewc_loss": 0.05053345486521721, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5266726879635826e-05, + "grad_norm": 31.519084930419922, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8862518072128296, + "num_tokens": 812729441.0, + "step": 21300 + }, + { + "epoch": 2.709706144256456, + "ewc_loss": 0.05048064887523651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5240324248443358e-05, + "grad_norm": 31.531978607177734, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8672655820846558, + "num_tokens": 812768908.0, + "step": 21301 + }, + { + "epoch": 2.7098333545350464, + "ewc_loss": 0.05051617696881294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5258088498958386e-05, + "grad_norm": 31.572917938232422, + "learning_rate": 1e-06, + "loss": 0.4122, + 
"mean_token_accuracy": 0.8721672296524048, + "num_tokens": 812808384.0, + "step": 21302 + }, + { + "epoch": 2.709960564813637, + "ewc_loss": 0.05044809356331825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5224046112271026e-05, + "grad_norm": 31.668582916259766, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8646954298019409, + "num_tokens": 812847658.0, + "step": 21303 + }, + { + "epoch": 2.7100877750922274, + "ewc_loss": 0.05045897886157036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522949034755584e-05, + "grad_norm": 31.504188537597656, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8736369013786316, + "num_tokens": 812889394.0, + "step": 21304 + }, + { + "epoch": 2.710214985370818, + "ewc_loss": 0.05033976584672928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.516988206480164e-05, + "grad_norm": 31.534135818481445, + "learning_rate": 1e-06, + "loss": 0.4518, + "mean_token_accuracy": 0.8604010343551636, + "num_tokens": 812924202.0, + "step": 21305 + }, + { + "epoch": 2.7103421956494085, + "ewc_loss": 0.05045868828892708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5229344828403555e-05, + "grad_norm": 31.530302047729492, + "learning_rate": 1e-06, + "loss": 0.3541, + "mean_token_accuracy": 0.8919588327407837, + "num_tokens": 812954391.0, + "step": 21306 + }, + { + "epoch": 2.710469405927999, + "ewc_loss": 0.05050894618034363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5254472348024137e-05, + "grad_norm": 31.57853889465332, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8636000156402588, + "num_tokens": 812989498.0, + "step": 21307 + }, + { + "epoch": 2.7105966162065895, + "ewc_loss": 0.05052468925714493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5262344934162684e-05, + "grad_norm": 31.539310455322266, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8793128132820129, + "num_tokens": 813030968.0, + "step": 21308 + }, + { + "epoch": 
2.71072382648518, + "ewc_loss": 0.050382182002067566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5191091481246985e-05, + "grad_norm": 31.52765655517578, + "learning_rate": 1e-06, + "loss": 0.3474, + "mean_token_accuracy": 0.894368052482605, + "num_tokens": 813065843.0, + "step": 21309 + }, + { + "epoch": 2.7108510367637706, + "ewc_loss": 0.0504932776093483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5246637960663065e-05, + "grad_norm": 31.394664764404297, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.878873348236084, + "num_tokens": 813107410.0, + "step": 21310 + }, + { + "epoch": 2.710978247042361, + "ewc_loss": 0.050581250339746475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529062476241961e-05, + "grad_norm": 31.617374420166016, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8796190023422241, + "num_tokens": 813155959.0, + "step": 21311 + }, + { + "epoch": 2.7111054573209517, + "ewc_loss": 0.050554025918245316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5277013264712878e-05, + "grad_norm": 31.53472900390625, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.879133939743042, + "num_tokens": 813194133.0, + "step": 21312 + }, + { + "epoch": 2.711232667599542, + "ewc_loss": 0.05054982379078865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5274912331951782e-05, + "grad_norm": 31.612171173095703, + "learning_rate": 1e-06, + "loss": 0.4577, + "mean_token_accuracy": 0.8607761859893799, + "num_tokens": 813231015.0, + "step": 21313 + }, + { + "epoch": 2.7113598778781327, + "ewc_loss": 0.05063673108816147, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531836616981309e-05, + "grad_norm": 31.481164932250977, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8882685899734497, + "num_tokens": 813271497.0, + "step": 21314 + }, + { + "epoch": 2.711487088156723, + "ewc_loss": 0.05047985538840294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5239927708753385e-05, + "grad_norm": 31.475082397460938, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8747053146362305, + "num_tokens": 813305758.0, + "step": 21315 + }, + { + "epoch": 2.7116142984353138, + "ewc_loss": 0.05057963356375694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5289817131124437e-05, + "grad_norm": 31.525901794433594, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8716028928756714, + "num_tokens": 813345958.0, + "step": 21316 + }, + { + "epoch": 2.711741508713904, + "ewc_loss": 0.050458911806344986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522945578675717e-05, + "grad_norm": 31.507648468017578, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8767819404602051, + "num_tokens": 813381630.0, + "step": 21317 + }, + { + "epoch": 2.711868718992495, + "ewc_loss": 0.05058642476797104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5293213184340857e-05, + "grad_norm": 31.633840560913086, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8727580308914185, + "num_tokens": 813421320.0, + "step": 21318 + }, + { + "epoch": 2.711995929271085, + "ewc_loss": 0.05049494281411171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524747105780989e-05, + "grad_norm": 31.56972885131836, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.867377519607544, + "num_tokens": 813458323.0, + "step": 21319 + }, + { + "epoch": 2.712123139549676, + "ewc_loss": 0.05047459155321121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5237295631086454e-05, + "grad_norm": 31.709081649780273, + "learning_rate": 1e-06, + "loss": 0.4773, + "mean_token_accuracy": 0.8544423580169678, + "num_tokens": 813501436.0, + "step": 21320 + }, + { + "epoch": 2.712250349828266, + "ewc_loss": 0.05044564604759216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5222823751391843e-05, + "grad_norm": 31.483190536499023, + "learning_rate": 1e-06, + "loss": 0.4205, + 
"mean_token_accuracy": 0.8727844953536987, + "num_tokens": 813545942.0, + "step": 21321 + }, + { + "epoch": 2.7123775601068565, + "ewc_loss": 0.05044881999492645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5224409910151735e-05, + "grad_norm": 31.59821128845215, + "learning_rate": 1e-06, + "loss": 0.4317, + "mean_token_accuracy": 0.8680735230445862, + "num_tokens": 813582202.0, + "step": 21322 + }, + { + "epoch": 2.712504770385447, + "ewc_loss": 0.0505380779504776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5269038815167733e-05, + "grad_norm": 31.614789962768555, + "learning_rate": 1e-06, + "loss": 0.4622, + "mean_token_accuracy": 0.8570448756217957, + "num_tokens": 813621711.0, + "step": 21323 + }, + { + "epoch": 2.7126319806640375, + "ewc_loss": 0.05034993961453438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517496977816336e-05, + "grad_norm": 31.632287979125977, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8677140474319458, + "num_tokens": 813665404.0, + "step": 21324 + }, + { + "epoch": 2.712759190942628, + "ewc_loss": 0.05051138624548912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5255692889913917e-05, + "grad_norm": 31.689861297607422, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.8791900873184204, + "num_tokens": 813697470.0, + "step": 21325 + }, + { + "epoch": 2.7128864012212186, + "ewc_loss": 0.05041167512536049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5205838028341532e-05, + "grad_norm": 31.60262107849121, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8797754645347595, + "num_tokens": 813733308.0, + "step": 21326 + }, + { + "epoch": 2.713013611499809, + "ewc_loss": 0.050452232360839844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522611612221226e-05, + "grad_norm": 31.657155990600586, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8584415912628174, + "num_tokens": 813770881.0, + "step": 21327 + }, + { + "epoch": 
2.7131408217783997, + "ewc_loss": 0.050343479961156845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517173925298266e-05, + "grad_norm": 31.509830474853516, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8778926134109497, + "num_tokens": 813806495.0, + "step": 21328 + }, + { + "epoch": 2.71326803205699, + "ewc_loss": 0.05038594827055931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5192974135279655e-05, + "grad_norm": 31.643474578857422, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8780546188354492, + "num_tokens": 813842408.0, + "step": 21329 + }, + { + "epoch": 2.7133952423355807, + "ewc_loss": 0.05043159797787666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.521579881431535e-05, + "grad_norm": 31.57572364807129, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8698201775550842, + "num_tokens": 813883041.0, + "step": 21330 + }, + { + "epoch": 2.7135224526141712, + "ewc_loss": 0.050349388271570206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.517469329177402e-05, + "grad_norm": 31.49424171447754, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8690750598907471, + "num_tokens": 813922901.0, + "step": 21331 + }, + { + "epoch": 2.7136496628927618, + "ewc_loss": 0.05043281614780426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5216408175765537e-05, + "grad_norm": 31.68670654296875, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8756866455078125, + "num_tokens": 813961948.0, + "step": 21332 + }, + { + "epoch": 2.7137768731713523, + "ewc_loss": 0.05047044903039932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5235223802155815e-05, + "grad_norm": 31.609859466552734, + "learning_rate": 1e-06, + "loss": 0.3736, + "mean_token_accuracy": 0.8860067129135132, + "num_tokens": 813999194.0, + "step": 21333 + }, + { + "epoch": 2.713904083449943, + "ewc_loss": 0.05038924887776375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.519462395866867e-05, + "grad_norm": 31.58530616760254, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8616406917572021, + "num_tokens": 814038986.0, + "step": 21334 + }, + { + "epoch": 2.7140312937285334, + "ewc_loss": 0.05049477890133858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524738920328673e-05, + "grad_norm": 31.771852493286133, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8861758708953857, + "num_tokens": 814074351.0, + "step": 21335 + }, + { + "epoch": 2.714158504007124, + "ewc_loss": 0.050425462424755096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.521273199818097e-05, + "grad_norm": 31.397384643554688, + "learning_rate": 1e-06, + "loss": 0.4183, + "mean_token_accuracy": 0.8720697164535522, + "num_tokens": 814109255.0, + "step": 21336 + }, + { + "epoch": 2.7142857142857144, + "ewc_loss": 0.05043232440948486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5216162612196058e-05, + "grad_norm": 31.796707153320312, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8688704371452332, + "num_tokens": 814146601.0, + "step": 21337 + }, + { + "epoch": 2.714412924564305, + "ewc_loss": 0.050539854913949966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5269928300986066e-05, + "grad_norm": 31.44571304321289, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8779646754264832, + "num_tokens": 814181299.0, + "step": 21338 + }, + { + "epoch": 2.7145401348428955, + "ewc_loss": 0.050394125282764435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5197063223458827e-05, + "grad_norm": 31.550825119018555, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8802003264427185, + "num_tokens": 814218178.0, + "step": 21339 + }, + { + "epoch": 2.7146673451214856, + "ewc_loss": 0.05045400187373161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5227000151062384e-05, + "grad_norm": 31.471603393554688, + "learning_rate": 1e-06, + "loss": 0.401, + 
"mean_token_accuracy": 0.8747285604476929, + "num_tokens": 814254161.0, + "step": 21340 + }, + { + "epoch": 2.7147945554000765, + "ewc_loss": 0.05045841634273529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.522920840419829e-05, + "grad_norm": 31.601200103759766, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8742731809616089, + "num_tokens": 814290164.0, + "step": 21341 + }, + { + "epoch": 2.7149217656786666, + "ewc_loss": 0.05049799010157585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524899537093006e-05, + "grad_norm": 31.4490909576416, + "learning_rate": 1e-06, + "loss": 0.4809, + "mean_token_accuracy": 0.852294385433197, + "num_tokens": 814334459.0, + "step": 21342 + }, + { + "epoch": 2.7150489759572576, + "ewc_loss": 0.050433628261089325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5216813810402527e-05, + "grad_norm": 31.583436965942383, + "learning_rate": 1e-06, + "loss": 0.4389, + "mean_token_accuracy": 0.8644917607307434, + "num_tokens": 814374291.0, + "step": 21343 + }, + { + "epoch": 2.7151761862358477, + "ewc_loss": 0.05058027803897858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5290139092248864e-05, + "grad_norm": 31.50770378112793, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.867425799369812, + "num_tokens": 814413196.0, + "step": 21344 + }, + { + "epoch": 2.7153033965144386, + "ewc_loss": 0.05051643028855324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5258215828216635e-05, + "grad_norm": 31.494096755981445, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8712589740753174, + "num_tokens": 814455312.0, + "step": 21345 + }, + { + "epoch": 2.7154306067930287, + "ewc_loss": 0.05054468661546707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5272343918913975e-05, + "grad_norm": 31.514223098754883, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8680307865142822, + "num_tokens": 814496586.0, + "step": 21346 + }, + { + "epoch": 
2.7155578170716193, + "ewc_loss": 0.05051111429929733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.525555646570865e-05, + "grad_norm": 31.4891357421875, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8782916069030762, + "num_tokens": 814531566.0, + "step": 21347 + }, + { + "epoch": 2.71568502735021, + "ewc_loss": 0.05064238980412483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5321194698335603e-05, + "grad_norm": 31.610889434814453, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8794270753860474, + "num_tokens": 814566168.0, + "step": 21348 + }, + { + "epoch": 2.7158122376288003, + "ewc_loss": 0.05054163932800293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5270819605793804e-05, + "grad_norm": 31.457529067993164, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8727189302444458, + "num_tokens": 814604927.0, + "step": 21349 + }, + { + "epoch": 2.715939447907391, + "ewc_loss": 0.0504753552377224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.52376776188612e-05, + "grad_norm": 31.593461990356445, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8877712488174438, + "num_tokens": 814641401.0, + "step": 21350 + }, + { + "epoch": 2.7160666581859814, + "ewc_loss": 0.0505666583776474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.528332879592199e-05, + "grad_norm": 31.56235694885254, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8784160614013672, + "num_tokens": 814674252.0, + "step": 21351 + }, + { + "epoch": 2.716193868464572, + "ewc_loss": 0.05049920827150345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524960473238025e-05, + "grad_norm": 31.49300765991211, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8716027140617371, + "num_tokens": 814704965.0, + "step": 21352 + }, + { + "epoch": 2.7163210787431624, + "ewc_loss": 0.05060030519962311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5300152628915384e-05, + 
"grad_norm": 31.560165405273438, + "learning_rate": 1e-06, + "loss": 0.5007, + "mean_token_accuracy": 0.8440059423446655, + "num_tokens": 814743740.0, + "step": 21353 + }, + { + "epoch": 2.716448289021753, + "ewc_loss": 0.05058087781071663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529043922550045e-05, + "grad_norm": 31.67850685119629, + "learning_rate": 1e-06, + "loss": 0.4012, + "mean_token_accuracy": 0.8801853656768799, + "num_tokens": 814784598.0, + "step": 21354 + }, + { + "epoch": 2.7165754993003435, + "ewc_loss": 0.05058526620268822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5292632926721126e-05, + "grad_norm": 31.496171951293945, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8661892414093018, + "num_tokens": 814821437.0, + "step": 21355 + }, + { + "epoch": 2.716702709578934, + "ewc_loss": 0.050428446382284164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5214223569491878e-05, + "grad_norm": 31.563566207885742, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8729555606842041, + "num_tokens": 814861638.0, + "step": 21356 + }, + { + "epoch": 2.7168299198575245, + "ewc_loss": 0.05060884729027748, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.530442361603491e-05, + "grad_norm": 31.65176773071289, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8746646642684937, + "num_tokens": 814893799.0, + "step": 21357 + }, + { + "epoch": 2.716957130136115, + "ewc_loss": 0.05053016543388367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.526508251321502e-05, + "grad_norm": 31.593564987182617, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8803870677947998, + "num_tokens": 814930726.0, + "step": 21358 + }, + { + "epoch": 2.7170843404147056, + "ewc_loss": 0.050488781183958054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5244391508749686e-05, + "grad_norm": 31.50276756286621, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 
0.8712352514266968, + "num_tokens": 814970041.0, + "step": 21359 + }, + { + "epoch": 2.717211550693296, + "ewc_loss": 0.05058464780449867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5292323698522523e-05, + "grad_norm": 31.677671432495117, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8770422339439392, + "num_tokens": 815008587.0, + "step": 21360 + }, + { + "epoch": 2.7173387609718866, + "ewc_loss": 0.050557009875774384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5278504836023785e-05, + "grad_norm": 31.520225524902344, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8599836826324463, + "num_tokens": 815047659.0, + "step": 21361 + }, + { + "epoch": 2.717465971250477, + "ewc_loss": 0.05049033463001251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5245168217225e-05, + "grad_norm": 31.656024932861328, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8731197118759155, + "num_tokens": 815082631.0, + "step": 21362 + }, + { + "epoch": 2.7175931815290677, + "ewc_loss": 0.05056242644786835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5281213311245665e-05, + "grad_norm": 31.627225875854492, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8915613293647766, + "num_tokens": 815117568.0, + "step": 21363 + }, + { + "epoch": 2.7177203918076582, + "ewc_loss": 0.05050139129161835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5250696126022376e-05, + "grad_norm": 31.635629653930664, + "learning_rate": 1e-06, + "loss": 0.4079, + "mean_token_accuracy": 0.8736419677734375, + "num_tokens": 815155019.0, + "step": 21364 + }, + { + "epoch": 2.7178476020862483, + "ewc_loss": 0.050581298768520355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529065022827126e-05, + "grad_norm": 31.56492805480957, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8737139701843262, + "num_tokens": 815195386.0, + "step": 21365 + }, + { + "epoch": 2.7179748123648393, + 
"ewc_loss": 0.05046413093805313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523206603655126e-05, + "grad_norm": 31.655925750732422, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8773949146270752, + "num_tokens": 815228304.0, + "step": 21366 + }, + { + "epoch": 2.7181020226434294, + "ewc_loss": 0.050519298762083054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.525964919186663e-05, + "grad_norm": 31.645549774169922, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8923210501670837, + "num_tokens": 815260111.0, + "step": 21367 + }, + { + "epoch": 2.7182292329220203, + "ewc_loss": 0.050389956682920456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5194978661602363e-05, + "grad_norm": 31.630704879760742, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8725712299346924, + "num_tokens": 815297272.0, + "step": 21368 + }, + { + "epoch": 2.7183564432006104, + "ewc_loss": 0.050522077828645706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5261038899770938e-05, + "grad_norm": 31.55095863342285, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8719128370285034, + "num_tokens": 815336016.0, + "step": 21369 + }, + { + "epoch": 2.718483653479201, + "ewc_loss": 0.05048057809472084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524028968764469e-05, + "grad_norm": 31.496301651000977, + "learning_rate": 1e-06, + "loss": 0.4936, + "mean_token_accuracy": 0.846838116645813, + "num_tokens": 815373940.0, + "step": 21370 + }, + { + "epoch": 2.7186108637577915, + "ewc_loss": 0.05053367838263512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5266839656978846e-05, + "grad_norm": 31.689380645751953, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8629021644592285, + "num_tokens": 815410836.0, + "step": 21371 + }, + { + "epoch": 2.718738074036382, + "ewc_loss": 0.05050567910075188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5252838895539753e-05, + 
"grad_norm": 31.554710388183594, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8786311745643616, + "num_tokens": 815449577.0, + "step": 21372 + }, + { + "epoch": 2.7188652843149725, + "ewc_loss": 0.05053549259901047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.526774551370181e-05, + "grad_norm": 31.55767059326172, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8757568001747131, + "num_tokens": 815485869.0, + "step": 21373 + }, + { + "epoch": 2.718992494593563, + "ewc_loss": 0.05050762742757797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5253813873860054e-05, + "grad_norm": 31.65203094482422, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8546215295791626, + "num_tokens": 815523614.0, + "step": 21374 + }, + { + "epoch": 2.7191197048721536, + "ewc_loss": 0.050493206828832626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5246603399864398e-05, + "grad_norm": 31.4727840423584, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8518657088279724, + "num_tokens": 815561835.0, + "step": 21375 + }, + { + "epoch": 2.719246915150744, + "ewc_loss": 0.05051335692405701, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.525667878217064e-05, + "grad_norm": 31.507381439208984, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8574037551879883, + "num_tokens": 815605596.0, + "step": 21376 + }, + { + "epoch": 2.7193741254293347, + "ewc_loss": 0.05062752217054367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531376048864331e-05, + "grad_norm": 31.559171676635742, + "learning_rate": 1e-06, + "loss": 0.4324, + "mean_token_accuracy": 0.8671798706054688, + "num_tokens": 815638720.0, + "step": 21377 + }, + { + "epoch": 2.719501335707925, + "ewc_loss": 0.05055898055434227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5279490728280507e-05, + "grad_norm": 31.432979583740234, + "learning_rate": 1e-06, + "loss": 0.3685, + "mean_token_accuracy": 
0.8880574703216553, + "num_tokens": 815673996.0, + "step": 21378 + }, + { + "epoch": 2.7196285459865157, + "ewc_loss": 0.050634946674108505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5317473046015948e-05, + "grad_norm": 31.689760208129883, + "learning_rate": 1e-06, + "loss": 0.4574, + "mean_token_accuracy": 0.8602663278579712, + "num_tokens": 815716571.0, + "step": 21379 + }, + { + "epoch": 2.7197557562651062, + "ewc_loss": 0.050665177404880524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5332588847959414e-05, + "grad_norm": 31.44063949584961, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8794296979904175, + "num_tokens": 815748973.0, + "step": 21380 + }, + { + "epoch": 2.7198829665436968, + "ewc_loss": 0.050526440143585205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.526321986806579e-05, + "grad_norm": 31.591066360473633, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8759729862213135, + "num_tokens": 815794754.0, + "step": 21381 + }, + { + "epoch": 2.7200101768222873, + "ewc_loss": 0.0506606362760067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533031874918379e-05, + "grad_norm": 31.51968002319336, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8755147457122803, + "num_tokens": 815831051.0, + "step": 21382 + }, + { + "epoch": 2.720137387100878, + "ewc_loss": 0.05049271509051323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524635783629492e-05, + "grad_norm": 31.474374771118164, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8750176429748535, + "num_tokens": 815874291.0, + "step": 21383 + }, + { + "epoch": 2.7202645973794684, + "ewc_loss": 0.05061274394392967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5306371753686108e-05, + "grad_norm": 31.573413848876953, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8693957328796387, + "num_tokens": 815912484.0, + "step": 21384 + }, + { + "epoch": 2.720391807658059, + 
"ewc_loss": 0.05055210739374161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5276052838307805e-05, + "grad_norm": 31.530733108520508, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8572813272476196, + "num_tokens": 815948626.0, + "step": 21385 + }, + { + "epoch": 2.7205190179366494, + "ewc_loss": 0.05059178173542023, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5295890736742876e-05, + "grad_norm": 31.467390060424805, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8820982575416565, + "num_tokens": 815983154.0, + "step": 21386 + }, + { + "epoch": 2.72064622821524, + "ewc_loss": 0.05059746652841568, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529873381718062e-05, + "grad_norm": 31.522432327270508, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.875523567199707, + "num_tokens": 816017524.0, + "step": 21387 + }, + { + "epoch": 2.7207734384938305, + "ewc_loss": 0.05068664252758026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5343320885440335e-05, + "grad_norm": 31.52204704284668, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8686426877975464, + "num_tokens": 816055204.0, + "step": 21388 + }, + { + "epoch": 2.720900648772421, + "ewc_loss": 0.05067453160881996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533726546971593e-05, + "grad_norm": 31.58078384399414, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8806769251823425, + "num_tokens": 816086671.0, + "step": 21389 + }, + { + "epoch": 2.721027859051011, + "ewc_loss": 0.050655797123909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5327899493277073e-05, + "grad_norm": 31.57465934753418, + "learning_rate": 1e-06, + "loss": 0.4923, + "mean_token_accuracy": 0.852384090423584, + "num_tokens": 816114173.0, + "step": 21390 + }, + { + "epoch": 2.721155069329602, + "ewc_loss": 0.05062020197510719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5310100681963377e-05, + "grad_norm": 
31.634000778198242, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8848692178726196, + "num_tokens": 816156434.0, + "step": 21391 + }, + { + "epoch": 2.721282279608192, + "ewc_loss": 0.05071673542261124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5358367565786466e-05, + "grad_norm": 31.62495231628418, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.868806004524231, + "num_tokens": 816191037.0, + "step": 21392 + }, + { + "epoch": 2.721409489886783, + "ewc_loss": 0.050653159618377686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53265789069701e-05, + "grad_norm": 31.56804847717285, + "learning_rate": 1e-06, + "loss": 0.3752, + "mean_token_accuracy": 0.8863632678985596, + "num_tokens": 816227031.0, + "step": 21393 + }, + { + "epoch": 2.721536700165373, + "ewc_loss": 0.05060398206114769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.530199162720237e-05, + "grad_norm": 31.67380714416504, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8761776685714722, + "num_tokens": 816264308.0, + "step": 21394 + }, + { + "epoch": 2.7216639104439637, + "ewc_loss": 0.05070451274514198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5352255761390552e-05, + "grad_norm": 31.52829933166504, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8710429072380066, + "num_tokens": 816300955.0, + "step": 21395 + }, + { + "epoch": 2.7217911207225542, + "ewc_loss": 0.05065931752324104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5329658456030302e-05, + "grad_norm": 31.544771194458008, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8739712834358215, + "num_tokens": 816332881.0, + "step": 21396 + }, + { + "epoch": 2.7219183310011448, + "ewc_loss": 0.050691455602645874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5345727408421226e-05, + "grad_norm": 31.63958168029785, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8921594619750977, + 
"num_tokens": 816367446.0, + "step": 21397 + }, + { + "epoch": 2.7220455412797353, + "ewc_loss": 0.05070807412266731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5354036552016623e-05, + "grad_norm": 31.558820724487305, + "learning_rate": 1e-06, + "loss": 0.4643, + "mean_token_accuracy": 0.8565269112586975, + "num_tokens": 816407286.0, + "step": 21398 + }, + { + "epoch": 2.722172751558326, + "ewc_loss": 0.050656285136938095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5328143237857148e-05, + "grad_norm": 31.645774841308594, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8823832273483276, + "num_tokens": 816445277.0, + "step": 21399 + }, + { + "epoch": 2.7222999618369164, + "ewc_loss": 0.050744567066431046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5372282834723592e-05, + "grad_norm": 31.63140869140625, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8787956833839417, + "num_tokens": 816481638.0, + "step": 21400 + }, + { + "epoch": 2.722427172115507, + "ewc_loss": 0.05061888322234154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5309442207799293e-05, + "grad_norm": 31.555179595947266, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8741286993026733, + "num_tokens": 816521386.0, + "step": 21401 + }, + { + "epoch": 2.7225543823940974, + "ewc_loss": 0.05070427805185318, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5352139346068725e-05, + "grad_norm": 31.50971221923828, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8840972185134888, + "num_tokens": 816554760.0, + "step": 21402 + }, + { + "epoch": 2.722681592672688, + "ewc_loss": 0.050611138343811035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5305569579359144e-05, + "grad_norm": 31.5320987701416, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8818511366844177, + "num_tokens": 816595195.0, + "step": 21403 + }, + { + "epoch": 2.7228088029512785, + "ewc_loss": 
0.05067776143550873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533888073230628e-05, + "grad_norm": 31.530237197875977, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8746559619903564, + "num_tokens": 816633046.0, + "step": 21404 + }, + { + "epoch": 2.722936013229869, + "ewc_loss": 0.0506848506629467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534242594265379e-05, + "grad_norm": 31.521230697631836, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8870716094970703, + "num_tokens": 816675283.0, + "step": 21405 + }, + { + "epoch": 2.7230632235084595, + "ewc_loss": 0.05066186189651489, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5330931748612784e-05, + "grad_norm": 31.514371871948242, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8638924360275269, + "num_tokens": 816716499.0, + "step": 21406 + }, + { + "epoch": 2.72319043378705, + "ewc_loss": 0.05080472677946091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540236346249003e-05, + "grad_norm": 31.61311149597168, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8666843771934509, + "num_tokens": 816761825.0, + "step": 21407 + }, + { + "epoch": 2.7233176440656406, + "ewc_loss": 0.050693415105342865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5346707843709737e-05, + "grad_norm": 31.590576171875, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.879338264465332, + "num_tokens": 816792080.0, + "step": 21408 + }, + { + "epoch": 2.723444854344231, + "ewc_loss": 0.05070820823311806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5354103854624555e-05, + "grad_norm": 31.546720504760742, + "learning_rate": 1e-06, + "loss": 0.4483, + "mean_token_accuracy": 0.8637864589691162, + "num_tokens": 816832121.0, + "step": 21409 + }, + { + "epoch": 2.7235720646228216, + "ewc_loss": 0.05064228177070618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5321140128653497e-05, + "grad_norm": 
31.577714920043945, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8665082454681396, + "num_tokens": 816874032.0, + "step": 21410 + }, + { + "epoch": 2.723699274901412, + "ewc_loss": 0.05070161074399948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5350806026835926e-05, + "grad_norm": 31.625375747680664, + "learning_rate": 1e-06, + "loss": 0.4723, + "mean_token_accuracy": 0.8548246622085571, + "num_tokens": 816920471.0, + "step": 21411 + }, + { + "epoch": 2.7238264851800027, + "ewc_loss": 0.050699640065431595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5349820134579204e-05, + "grad_norm": 31.56123161315918, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8739755749702454, + "num_tokens": 816958042.0, + "step": 21412 + }, + { + "epoch": 2.723953695458593, + "ewc_loss": 0.05065016821026802, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5325083697680384e-05, + "grad_norm": 31.576200485229492, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.864115297794342, + "num_tokens": 816992158.0, + "step": 21413 + }, + { + "epoch": 2.7240809057371838, + "ewc_loss": 0.05063878744840622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5319393898826092e-05, + "grad_norm": 31.54932975769043, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8613204956054688, + "num_tokens": 817032552.0, + "step": 21414 + }, + { + "epoch": 2.724208116015774, + "ewc_loss": 0.05066796392202377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533398219384253e-05, + "grad_norm": 31.625041961669922, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.869429349899292, + "num_tokens": 817072240.0, + "step": 21415 + }, + { + "epoch": 2.724335326294365, + "ewc_loss": 0.050650179386138916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5325089154648595e-05, + "grad_norm": 31.565919876098633, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8636724948883057, + 
"num_tokens": 817104741.0, + "step": 21416 + }, + { + "epoch": 2.724462536572955, + "ewc_loss": 0.050620850175619125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5310424462077208e-05, + "grad_norm": 31.587480545043945, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8605605363845825, + "num_tokens": 817143876.0, + "step": 21417 + }, + { + "epoch": 2.724589746851546, + "ewc_loss": 0.05068221315741539, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534110717533622e-05, + "grad_norm": 31.47908592224121, + "learning_rate": 1e-06, + "loss": 0.3416, + "mean_token_accuracy": 0.8955722451210022, + "num_tokens": 817183647.0, + "step": 21418 + }, + { + "epoch": 2.724716957130136, + "ewc_loss": 0.05067434534430504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533717270125635e-05, + "grad_norm": 31.537818908691406, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8683170676231384, + "num_tokens": 817214520.0, + "step": 21419 + }, + { + "epoch": 2.7248441674087265, + "ewc_loss": 0.05066356435418129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5331783035653643e-05, + "grad_norm": 31.552804946899414, + "learning_rate": 1e-06, + "loss": 0.4449, + "mean_token_accuracy": 0.8637050986289978, + "num_tokens": 817256936.0, + "step": 21420 + }, + { + "epoch": 2.724971377687317, + "ewc_loss": 0.050731875002384186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5365938199684024e-05, + "grad_norm": 31.48439598083496, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8740773797035217, + "num_tokens": 817292840.0, + "step": 21421 + }, + { + "epoch": 2.7250985879659075, + "ewc_loss": 0.05067627504467964, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533813676564023e-05, + "grad_norm": 31.537927627563477, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 0.8681029677391052, + "num_tokens": 817332872.0, + "step": 21422 + }, + { + "epoch": 2.725225798244498, + "ewc_loss": 0.05070776492357254, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5353881937917322e-05, + "grad_norm": 31.484315872192383, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8804748058319092, + "num_tokens": 817367668.0, + "step": 21423 + }, + { + "epoch": 2.7253530085230886, + "ewc_loss": 0.05071128159761429, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.535564090067055e-05, + "grad_norm": 31.569141387939453, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8912346959114075, + "num_tokens": 817407726.0, + "step": 21424 + }, + { + "epoch": 2.725480218801679, + "ewc_loss": 0.05072053149342537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5360266590723768e-05, + "grad_norm": 31.566173553466797, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.8837265968322754, + "num_tokens": 817441003.0, + "step": 21425 + }, + { + "epoch": 2.7256074290802697, + "ewc_loss": 0.050707124173641205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5353561795782298e-05, + "grad_norm": 31.59260368347168, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8783408999443054, + "num_tokens": 817482587.0, + "step": 21426 + }, + { + "epoch": 2.72573463935886, + "ewc_loss": 0.05072033777832985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5360168365295976e-05, + "grad_norm": 31.538301467895508, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8764376640319824, + "num_tokens": 817519727.0, + "step": 21427 + }, + { + "epoch": 2.7258618496374507, + "ewc_loss": 0.050717007368803024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5358503989991732e-05, + "grad_norm": 31.656375885009766, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8690024614334106, + "num_tokens": 817553809.0, + "step": 21428 + }, + { + "epoch": 2.7259890599160412, + "ewc_loss": 0.05067441239953041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5337205443065614e-05, + "grad_norm": 31.583768844604492, + 
"learning_rate": 1e-06, + "loss": 0.4707, + "mean_token_accuracy": 0.8569006323814392, + "num_tokens": 817588893.0, + "step": 21429 + }, + { + "epoch": 2.7261162701946318, + "ewc_loss": 0.050742316991090775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53711586992722e-05, + "grad_norm": 31.50814437866211, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8790282011032104, + "num_tokens": 817627789.0, + "step": 21430 + }, + { + "epoch": 2.7262434804732223, + "ewc_loss": 0.05063844472169876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531922291382216e-05, + "grad_norm": 31.491676330566406, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8636267185211182, + "num_tokens": 817670048.0, + "step": 21431 + }, + { + "epoch": 2.726370690751813, + "ewc_loss": 0.05068626627326012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534313352953177e-05, + "grad_norm": 31.5697078704834, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8791325688362122, + "num_tokens": 817707108.0, + "step": 21432 + }, + { + "epoch": 2.7264979010304033, + "ewc_loss": 0.05076763778924942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538381886552088e-05, + "grad_norm": 31.671125411987305, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8706049919128418, + "num_tokens": 817748126.0, + "step": 21433 + }, + { + "epoch": 2.726625111308994, + "ewc_loss": 0.050751619040966034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5375809855177067e-05, + "grad_norm": 31.62759780883789, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8757575750350952, + "num_tokens": 817781578.0, + "step": 21434 + }, + { + "epoch": 2.7267523215875844, + "ewc_loss": 0.05064147710800171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5320738131995313e-05, + "grad_norm": 31.590539932250977, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8792537450790405, + "num_tokens": 817822143.0, + 
"step": 21435 + }, + { + "epoch": 2.726879531866175, + "ewc_loss": 0.05062718316912651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531359132262878e-05, + "grad_norm": 31.558774948120117, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8826836347579956, + "num_tokens": 817858456.0, + "step": 21436 + }, + { + "epoch": 2.7270067421447655, + "ewc_loss": 0.05067557096481323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5337785700685345e-05, + "grad_norm": 31.631685256958008, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8801661729812622, + "num_tokens": 817892863.0, + "step": 21437 + }, + { + "epoch": 2.7271339524233555, + "ewc_loss": 0.050625432282686234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5312716388725676e-05, + "grad_norm": 31.5749454498291, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8769819140434265, + "num_tokens": 817931120.0, + "step": 21438 + }, + { + "epoch": 2.7272611627019465, + "ewc_loss": 0.050628114491701126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531405698391609e-05, + "grad_norm": 31.731836318969727, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8763265013694763, + "num_tokens": 817963076.0, + "step": 21439 + }, + { + "epoch": 2.7273883729805366, + "ewc_loss": 0.05061206594109535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5306033421657048e-05, + "grad_norm": 31.407323837280273, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8743278980255127, + "num_tokens": 818003844.0, + "step": 21440 + }, + { + "epoch": 2.7275155832591276, + "ewc_loss": 0.05065330117940903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5326649847556837e-05, + "grad_norm": 31.75914192199707, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.8754782676696777, + "num_tokens": 818048533.0, + "step": 21441 + }, + { + "epoch": 2.7276427935377177, + "ewc_loss": 0.050686925649642944, + "ewc_loss_diag": 0.0, 
+ "ewc_loss_parallel": 2.534346276661381e-05, + "grad_norm": 31.528688430786133, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8695478439331055, + "num_tokens": 818083290.0, + "step": 21442 + }, + { + "epoch": 2.7277700038163086, + "ewc_loss": 0.05068487673997879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5342438675579615e-05, + "grad_norm": 31.762245178222656, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8816803693771362, + "num_tokens": 818120037.0, + "step": 21443 + }, + { + "epoch": 2.7278972140948987, + "ewc_loss": 0.050730716437101364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5365357942064293e-05, + "grad_norm": 31.553388595581055, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8734812140464783, + "num_tokens": 818164122.0, + "step": 21444 + }, + { + "epoch": 2.7280244243734892, + "ewc_loss": 0.05050282180309296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.525141098885797e-05, + "grad_norm": 31.561803817749023, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8754479885101318, + "num_tokens": 818198065.0, + "step": 21445 + }, + { + "epoch": 2.7281516346520798, + "ewc_loss": 0.05074255168437958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5371275114594027e-05, + "grad_norm": 31.655696868896484, + "learning_rate": 1e-06, + "loss": 0.4136, + "mean_token_accuracy": 0.8750584125518799, + "num_tokens": 818236843.0, + "step": 21446 + }, + { + "epoch": 2.7282788449306703, + "ewc_loss": 0.050659600645303726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.532980033720378e-05, + "grad_norm": 31.654682159423828, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.8744170069694519, + "num_tokens": 818271300.0, + "step": 21447 + }, + { + "epoch": 2.728406055209261, + "ewc_loss": 0.05065975338220596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5329876734758727e-05, + "grad_norm": 31.66241455078125, + "learning_rate": 1e-06, + 
"loss": 0.4604, + "mean_token_accuracy": 0.8585167527198792, + "num_tokens": 818311799.0, + "step": 21448 + }, + { + "epoch": 2.7285332654878514, + "ewc_loss": 0.0506819449365139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5340972570120357e-05, + "grad_norm": 31.9212589263916, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8800673484802246, + "num_tokens": 818346662.0, + "step": 21449 + }, + { + "epoch": 2.728660475766442, + "ewc_loss": 0.050580449402332306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529022458475083e-05, + "grad_norm": 31.653676986694336, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8774951696395874, + "num_tokens": 818384465.0, + "step": 21450 + }, + { + "epoch": 2.7287876860450324, + "ewc_loss": 0.05051366239786148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5256831577280536e-05, + "grad_norm": 31.508167266845703, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8708818554878235, + "num_tokens": 818425625.0, + "step": 21451 + }, + { + "epoch": 2.728914896323623, + "ewc_loss": 0.050570953637361526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5285477022407576e-05, + "grad_norm": 31.56175994873047, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8783762454986572, + "num_tokens": 818458544.0, + "step": 21452 + }, + { + "epoch": 2.7290421066022135, + "ewc_loss": 0.050626274198293686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5313136575277895e-05, + "grad_norm": 31.602325439453125, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8690636157989502, + "num_tokens": 818503507.0, + "step": 21453 + }, + { + "epoch": 2.729169316880804, + "ewc_loss": 0.05065848305821419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.532924190745689e-05, + "grad_norm": 31.669652938842773, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8795211911201477, + "num_tokens": 818545411.0, + "step": 21454 + }, + { 
+ "epoch": 2.7292965271593945, + "ewc_loss": 0.05070503428578377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5352517695864663e-05, + "grad_norm": 31.625957489013672, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.869102418422699, + "num_tokens": 818582100.0, + "step": 21455 + }, + { + "epoch": 2.729423737437985, + "ewc_loss": 0.05063231289386749, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531615609768778e-05, + "grad_norm": 31.561725616455078, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.870297908782959, + "num_tokens": 818615549.0, + "step": 21456 + }, + { + "epoch": 2.7295509477165756, + "ewc_loss": 0.05065826326608658, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5329130949103273e-05, + "grad_norm": 31.57425880432129, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8779464960098267, + "num_tokens": 818652476.0, + "step": 21457 + }, + { + "epoch": 2.729678157995166, + "ewc_loss": 0.050647515803575516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53237576544052e-05, + "grad_norm": 31.6871395111084, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8769015669822693, + "num_tokens": 818692435.0, + "step": 21458 + }, + { + "epoch": 2.7298053682737566, + "ewc_loss": 0.05063147470355034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5315737730124965e-05, + "grad_norm": 31.511249542236328, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.872596025466919, + "num_tokens": 818733206.0, + "step": 21459 + }, + { + "epoch": 2.729932578552347, + "ewc_loss": 0.050621096044778824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531054815335665e-05, + "grad_norm": 31.639894485473633, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.8766129016876221, + "num_tokens": 818771717.0, + "step": 21460 + }, + { + "epoch": 2.7300597888309377, + "ewc_loss": 0.05071995407342911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5359977371408604e-05, + "grad_norm": 31.730358123779297, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8728773593902588, + "num_tokens": 818806999.0, + "step": 21461 + }, + { + "epoch": 2.7301869991095282, + "ewc_loss": 0.05064791813492775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5323959562228993e-05, + "grad_norm": 31.526153564453125, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8783023953437805, + "num_tokens": 818846226.0, + "step": 21462 + }, + { + "epoch": 2.7303142093881183, + "ewc_loss": 0.0506327785551548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5316388928331435e-05, + "grad_norm": 31.70539093017578, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8792363405227661, + "num_tokens": 818885667.0, + "step": 21463 + }, + { + "epoch": 2.7304414196667093, + "ewc_loss": 0.050673048943281174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5336525141028687e-05, + "grad_norm": 31.543134689331055, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8791201114654541, + "num_tokens": 818924009.0, + "step": 21464 + }, + { + "epoch": 2.7305686299452994, + "ewc_loss": 0.0506318062543869, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5315903258160688e-05, + "grad_norm": 31.513513565063477, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8817222118377686, + "num_tokens": 818962614.0, + "step": 21465 + }, + { + "epoch": 2.7306958402238903, + "ewc_loss": 0.050705038011074066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5352519514854066e-05, + "grad_norm": 31.57405662536621, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8750936388969421, + "num_tokens": 819004486.0, + "step": 21466 + }, + { + "epoch": 2.7308230505024804, + "ewc_loss": 0.05065465718507767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.532732833060436e-05, + "grad_norm": 31.536720275878906, + "learning_rate": 1e-06, + "loss": 0.4295, + 
"mean_token_accuracy": 0.8665763139724731, + "num_tokens": 819048364.0, + "step": 21467 + }, + { + "epoch": 2.730950260781071, + "ewc_loss": 0.0507657416164875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5382871172041632e-05, + "grad_norm": 31.632444381713867, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8698253631591797, + "num_tokens": 819086250.0, + "step": 21468 + }, + { + "epoch": 2.7310774710596615, + "ewc_loss": 0.050677619874477386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533880979171954e-05, + "grad_norm": 31.539165496826172, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8832271099090576, + "num_tokens": 819126705.0, + "step": 21469 + }, + { + "epoch": 2.731204681338252, + "ewc_loss": 0.05059954896569252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5299774279119447e-05, + "grad_norm": 31.594736099243164, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8752971291542053, + "num_tokens": 819165967.0, + "step": 21470 + }, + { + "epoch": 2.7313318916168425, + "ewc_loss": 0.05072994902729988, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5364974135300145e-05, + "grad_norm": 31.502349853515625, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8702096939086914, + "num_tokens": 819208066.0, + "step": 21471 + }, + { + "epoch": 2.731459101895433, + "ewc_loss": 0.05065077915787697, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.532538928790018e-05, + "grad_norm": 31.62973403930664, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8704413175582886, + "num_tokens": 819246269.0, + "step": 21472 + }, + { + "epoch": 2.7315863121740236, + "ewc_loss": 0.05075765401124954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537882755859755e-05, + "grad_norm": 31.626066207885742, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8741651773452759, + "num_tokens": 819286027.0, + "step": 21473 + }, + { + "epoch": 
2.731713522452614, + "ewc_loss": 0.050621289759874344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5310644559795037e-05, + "grad_norm": 31.557279586791992, + "learning_rate": 1e-06, + "loss": 0.3921, + "mean_token_accuracy": 0.8821773529052734, + "num_tokens": 819319994.0, + "step": 21474 + }, + { + "epoch": 2.7318407327312046, + "ewc_loss": 0.05059582740068436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529791345295962e-05, + "grad_norm": 31.609025955200195, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8691508173942566, + "num_tokens": 819360113.0, + "step": 21475 + }, + { + "epoch": 2.731967943009795, + "ewc_loss": 0.050734035670757294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536701867938973e-05, + "grad_norm": 31.621238708496094, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8822075724601746, + "num_tokens": 819394575.0, + "step": 21476 + }, + { + "epoch": 2.7320951532883857, + "ewc_loss": 0.05060982704162598, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5304912924184464e-05, + "grad_norm": 31.60317039489746, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8685292601585388, + "num_tokens": 819434545.0, + "step": 21477 + }, + { + "epoch": 2.7322223635669762, + "ewc_loss": 0.050620485097169876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5310242563136853e-05, + "grad_norm": 31.596155166625977, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8831778764724731, + "num_tokens": 819470699.0, + "step": 21478 + }, + { + "epoch": 2.7323495738455668, + "ewc_loss": 0.050585899502038956, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5292949430877343e-05, + "grad_norm": 31.53296661376953, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8671468496322632, + "num_tokens": 819514447.0, + "step": 21479 + }, + { + "epoch": 2.7324767841241573, + "ewc_loss": 0.050606969743967056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.530348501750268e-05, + "grad_norm": 31.6279354095459, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8674037456512451, + "num_tokens": 819553434.0, + "step": 21480 + }, + { + "epoch": 2.732603994402748, + "ewc_loss": 0.050650276243686676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.532513826736249e-05, + "grad_norm": 31.59103775024414, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.866694450378418, + "num_tokens": 819591594.0, + "step": 21481 + }, + { + "epoch": 2.7327312046813383, + "ewc_loss": 0.05069413781166077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534706800361164e-05, + "grad_norm": 31.762542724609375, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8703151941299438, + "num_tokens": 819634530.0, + "step": 21482 + }, + { + "epoch": 2.732858414959929, + "ewc_loss": 0.0507095530629158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5354776880703866e-05, + "grad_norm": 31.706571578979492, + "learning_rate": 1e-06, + "loss": 0.4419, + "mean_token_accuracy": 0.8641989231109619, + "num_tokens": 819669358.0, + "step": 21483 + }, + { + "epoch": 2.7329856252385194, + "ewc_loss": 0.05048074200749397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.524037154216785e-05, + "grad_norm": 31.589780807495117, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8724580407142639, + "num_tokens": 819711336.0, + "step": 21484 + }, + { + "epoch": 2.73311283551711, + "ewc_loss": 0.050605908036231995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5302953872596845e-05, + "grad_norm": 31.581811904907227, + "learning_rate": 1e-06, + "loss": 0.4392, + "mean_token_accuracy": 0.8647295832633972, + "num_tokens": 819746973.0, + "step": 21485 + }, + { + "epoch": 2.7332400457957005, + "ewc_loss": 0.05059628188610077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5298140826635063e-05, + "grad_norm": 31.581222534179688, + "learning_rate": 1e-06, + "loss": 0.3829, + 
"mean_token_accuracy": 0.8836877346038818, + "num_tokens": 819780305.0, + "step": 21486 + }, + { + "epoch": 2.733367256074291, + "ewc_loss": 0.050620052963495255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531002610339783e-05, + "grad_norm": 31.51751708984375, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8816906213760376, + "num_tokens": 819817326.0, + "step": 21487 + }, + { + "epoch": 2.733494466352881, + "ewc_loss": 0.050668537616729736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533426959416829e-05, + "grad_norm": 31.635826110839844, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8671067953109741, + "num_tokens": 819857597.0, + "step": 21488 + }, + { + "epoch": 2.733621676631472, + "ewc_loss": 0.05063987895846367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5319939595647156e-05, + "grad_norm": 31.533899307250977, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8759418725967407, + "num_tokens": 819893471.0, + "step": 21489 + }, + { + "epoch": 2.733748886910062, + "ewc_loss": 0.050650663673877716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5325331080239266e-05, + "grad_norm": 31.669408798217773, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8567421436309814, + "num_tokens": 819934175.0, + "step": 21490 + }, + { + "epoch": 2.733876097188653, + "ewc_loss": 0.05071188881993294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5355944671900943e-05, + "grad_norm": 31.634017944335938, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8813126683235168, + "num_tokens": 819971451.0, + "step": 21491 + }, + { + "epoch": 2.734003307467243, + "ewc_loss": 0.050601303577423096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5300651032011956e-05, + "grad_norm": 31.579198837280273, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8686667680740356, + "num_tokens": 820011566.0, + "step": 21492 + }, + { + "epoch": 
2.7341305177458337, + "ewc_loss": 0.0507093146443367, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5354656827403232e-05, + "grad_norm": 31.57305335998535, + "learning_rate": 1e-06, + "loss": 0.4925, + "mean_token_accuracy": 0.8486183285713196, + "num_tokens": 820047944.0, + "step": 21493 + }, + { + "epoch": 2.7342577280244242, + "ewc_loss": 0.05063430964946747, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5317154722870328e-05, + "grad_norm": 31.716835021972656, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8606280088424683, + "num_tokens": 820083416.0, + "step": 21494 + }, + { + "epoch": 2.7343849383030148, + "ewc_loss": 0.05069934204220772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5349670977448113e-05, + "grad_norm": 31.609962463378906, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8649328947067261, + "num_tokens": 820122476.0, + "step": 21495 + }, + { + "epoch": 2.7345121485816053, + "ewc_loss": 0.050626643002033234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5313322112197056e-05, + "grad_norm": 31.572998046875, + "learning_rate": 1e-06, + "loss": 0.3984, + "mean_token_accuracy": 0.8790584802627563, + "num_tokens": 820159866.0, + "step": 21496 + }, + { + "epoch": 2.734639358860196, + "ewc_loss": 0.05065149441361427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5325747628812678e-05, + "grad_norm": 31.658571243286133, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8739144206047058, + "num_tokens": 820202710.0, + "step": 21497 + }, + { + "epoch": 2.7347665691387864, + "ewc_loss": 0.050656046718358994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5328023184556514e-05, + "grad_norm": 31.56684684753418, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8720296621322632, + "num_tokens": 820239156.0, + "step": 21498 + }, + { + "epoch": 2.734893779417377, + "ewc_loss": 0.050693199038505554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5346598704345524e-05, + "grad_norm": 31.47873878479004, + "learning_rate": 1e-06, + "loss": 0.3517, + "mean_token_accuracy": 0.8902102112770081, + "num_tokens": 820274010.0, + "step": 21499 + }, + { + "epoch": 2.7350209896959674, + "ewc_loss": 0.05076984316110611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538492117309943e-05, + "grad_norm": 31.597089767456055, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8716184496879578, + "num_tokens": 820319171.0, + "step": 21500 + }, + { + "epoch": 2.735148199974558, + "ewc_loss": 0.05083584040403366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5417919459869154e-05, + "grad_norm": 31.544052124023438, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8586658239364624, + "num_tokens": 820355038.0, + "step": 21501 + }, + { + "epoch": 2.7352754102531485, + "ewc_loss": 0.05072437599301338, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5362187443533912e-05, + "grad_norm": 31.52457046508789, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.8691893219947815, + "num_tokens": 820390973.0, + "step": 21502 + }, + { + "epoch": 2.735402620531739, + "ewc_loss": 0.0507347472012043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536737338232342e-05, + "grad_norm": 31.56941795349121, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.867261528968811, + "num_tokens": 820424543.0, + "step": 21503 + }, + { + "epoch": 2.7355298308103295, + "ewc_loss": 0.05084316059947014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.542158108553849e-05, + "grad_norm": 31.59222984313965, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8720059394836426, + "num_tokens": 820462702.0, + "step": 21504 + }, + { + "epoch": 2.73565704108892, + "ewc_loss": 0.050863154232501984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5431576432310976e-05, + "grad_norm": 31.624181747436523, + "learning_rate": 1e-06, + "loss": 0.3883, + 
"mean_token_accuracy": 0.8828781843185425, + "num_tokens": 820498840.0, + "step": 21505 + }, + { + "epoch": 2.7357842513675106, + "ewc_loss": 0.05085678771138191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.542839320085477e-05, + "grad_norm": 31.64626693725586, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8659160733222961, + "num_tokens": 820534083.0, + "step": 21506 + }, + { + "epoch": 2.735911461646101, + "ewc_loss": 0.05084347352385521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5421737518627197e-05, + "grad_norm": 31.656063079833984, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.875601053237915, + "num_tokens": 820572479.0, + "step": 21507 + }, + { + "epoch": 2.7360386719246916, + "ewc_loss": 0.05075393244624138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5376966732437722e-05, + "grad_norm": 31.535369873046875, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8614380359649658, + "num_tokens": 820610283.0, + "step": 21508 + }, + { + "epoch": 2.736165882203282, + "ewc_loss": 0.05085403844714165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5427019863855094e-05, + "grad_norm": 31.699848175048828, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8896859884262085, + "num_tokens": 820647580.0, + "step": 21509 + }, + { + "epoch": 2.7362930924818727, + "ewc_loss": 0.0507805198431015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5390259906998836e-05, + "grad_norm": 31.58696746826172, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8785727620124817, + "num_tokens": 820686774.0, + "step": 21510 + }, + { + "epoch": 2.7364203027604628, + "ewc_loss": 0.050760261714458466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5380131773999892e-05, + "grad_norm": 31.67604637145996, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8824491500854492, + "num_tokens": 820724635.0, + "step": 21511 + }, + { + "epoch": 
2.7365475130390537, + "ewc_loss": 0.05073843151330948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5369216018589213e-05, + "grad_norm": 31.49074363708496, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8798192739486694, + "num_tokens": 820761467.0, + "step": 21512 + }, + { + "epoch": 2.736674723317644, + "ewc_loss": 0.05070691555738449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5353458113386296e-05, + "grad_norm": 31.71192741394043, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.859616756439209, + "num_tokens": 820801001.0, + "step": 21513 + }, + { + "epoch": 2.736801933596235, + "ewc_loss": 0.05081017687916756, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540508830861654e-05, + "grad_norm": 31.515968322753906, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8681074380874634, + "num_tokens": 820841028.0, + "step": 21514 + }, + { + "epoch": 2.736929143874825, + "ewc_loss": 0.05069933086633682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5349665520479903e-05, + "grad_norm": 31.55794906616211, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8806191682815552, + "num_tokens": 820881613.0, + "step": 21515 + }, + { + "epoch": 2.737056354153416, + "ewc_loss": 0.05083874613046646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5419372832402587e-05, + "grad_norm": 31.59475326538086, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8807268142700195, + "num_tokens": 820918205.0, + "step": 21516 + }, + { + "epoch": 2.737183564432006, + "ewc_loss": 0.050804540514945984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540227069403045e-05, + "grad_norm": 31.58145523071289, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8663668632507324, + "num_tokens": 820950751.0, + "step": 21517 + }, + { + "epoch": 2.7373107747105965, + "ewc_loss": 0.05075933039188385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.537966429372318e-05, + "grad_norm": 31.640493392944336, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8754717707633972, + "num_tokens": 820987385.0, + "step": 21518 + }, + { + "epoch": 2.737437984989187, + "ewc_loss": 0.050795771181583405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53978851105785e-05, + "grad_norm": 31.66252326965332, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8659094572067261, + "num_tokens": 821026273.0, + "step": 21519 + }, + { + "epoch": 2.7375651952677775, + "ewc_loss": 0.05079979822039604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5399898731848225e-05, + "grad_norm": 31.670801162719727, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8646398782730103, + "num_tokens": 821067978.0, + "step": 21520 + }, + { + "epoch": 2.737692405546368, + "ewc_loss": 0.05069654807448387, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534827399358619e-05, + "grad_norm": 31.627790451049805, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8784275054931641, + "num_tokens": 821111925.0, + "step": 21521 + }, + { + "epoch": 2.7378196158249586, + "ewc_loss": 0.050738804042339325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5369401555508375e-05, + "grad_norm": 31.63328742980957, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8771495819091797, + "num_tokens": 821151019.0, + "step": 21522 + }, + { + "epoch": 2.737946826103549, + "ewc_loss": 0.05074990168213844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5374951292178594e-05, + "grad_norm": 31.615894317626953, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8798655271530151, + "num_tokens": 821185520.0, + "step": 21523 + }, + { + "epoch": 2.7380740363821396, + "ewc_loss": 0.05078199878334999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539100023568608e-05, + "grad_norm": 31.571794509887695, + "learning_rate": 1e-06, + "loss": 0.4191, + 
"mean_token_accuracy": 0.8744109869003296, + "num_tokens": 821224273.0, + "step": 21524 + }, + { + "epoch": 2.73820124666073, + "ewc_loss": 0.05082269012928009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541134563216474e-05, + "grad_norm": 31.706857681274414, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8794103860855103, + "num_tokens": 821263541.0, + "step": 21525 + }, + { + "epoch": 2.7383284569393207, + "ewc_loss": 0.05076754838228226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538377339078579e-05, + "grad_norm": 31.539508819580078, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8613603115081787, + "num_tokens": 821304053.0, + "step": 21526 + }, + { + "epoch": 2.7384556672179112, + "ewc_loss": 0.0506337471306324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5316872779512778e-05, + "grad_norm": 31.584379196166992, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8736362457275391, + "num_tokens": 821339710.0, + "step": 21527 + }, + { + "epoch": 2.7385828774965018, + "ewc_loss": 0.05082061141729355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541030517022591e-05, + "grad_norm": 31.668855667114258, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8731061220169067, + "num_tokens": 821378377.0, + "step": 21528 + }, + { + "epoch": 2.7387100877750923, + "ewc_loss": 0.05071786046028137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.535892963351216e-05, + "grad_norm": 31.735736846923828, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8822892904281616, + "num_tokens": 821417434.0, + "step": 21529 + }, + { + "epoch": 2.738837298053683, + "ewc_loss": 0.050738655030727386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536932697694283e-05, + "grad_norm": 31.546125411987305, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8693318367004395, + "num_tokens": 821464323.0, + "step": 21530 + }, + { + "epoch": 
2.7389645083322733, + "ewc_loss": 0.05061710998415947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5308554540970363e-05, + "grad_norm": 31.665077209472656, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8702514171600342, + "num_tokens": 821509891.0, + "step": 21531 + }, + { + "epoch": 2.739091718610864, + "ewc_loss": 0.05072132498025894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536066313041374e-05, + "grad_norm": 31.721942901611328, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8624503016471863, + "num_tokens": 821557309.0, + "step": 21532 + }, + { + "epoch": 2.7392189288894544, + "ewc_loss": 0.05057835951447487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5289180484833196e-05, + "grad_norm": 31.580686569213867, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8669860363006592, + "num_tokens": 821597751.0, + "step": 21533 + }, + { + "epoch": 2.739346139168045, + "ewc_loss": 0.050648584961891174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.532429243728984e-05, + "grad_norm": 31.696556091308594, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8854019045829773, + "num_tokens": 821637961.0, + "step": 21534 + }, + { + "epoch": 2.7394733494466355, + "ewc_loss": 0.0505935400724411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529676930862479e-05, + "grad_norm": 31.499263763427734, + "learning_rate": 1e-06, + "loss": 0.4665, + "mean_token_accuracy": 0.8557988405227661, + "num_tokens": 821684728.0, + "step": 21535 + }, + { + "epoch": 2.7396005597252255, + "ewc_loss": 0.050611820071935654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5305909730377607e-05, + "grad_norm": 31.651525497436523, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.8786276578903198, + "num_tokens": 821715359.0, + "step": 21536 + }, + { + "epoch": 2.7397277700038165, + "ewc_loss": 0.05076778307557106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.538389162509702e-05, + "grad_norm": 31.5069580078125, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8667025566101074, + "num_tokens": 821755540.0, + "step": 21537 + }, + { + "epoch": 2.7398549802824066, + "ewc_loss": 0.05063018202781677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5315090169897303e-05, + "grad_norm": 31.64203643798828, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.873874306678772, + "num_tokens": 821788506.0, + "step": 21538 + }, + { + "epoch": 2.7399821905609976, + "ewc_loss": 0.05078897252678871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5394485419383273e-05, + "grad_norm": 31.660200119018555, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8715746998786926, + "num_tokens": 821826555.0, + "step": 21539 + }, + { + "epoch": 2.7401094008395877, + "ewc_loss": 0.05064966529607773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5324832677142695e-05, + "grad_norm": 31.632600784301758, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8637469410896301, + "num_tokens": 821866160.0, + "step": 21540 + }, + { + "epoch": 2.7402366111181786, + "ewc_loss": 0.05066424608230591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5332123186672106e-05, + "grad_norm": 31.657649993896484, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8782219886779785, + "num_tokens": 821905595.0, + "step": 21541 + }, + { + "epoch": 2.7403638213967687, + "ewc_loss": 0.050611406564712524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5305704184575006e-05, + "grad_norm": 31.561336517333984, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8627089262008667, + "num_tokens": 821944769.0, + "step": 21542 + }, + { + "epoch": 2.7404910316753592, + "ewc_loss": 0.05064219981431961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.532110011088662e-05, + "grad_norm": 31.692703247070312, + "learning_rate": 1e-06, + "loss": 0.3956, + 
"mean_token_accuracy": 0.8808672428131104, + "num_tokens": 821988889.0, + "step": 21543 + }, + { + "epoch": 2.7406182419539498, + "ewc_loss": 0.050693266093730927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534663326514419e-05, + "grad_norm": 31.557098388671875, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8758036494255066, + "num_tokens": 822025343.0, + "step": 21544 + }, + { + "epoch": 2.7407454522325403, + "ewc_loss": 0.05055913329124451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5279567125835456e-05, + "grad_norm": 31.608428955078125, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8619512915611267, + "num_tokens": 822067651.0, + "step": 21545 + }, + { + "epoch": 2.740872662511131, + "ewc_loss": 0.050757069140672684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537853470130358e-05, + "grad_norm": 31.61322593688965, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.862899899482727, + "num_tokens": 822102885.0, + "step": 21546 + }, + { + "epoch": 2.7409998727897213, + "ewc_loss": 0.05068811774253845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5344059395138174e-05, + "grad_norm": 31.56834602355957, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.864374041557312, + "num_tokens": 822142515.0, + "step": 21547 + }, + { + "epoch": 2.741127083068312, + "ewc_loss": 0.05073392018675804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5366960471728817e-05, + "grad_norm": 31.5799617767334, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.8693831562995911, + "num_tokens": 822180745.0, + "step": 21548 + }, + { + "epoch": 2.7412542933469024, + "ewc_loss": 0.05080122500658035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5400611775694415e-05, + "grad_norm": 31.680450439453125, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.880253255367279, + "num_tokens": 822215117.0, + "step": 21549 + }, + { + "epoch": 
2.741381503625493, + "ewc_loss": 0.05071850121021271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5359249775647186e-05, + "grad_norm": 31.575300216674805, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8624211549758911, + "num_tokens": 822258883.0, + "step": 21550 + }, + { + "epoch": 2.7415087139040835, + "ewc_loss": 0.05066126585006714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5330633434350602e-05, + "grad_norm": 31.672239303588867, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8698928356170654, + "num_tokens": 822295254.0, + "step": 21551 + }, + { + "epoch": 2.741635924182674, + "ewc_loss": 0.05074404180049896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537202090024948e-05, + "grad_norm": 31.663339614868164, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8761022686958313, + "num_tokens": 822335769.0, + "step": 21552 + }, + { + "epoch": 2.7417631344612645, + "ewc_loss": 0.050710342824459076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5355171601404436e-05, + "grad_norm": 31.543582916259766, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8816245198249817, + "num_tokens": 822376873.0, + "step": 21553 + }, + { + "epoch": 2.741890344739855, + "ewc_loss": 0.05065447837114334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5327239200123586e-05, + "grad_norm": 31.53985023498535, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8691892623901367, + "num_tokens": 822419868.0, + "step": 21554 + }, + { + "epoch": 2.7420175550184456, + "ewc_loss": 0.05080188810825348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5400944650755264e-05, + "grad_norm": 31.63604164123535, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.871451735496521, + "num_tokens": 822456757.0, + "step": 21555 + }, + { + "epoch": 2.742144765297036, + "ewc_loss": 0.050764378160238266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5382189051015303e-05, + "grad_norm": 31.52924919128418, + "learning_rate": 1e-06, + "loss": 0.3694, + "mean_token_accuracy": 0.8850581049919128, + "num_tokens": 822492772.0, + "step": 21556 + }, + { + "epoch": 2.7422719755756266, + "ewc_loss": 0.05079367011785507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5396835553692654e-05, + "grad_norm": 31.66466522216797, + "learning_rate": 1e-06, + "loss": 0.4691, + "mean_token_accuracy": 0.8566660284996033, + "num_tokens": 822535400.0, + "step": 21557 + }, + { + "epoch": 2.742399185854217, + "ewc_loss": 0.05075233429670334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5376166377100162e-05, + "grad_norm": 31.62350082397461, + "learning_rate": 1e-06, + "loss": 0.3939, + "mean_token_accuracy": 0.8782824277877808, + "num_tokens": 822568313.0, + "step": 21558 + }, + { + "epoch": 2.7425263961328077, + "ewc_loss": 0.05071783438324928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5358916900586337e-05, + "grad_norm": 31.710416793823242, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8788231015205383, + "num_tokens": 822608487.0, + "step": 21559 + }, + { + "epoch": 2.742653606411398, + "ewc_loss": 0.05077754333615303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5388771973666735e-05, + "grad_norm": 31.65899658203125, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8737648129463196, + "num_tokens": 822647938.0, + "step": 21560 + }, + { + "epoch": 2.7427808166899883, + "ewc_loss": 0.05061130225658417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5305651433882304e-05, + "grad_norm": 31.450416564941406, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8683987259864807, + "num_tokens": 822685974.0, + "step": 21561 + }, + { + "epoch": 2.7429080269685793, + "ewc_loss": 0.05074567347764969, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537283580750227e-05, + "grad_norm": 31.756454467773438, + "learning_rate": 1e-06, + "loss": 0.4076, + 
"mean_token_accuracy": 0.8754515647888184, + "num_tokens": 822719166.0, + "step": 21562 + }, + { + "epoch": 2.7430352372471694, + "ewc_loss": 0.05084032192826271, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5420160454814322e-05, + "grad_norm": 31.741735458374023, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.862040638923645, + "num_tokens": 822758364.0, + "step": 21563 + }, + { + "epoch": 2.7431624475257603, + "ewc_loss": 0.050618305802345276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.530915298848413e-05, + "grad_norm": 31.559499740600586, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.880695104598999, + "num_tokens": 822798002.0, + "step": 21564 + }, + { + "epoch": 2.7432896578043504, + "ewc_loss": 0.05068213865160942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5341068976558745e-05, + "grad_norm": 31.668067932128906, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.873090386390686, + "num_tokens": 822837958.0, + "step": 21565 + }, + { + "epoch": 2.743416868082941, + "ewc_loss": 0.05068736895918846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5343684683321044e-05, + "grad_norm": 31.64569091796875, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8731024265289307, + "num_tokens": 822873076.0, + "step": 21566 + }, + { + "epoch": 2.7435440783615315, + "ewc_loss": 0.0506601557135582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533007864258252e-05, + "grad_norm": 31.639772415161133, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8707202076911926, + "num_tokens": 822904693.0, + "step": 21567 + }, + { + "epoch": 2.743671288640122, + "ewc_loss": 0.05080760270357132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540380046411883e-05, + "grad_norm": 31.707717895507812, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8678818345069885, + "num_tokens": 822941192.0, + "step": 21568 + }, + { + "epoch": 
2.7437984989187125, + "ewc_loss": 0.05072328820824623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5361643565702252e-05, + "grad_norm": 31.603059768676758, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8602486252784729, + "num_tokens": 822982166.0, + "step": 21569 + }, + { + "epoch": 2.743925709197303, + "ewc_loss": 0.050638213753700256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5319106498500332e-05, + "grad_norm": 31.688007354736328, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8662441968917847, + "num_tokens": 823021630.0, + "step": 21570 + }, + { + "epoch": 2.7440529194758936, + "ewc_loss": 0.05084377899765968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5421890313737094e-05, + "grad_norm": 31.5909481048584, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.872059166431427, + "num_tokens": 823062133.0, + "step": 21571 + }, + { + "epoch": 2.744180129754484, + "ewc_loss": 0.05069906637072563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5349532734253444e-05, + "grad_norm": 31.644819259643555, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8688328862190247, + "num_tokens": 823105163.0, + "step": 21572 + }, + { + "epoch": 2.7443073400330746, + "ewc_loss": 0.05075746774673462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537873479013797e-05, + "grad_norm": 31.551820755004883, + "learning_rate": 1e-06, + "loss": 0.4822, + "mean_token_accuracy": 0.85246342420578, + "num_tokens": 823146878.0, + "step": 21573 + }, + { + "epoch": 2.744434550311665, + "ewc_loss": 0.05072804167866707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5364020984852687e-05, + "grad_norm": 31.74519920349121, + "learning_rate": 1e-06, + "loss": 0.3634, + "mean_token_accuracy": 0.8893024921417236, + "num_tokens": 823182216.0, + "step": 21574 + }, + { + "epoch": 2.7445617605902557, + "ewc_loss": 0.05078170821070671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5390854716533795e-05, + "grad_norm": 31.457792282104492, + "learning_rate": 1e-06, + "loss": 0.399, + "mean_token_accuracy": 0.8797993659973145, + "num_tokens": 823227838.0, + "step": 21575 + }, + { + "epoch": 2.7446889708688462, + "ewc_loss": 0.05079340934753418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53967045864556e-05, + "grad_norm": 31.79050636291504, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8649632930755615, + "num_tokens": 823268609.0, + "step": 21576 + }, + { + "epoch": 2.7448161811474368, + "ewc_loss": 0.05091250687837601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.545625284255948e-05, + "grad_norm": 31.648080825805664, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8805311918258667, + "num_tokens": 823309426.0, + "step": 21577 + }, + { + "epoch": 2.7449433914260273, + "ewc_loss": 0.05068638175725937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5343191737192683e-05, + "grad_norm": 31.686878204345703, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8668561577796936, + "num_tokens": 823346049.0, + "step": 21578 + }, + { + "epoch": 2.745070601704618, + "ewc_loss": 0.05082642659544945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541321373428218e-05, + "grad_norm": 31.647174835205078, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8667780160903931, + "num_tokens": 823378915.0, + "step": 21579 + }, + { + "epoch": 2.7451978119832083, + "ewc_loss": 0.05082835629582405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541417779866606e-05, + "grad_norm": 31.718647003173828, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8620768785476685, + "num_tokens": 823422985.0, + "step": 21580 + }, + { + "epoch": 2.745325022261799, + "ewc_loss": 0.050711873918771744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.535593739594333e-05, + "grad_norm": 31.593341827392578, + "learning_rate": 1e-06, + "loss": 0.463, + 
"mean_token_accuracy": 0.8590124845504761, + "num_tokens": 823460703.0, + "step": 21581 + }, + { + "epoch": 2.7454522325403894, + "ewc_loss": 0.05074498802423477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5372493837494403e-05, + "grad_norm": 31.706016540527344, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8748120069503784, + "num_tokens": 823499651.0, + "step": 21582 + }, + { + "epoch": 2.74557944281898, + "ewc_loss": 0.050761811435222626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53809048444964e-05, + "grad_norm": 31.564584732055664, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8711231350898743, + "num_tokens": 823537044.0, + "step": 21583 + }, + { + "epoch": 2.7457066530975704, + "ewc_loss": 0.050694048404693604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5347024347865954e-05, + "grad_norm": 31.580415725708008, + "learning_rate": 1e-06, + "loss": 0.3702, + "mean_token_accuracy": 0.8852119445800781, + "num_tokens": 823571832.0, + "step": 21584 + }, + { + "epoch": 2.745833863376161, + "ewc_loss": 0.05087748169898987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5438741431571543e-05, + "grad_norm": 31.689624786376953, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8788406848907471, + "num_tokens": 823609852.0, + "step": 21585 + }, + { + "epoch": 2.745961073654751, + "ewc_loss": 0.050764068961143494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5382034436916e-05, + "grad_norm": 31.644132614135742, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8756526708602905, + "num_tokens": 823644419.0, + "step": 21586 + }, + { + "epoch": 2.746088283933342, + "ewc_loss": 0.05079469829797745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5397348508704454e-05, + "grad_norm": 31.75296401977539, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8777816891670227, + "num_tokens": 823682451.0, + "step": 21587 + }, + { + "epoch": 
2.746215494211932, + "ewc_loss": 0.0507042296230793, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.535211569920648e-05, + "grad_norm": 31.581668853759766, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8828291296958923, + "num_tokens": 823723051.0, + "step": 21588 + }, + { + "epoch": 2.746342704490523, + "ewc_loss": 0.0507732629776001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538663102313876e-05, + "grad_norm": 31.77598762512207, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8684091567993164, + "num_tokens": 823761353.0, + "step": 21589 + }, + { + "epoch": 2.746469914769113, + "ewc_loss": 0.05080670118331909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5403351173736155e-05, + "grad_norm": 31.6523494720459, + "learning_rate": 1e-06, + "loss": 0.4787, + "mean_token_accuracy": 0.8583053350448608, + "num_tokens": 823800281.0, + "step": 21590 + }, + { + "epoch": 2.7465971250477037, + "ewc_loss": 0.050735361874103546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536768079153262e-05, + "grad_norm": 31.750667572021484, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8865439295768738, + "num_tokens": 823843255.0, + "step": 21591 + }, + { + "epoch": 2.7467243353262942, + "ewc_loss": 0.050777532160282135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5388766516698524e-05, + "grad_norm": 31.682140350341797, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8694945573806763, + "num_tokens": 823880736.0, + "step": 21592 + }, + { + "epoch": 2.7468515456048848, + "ewc_loss": 0.050704240798950195, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.535212115617469e-05, + "grad_norm": 31.738454818725586, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8705409169197083, + "num_tokens": 823923232.0, + "step": 21593 + }, + { + "epoch": 2.7469787558834753, + "ewc_loss": 0.05073148012161255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5365739929839037e-05, + "grad_norm": 31.720861434936523, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8752511143684387, + "num_tokens": 823957488.0, + "step": 21594 + }, + { + "epoch": 2.747105966162066, + "ewc_loss": 0.05072988569736481, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5364943212480284e-05, + "grad_norm": 31.787912368774414, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8679929971694946, + "num_tokens": 823999568.0, + "step": 21595 + }, + { + "epoch": 2.7472331764406563, + "ewc_loss": 0.05070328339934349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5351640942972153e-05, + "grad_norm": 31.531911849975586, + "learning_rate": 1e-06, + "loss": 0.3828, + "mean_token_accuracy": 0.883533775806427, + "num_tokens": 824036947.0, + "step": 21596 + }, + { + "epoch": 2.747360386719247, + "ewc_loss": 0.050675928592681885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5337963961646892e-05, + "grad_norm": 31.727670669555664, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8852289915084839, + "num_tokens": 824067966.0, + "step": 21597 + }, + { + "epoch": 2.7474875969978374, + "ewc_loss": 0.05069386959075928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.534693521738518e-05, + "grad_norm": 31.75153350830078, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8712300062179565, + "num_tokens": 824112216.0, + "step": 21598 + }, + { + "epoch": 2.747614807276428, + "ewc_loss": 0.05075949430465698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5379747967235744e-05, + "grad_norm": 31.697877883911133, + "learning_rate": 1e-06, + "loss": 0.374, + "mean_token_accuracy": 0.8831281661987305, + "num_tokens": 824148749.0, + "step": 21599 + }, + { + "epoch": 2.7477420175550185, + "ewc_loss": 0.05071099102497101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5355495381518267e-05, + "grad_norm": 31.906883239746094, + "learning_rate": 1e-06, + "loss": 0.4293, + 
"mean_token_accuracy": 0.870986819267273, + "num_tokens": 824186685.0, + "step": 21600 + }, + { + "epoch": 2.747869227833609, + "ewc_loss": 0.05071678385138512, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5358391212648712e-05, + "grad_norm": 31.82331085205078, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8665878772735596, + "num_tokens": 824223716.0, + "step": 21601 + }, + { + "epoch": 2.7479964381121995, + "ewc_loss": 0.0505266934633255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5263347197324038e-05, + "grad_norm": 31.5251407623291, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8711090683937073, + "num_tokens": 824264627.0, + "step": 21602 + }, + { + "epoch": 2.74812364839079, + "ewc_loss": 0.05057656019926071, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.528828008507844e-05, + "grad_norm": 31.81288719177246, + "learning_rate": 1e-06, + "loss": 0.4147, + "mean_token_accuracy": 0.8726086616516113, + "num_tokens": 824301901.0, + "step": 21603 + }, + { + "epoch": 2.7482508586693806, + "ewc_loss": 0.05081430450081825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5407152861589566e-05, + "grad_norm": 31.84904670715332, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8760876059532166, + "num_tokens": 824344621.0, + "step": 21604 + }, + { + "epoch": 2.748378068947971, + "ewc_loss": 0.05062050744891167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5310253477073275e-05, + "grad_norm": 31.60844612121582, + "learning_rate": 1e-06, + "loss": 0.3942, + "mean_token_accuracy": 0.8779770731925964, + "num_tokens": 824385850.0, + "step": 21605 + }, + { + "epoch": 2.7485052792265616, + "ewc_loss": 0.05057930201292038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5289651603088714e-05, + "grad_norm": 31.987226486206055, + "learning_rate": 1e-06, + "loss": 0.3319, + "mean_token_accuracy": 0.8984777331352234, + "num_tokens": 824425545.0, + "step": 21606 + }, + { + "epoch": 
2.748632489505152, + "ewc_loss": 0.05072988197207451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536494139349088e-05, + "grad_norm": 31.881214141845703, + "learning_rate": 1e-06, + "loss": 0.3662, + "mean_token_accuracy": 0.889731764793396, + "num_tokens": 824468611.0, + "step": 21607 + }, + { + "epoch": 2.7487596997837427, + "ewc_loss": 0.05052543431520462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.526271782699041e-05, + "grad_norm": 31.63500213623047, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8772492408752441, + "num_tokens": 824512161.0, + "step": 21608 + }, + { + "epoch": 2.7488869100623328, + "ewc_loss": 0.050508346408605576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5254174033761956e-05, + "grad_norm": 31.96672821044922, + "learning_rate": 1e-06, + "loss": 0.3784, + "mean_token_accuracy": 0.8844432830810547, + "num_tokens": 824555502.0, + "step": 21609 + }, + { + "epoch": 2.7490141203409237, + "ewc_loss": 0.050546564161777496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5273282517446205e-05, + "grad_norm": 31.681241989135742, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.8655922412872314, + "num_tokens": 824596277.0, + "step": 21610 + }, + { + "epoch": 2.749141330619514, + "ewc_loss": 0.05046457052230835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.523228613426909e-05, + "grad_norm": 31.67743682861328, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8606294989585876, + "num_tokens": 824633704.0, + "step": 21611 + }, + { + "epoch": 2.749268540898105, + "ewc_loss": 0.05067965388298035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5339826606796123e-05, + "grad_norm": 31.988880157470703, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.876873791217804, + "num_tokens": 824673584.0, + "step": 21612 + }, + { + "epoch": 2.749395751176695, + "ewc_loss": 0.050516050308942795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5258024834329262e-05, + "grad_norm": 31.58148956298828, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8670027256011963, + "num_tokens": 824712508.0, + "step": 21613 + }, + { + "epoch": 2.749522961455286, + "ewc_loss": 0.05057618021965027, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5288090910180472e-05, + "grad_norm": 31.764644622802734, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8841797709465027, + "num_tokens": 824754353.0, + "step": 21614 + }, + { + "epoch": 2.749650171733876, + "ewc_loss": 0.050539519637823105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5269759134971537e-05, + "grad_norm": 31.63867950439453, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8539972305297852, + "num_tokens": 824796333.0, + "step": 21615 + }, + { + "epoch": 2.7497773820124665, + "ewc_loss": 0.050553467124700546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.527673314034473e-05, + "grad_norm": 31.48744773864746, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8671619296073914, + "num_tokens": 824832048.0, + "step": 21616 + }, + { + "epoch": 2.749904592291057, + "ewc_loss": 0.050546206533908844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5273102437495254e-05, + "grad_norm": 31.586631774902344, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.8670939207077026, + "num_tokens": 824871889.0, + "step": 21617 + }, + { + "epoch": 2.7500318025696475, + "ewc_loss": 0.050672631710767746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533631595724728e-05, + "grad_norm": 31.503494262695312, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8836357593536377, + "num_tokens": 824912547.0, + "step": 21618 + }, + { + "epoch": 2.750159012848238, + "ewc_loss": 0.050582874566316605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5291437850682996e-05, + "grad_norm": 31.52705955505371, + "learning_rate": 1e-06, + "loss": 0.4821, + 
"mean_token_accuracy": 0.8538177609443665, + "num_tokens": 824955626.0, + "step": 21619 + }, + { + "epoch": 2.7502862231268286, + "ewc_loss": 0.05071110278367996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5355551770189777e-05, + "grad_norm": 31.621023178100586, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8812637329101562, + "num_tokens": 824991813.0, + "step": 21620 + }, + { + "epoch": 2.750413433405419, + "ewc_loss": 0.0506213903427124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5310695491498336e-05, + "grad_norm": 31.59152603149414, + "learning_rate": 1e-06, + "loss": 0.3822, + "mean_token_accuracy": 0.8810745477676392, + "num_tokens": 825024356.0, + "step": 21621 + }, + { + "epoch": 2.7505406436840096, + "ewc_loss": 0.050781745463609695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539087290642783e-05, + "grad_norm": 31.791717529296875, + "learning_rate": 1e-06, + "loss": 0.4229, + "mean_token_accuracy": 0.8697181940078735, + "num_tokens": 825060316.0, + "step": 21622 + }, + { + "epoch": 2.7506678539626, + "ewc_loss": 0.05067300796508789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5336503313155845e-05, + "grad_norm": 31.55814552307129, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8659166097640991, + "num_tokens": 825096044.0, + "step": 21623 + }, + { + "epoch": 2.7507950642411907, + "ewc_loss": 0.05063470080494881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531735117372591e-05, + "grad_norm": 31.65974235534668, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8717412352561951, + "num_tokens": 825123999.0, + "step": 21624 + }, + { + "epoch": 2.750922274519781, + "ewc_loss": 0.05077885836362839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5389428628841415e-05, + "grad_norm": 31.681621551513672, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8619848489761353, + "num_tokens": 825170155.0, + "step": 21625 + }, + { + "epoch": 
2.7510494847983717, + "ewc_loss": 0.05056948959827423, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5284743969677947e-05, + "grad_norm": 31.571407318115234, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8719761967658997, + "num_tokens": 825213388.0, + "step": 21626 + }, + { + "epoch": 2.7511766950769623, + "ewc_loss": 0.05063778907060623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5318893676740117e-05, + "grad_norm": 31.60561180114746, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8798336982727051, + "num_tokens": 825247171.0, + "step": 21627 + }, + { + "epoch": 2.751303905355553, + "ewc_loss": 0.05084123834967613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5420618840144016e-05, + "grad_norm": 31.63916015625, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8734856843948364, + "num_tokens": 825283271.0, + "step": 21628 + }, + { + "epoch": 2.7514311156341433, + "ewc_loss": 0.05078890919685364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5394454496563412e-05, + "grad_norm": 31.698171615600586, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8715680241584778, + "num_tokens": 825318976.0, + "step": 21629 + }, + { + "epoch": 2.751558325912734, + "ewc_loss": 0.050741713494062424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5370856747031212e-05, + "grad_norm": 31.75112533569336, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.872223973274231, + "num_tokens": 825358019.0, + "step": 21630 + }, + { + "epoch": 2.7516855361913244, + "ewc_loss": 0.05082475394010544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541237699915655e-05, + "grad_norm": 31.655241012573242, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8901025056838989, + "num_tokens": 825394025.0, + "step": 21631 + }, + { + "epoch": 2.751812746469915, + "ewc_loss": 0.05069965124130249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5349825591547415e-05, + "grad_norm": 31.596284866333008, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8832106590270996, + "num_tokens": 825433021.0, + "step": 21632 + }, + { + "epoch": 2.7519399567485054, + "ewc_loss": 0.050813328474760056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5406663553440012e-05, + "grad_norm": 31.67669105529785, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8733769655227661, + "num_tokens": 825477026.0, + "step": 21633 + }, + { + "epoch": 2.7520671670270955, + "ewc_loss": 0.050814855843782425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54074275289895e-05, + "grad_norm": 31.66935157775879, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8783423900604248, + "num_tokens": 825511054.0, + "step": 21634 + }, + { + "epoch": 2.7521943773056865, + "ewc_loss": 0.05081876739859581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5409382942598313e-05, + "grad_norm": 31.626235961914062, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8580137491226196, + "num_tokens": 825549654.0, + "step": 21635 + }, + { + "epoch": 2.7523215875842766, + "ewc_loss": 0.05081959441304207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540979767218232e-05, + "grad_norm": 31.719863891601562, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.875135064125061, + "num_tokens": 825587269.0, + "step": 21636 + }, + { + "epoch": 2.7524487978628676, + "ewc_loss": 0.05080416053533554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5402079700143076e-05, + "grad_norm": 31.6912899017334, + "learning_rate": 1e-06, + "loss": 0.3434, + "mean_token_accuracy": 0.8965919017791748, + "num_tokens": 825624864.0, + "step": 21637 + }, + { + "epoch": 2.7525760081414576, + "ewc_loss": 0.050724394619464874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536219653848093e-05, + "grad_norm": 31.51072120666504, + "learning_rate": 1e-06, + "loss": 0.4498, + 
"mean_token_accuracy": 0.8653907179832458, + "num_tokens": 825667025.0, + "step": 21638 + }, + { + "epoch": 2.7527032184200486, + "ewc_loss": 0.05089898779988289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5449493477935903e-05, + "grad_norm": 31.891111373901367, + "learning_rate": 1e-06, + "loss": 0.4628, + "mean_token_accuracy": 0.8569694757461548, + "num_tokens": 825709505.0, + "step": 21639 + }, + { + "epoch": 2.7528304286986387, + "ewc_loss": 0.05091702938079834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5458513846388087e-05, + "grad_norm": 31.717029571533203, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8753500580787659, + "num_tokens": 825747376.0, + "step": 21640 + }, + { + "epoch": 2.7529576389772292, + "ewc_loss": 0.05074607953429222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5373039534315467e-05, + "grad_norm": 31.739412307739258, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8792477250099182, + "num_tokens": 825784817.0, + "step": 21641 + }, + { + "epoch": 2.7530848492558198, + "ewc_loss": 0.05076506733894348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5382532840012573e-05, + "grad_norm": 31.58751106262207, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8700991868972778, + "num_tokens": 825815631.0, + "step": 21642 + }, + { + "epoch": 2.7532120595344103, + "ewc_loss": 0.05070643126964569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5353216187795624e-05, + "grad_norm": 31.7349796295166, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8728492259979248, + "num_tokens": 825854202.0, + "step": 21643 + }, + { + "epoch": 2.753339269813001, + "ewc_loss": 0.05090769752860069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.545384813856799e-05, + "grad_norm": 31.6508846282959, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8800106644630432, + "num_tokens": 825890636.0, + "step": 21644 + }, + { + "epoch": 
2.7534664800915913, + "ewc_loss": 0.05064452439546585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5322262445115484e-05, + "grad_norm": 31.535274505615234, + "learning_rate": 1e-06, + "loss": 0.4765, + "mean_token_accuracy": 0.8509842157363892, + "num_tokens": 825931054.0, + "step": 21645 + }, + { + "epoch": 2.753593690370182, + "ewc_loss": 0.050754960626363754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5377479687449522e-05, + "grad_norm": 31.707599639892578, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8683350086212158, + "num_tokens": 825972519.0, + "step": 21646 + }, + { + "epoch": 2.7537209006487724, + "ewc_loss": 0.05083119869232178, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5415600248379633e-05, + "grad_norm": 31.720043182373047, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8741267919540405, + "num_tokens": 826010010.0, + "step": 21647 + }, + { + "epoch": 2.753848110927363, + "ewc_loss": 0.05075164884328842, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5375824407092296e-05, + "grad_norm": 31.590177536010742, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.872711181640625, + "num_tokens": 826048348.0, + "step": 21648 + }, + { + "epoch": 2.7539753212059535, + "ewc_loss": 0.05073676258325577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5368381102452986e-05, + "grad_norm": 31.732742309570312, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8696653842926025, + "num_tokens": 826087807.0, + "step": 21649 + }, + { + "epoch": 2.754102531484544, + "ewc_loss": 0.05080842599272728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5404213374713436e-05, + "grad_norm": 31.68330955505371, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8683562874794006, + "num_tokens": 826124657.0, + "step": 21650 + }, + { + "epoch": 2.7542297417631345, + "ewc_loss": 0.05070628598332405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5353143428219482e-05, + "grad_norm": 31.589902877807617, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8759486675262451, + "num_tokens": 826164190.0, + "step": 21651 + }, + { + "epoch": 2.754356952041725, + "ewc_loss": 0.050780631601810455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5390316295670345e-05, + "grad_norm": 31.681053161621094, + "learning_rate": 1e-06, + "loss": 0.4117, + "mean_token_accuracy": 0.8739969730377197, + "num_tokens": 826202876.0, + "step": 21652 + }, + { + "epoch": 2.7544841623203156, + "ewc_loss": 0.050741441547870636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5370720322825946e-05, + "grad_norm": 31.48357391357422, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8714123964309692, + "num_tokens": 826242955.0, + "step": 21653 + }, + { + "epoch": 2.754611372598906, + "ewc_loss": 0.0508238710463047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5411934984731488e-05, + "grad_norm": 31.711332321166992, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8818279504776001, + "num_tokens": 826279925.0, + "step": 21654 + }, + { + "epoch": 2.7547385828774966, + "ewc_loss": 0.05081319808959961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5406599888810888e-05, + "grad_norm": 31.512165069580078, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8827919960021973, + "num_tokens": 826314076.0, + "step": 21655 + }, + { + "epoch": 2.754865793156087, + "ewc_loss": 0.05078607425093651, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539303750381805e-05, + "grad_norm": 31.64488983154297, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8694040775299072, + "num_tokens": 826354138.0, + "step": 21656 + }, + { + "epoch": 2.7549930034346777, + "ewc_loss": 0.050874967128038406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.543748269090429e-05, + "grad_norm": 31.601787567138672, + "learning_rate": 1e-06, + "loss": 0.4355, + 
"mean_token_accuracy": 0.8684327602386475, + "num_tokens": 826395393.0, + "step": 21657 + }, + { + "epoch": 2.755120213713268, + "ewc_loss": 0.050895947962999344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5447974621783942e-05, + "grad_norm": 31.828872680664062, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8623688220977783, + "num_tokens": 826437539.0, + "step": 21658 + }, + { + "epoch": 2.7552474239918583, + "ewc_loss": 0.05087974667549133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.543987284298055e-05, + "grad_norm": 31.709157943725586, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.866025984287262, + "num_tokens": 826475857.0, + "step": 21659 + }, + { + "epoch": 2.7553746342704493, + "ewc_loss": 0.05075884982943535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5379424187121913e-05, + "grad_norm": 31.785795211791992, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8819811344146729, + "num_tokens": 826513956.0, + "step": 21660 + }, + { + "epoch": 2.7555018445490393, + "ewc_loss": 0.05072557181119919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536278589104768e-05, + "grad_norm": 31.746538162231445, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8709306716918945, + "num_tokens": 826552747.0, + "step": 21661 + }, + { + "epoch": 2.7556290548276303, + "ewc_loss": 0.05070975795388222, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5354878744110465e-05, + "grad_norm": 31.69390869140625, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8796996474266052, + "num_tokens": 826590830.0, + "step": 21662 + }, + { + "epoch": 2.7557562651062204, + "ewc_loss": 0.05076395720243454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538197804824449e-05, + "grad_norm": 31.814790725708008, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8760100603103638, + "num_tokens": 826633414.0, + "step": 21663 + }, + { + "epoch": 
2.755883475384811, + "ewc_loss": 0.0507059320807457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5352965167257935e-05, + "grad_norm": 31.744110107421875, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8815237283706665, + "num_tokens": 826668344.0, + "step": 21664 + }, + { + "epoch": 2.7560106856634015, + "ewc_loss": 0.050591547042131424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529577432142105e-05, + "grad_norm": 31.734893798828125, + "learning_rate": 1e-06, + "loss": 0.4718, + "mean_token_accuracy": 0.8558319211006165, + "num_tokens": 826710587.0, + "step": 21665 + }, + { + "epoch": 2.756137895941992, + "ewc_loss": 0.05064498633146286, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5322493456769735e-05, + "grad_norm": 31.71040916442871, + "learning_rate": 1e-06, + "loss": 0.4403, + "mean_token_accuracy": 0.863084077835083, + "num_tokens": 826748854.0, + "step": 21666 + }, + { + "epoch": 2.7562651062205825, + "ewc_loss": 0.050653133541345596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5326566174044274e-05, + "grad_norm": 31.672792434692383, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8601741790771484, + "num_tokens": 826789591.0, + "step": 21667 + }, + { + "epoch": 2.756392316499173, + "ewc_loss": 0.05066991597414017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.533495717216283e-05, + "grad_norm": 31.768335342407227, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8685246706008911, + "num_tokens": 826831890.0, + "step": 21668 + }, + { + "epoch": 2.7565195267777636, + "ewc_loss": 0.050686415284872055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5343208108097315e-05, + "grad_norm": 31.75796127319336, + "learning_rate": 1e-06, + "loss": 0.4735, + "mean_token_accuracy": 0.854314923286438, + "num_tokens": 826870265.0, + "step": 21669 + }, + { + "epoch": 2.756646737056354, + "ewc_loss": 0.050638582557439804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5319292035419494e-05, + "grad_norm": 31.706418991088867, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8720927238464355, + "num_tokens": 826910993.0, + "step": 21670 + }, + { + "epoch": 2.7567739473349446, + "ewc_loss": 0.050619371235370636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5309685952379368e-05, + "grad_norm": 31.70318603515625, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8709390163421631, + "num_tokens": 826947492.0, + "step": 21671 + }, + { + "epoch": 2.756901157613535, + "ewc_loss": 0.050650712102651596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5325356546090916e-05, + "grad_norm": 31.71879005432129, + "learning_rate": 1e-06, + "loss": 0.4076, + "mean_token_accuracy": 0.8758765459060669, + "num_tokens": 826987215.0, + "step": 21672 + }, + { + "epoch": 2.7570283678921257, + "ewc_loss": 0.05067889392375946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5339446438010782e-05, + "grad_norm": 31.70527458190918, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8698657751083374, + "num_tokens": 827024092.0, + "step": 21673 + }, + { + "epoch": 2.757155578170716, + "ewc_loss": 0.05058106407523155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.529053199396003e-05, + "grad_norm": 31.577661514282227, + "learning_rate": 1e-06, + "loss": 0.481, + "mean_token_accuracy": 0.8554210662841797, + "num_tokens": 827060541.0, + "step": 21674 + }, + { + "epoch": 2.7572827884493067, + "ewc_loss": 0.05072235316038132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536117608542554e-05, + "grad_norm": 31.748779296875, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8713858127593994, + "num_tokens": 827101205.0, + "step": 21675 + }, + { + "epoch": 2.7574099987278973, + "ewc_loss": 0.050741299986839294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5370649382239208e-05, + "grad_norm": 31.72464370727539, + "learning_rate": 1e-06, + "loss": 0.4393, + 
"mean_token_accuracy": 0.8663147687911987, + "num_tokens": 827132441.0, + "step": 21676 + }, + { + "epoch": 2.757537209006488, + "ewc_loss": 0.050759732723236084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5379866201546974e-05, + "grad_norm": 31.718902587890625, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8723154067993164, + "num_tokens": 827169530.0, + "step": 21677 + }, + { + "epoch": 2.7576644192850783, + "ewc_loss": 0.05075860396027565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5379302314831875e-05, + "grad_norm": 31.688369750976562, + "learning_rate": 1e-06, + "loss": 0.3925, + "mean_token_accuracy": 0.8789267539978027, + "num_tokens": 827207352.0, + "step": 21678 + }, + { + "epoch": 2.757791629563669, + "ewc_loss": 0.05085732787847519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54286642302759e-05, + "grad_norm": 31.712486267089844, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8659110069274902, + "num_tokens": 827242790.0, + "step": 21679 + }, + { + "epoch": 2.7579188398422594, + "ewc_loss": 0.050736699253320694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5368350179633126e-05, + "grad_norm": 31.726133346557617, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.8739327788352966, + "num_tokens": 827281676.0, + "step": 21680 + }, + { + "epoch": 2.75804605012085, + "ewc_loss": 0.050804026424884796, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5402012397535145e-05, + "grad_norm": 31.83175277709961, + "learning_rate": 1e-06, + "loss": 0.3701, + "mean_token_accuracy": 0.8848787546157837, + "num_tokens": 827313676.0, + "step": 21681 + }, + { + "epoch": 2.7581732603994404, + "ewc_loss": 0.050741251558065414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537062573537696e-05, + "grad_norm": 31.686248779296875, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.871901273727417, + "num_tokens": 827355017.0, + "step": 21682 + }, + { + "epoch": 
2.758300470678031, + "ewc_loss": 0.050679173320531845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5339586500194855e-05, + "grad_norm": 31.699901580810547, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8898099660873413, + "num_tokens": 827391773.0, + "step": 21683 + }, + { + "epoch": 2.758427680956621, + "ewc_loss": 0.05076948180794716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5384741093148477e-05, + "grad_norm": 31.73115348815918, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8871961832046509, + "num_tokens": 827433034.0, + "step": 21684 + }, + { + "epoch": 2.758554891235212, + "ewc_loss": 0.050706349313259125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5353174351039343e-05, + "grad_norm": 31.568599700927734, + "learning_rate": 1e-06, + "loss": 0.3837, + "mean_token_accuracy": 0.8832185864448547, + "num_tokens": 827475729.0, + "step": 21685 + }, + { + "epoch": 2.758682101513802, + "ewc_loss": 0.05075220391154289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5376102712471038e-05, + "grad_norm": 31.870370864868164, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.877825140953064, + "num_tokens": 827514127.0, + "step": 21686 + }, + { + "epoch": 2.758809311792393, + "ewc_loss": 0.05078675597906113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5393377654836513e-05, + "grad_norm": 31.598180770874023, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8831286430358887, + "num_tokens": 827546709.0, + "step": 21687 + }, + { + "epoch": 2.758936522070983, + "ewc_loss": 0.05062934011220932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5314669983345084e-05, + "grad_norm": 31.755918502807617, + "learning_rate": 1e-06, + "loss": 0.352, + "mean_token_accuracy": 0.8923236727714539, + "num_tokens": 827581450.0, + "step": 21688 + }, + { + "epoch": 2.7590637323495737, + "ewc_loss": 0.05087418481707573, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5437091608182527e-05, + "grad_norm": 31.685848236083984, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.86851966381073, + "num_tokens": 827623161.0, + "step": 21689 + }, + { + "epoch": 2.7591909426281642, + "ewc_loss": 0.050622325390577316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5311162971775047e-05, + "grad_norm": 31.64745330810547, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8682367205619812, + "num_tokens": 827659460.0, + "step": 21690 + }, + { + "epoch": 2.7593181529067548, + "ewc_loss": 0.05069112405180931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5345561880385503e-05, + "grad_norm": 31.613595962524414, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8669413328170776, + "num_tokens": 827701110.0, + "step": 21691 + }, + { + "epoch": 2.7594453631853453, + "ewc_loss": 0.05065665394067764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5328326955786906e-05, + "grad_norm": 31.59218978881836, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8695748448371887, + "num_tokens": 827738992.0, + "step": 21692 + }, + { + "epoch": 2.759572573463936, + "ewc_loss": 0.050794947892427444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53974740189733e-05, + "grad_norm": 31.760465621948242, + "learning_rate": 1e-06, + "loss": 0.4597, + "mean_token_accuracy": 0.8582459092140198, + "num_tokens": 827775853.0, + "step": 21693 + }, + { + "epoch": 2.7596997837425263, + "ewc_loss": 0.05077372491359711, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5386862034793012e-05, + "grad_norm": 31.549646377563477, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8735588788986206, + "num_tokens": 827819438.0, + "step": 21694 + }, + { + "epoch": 2.759826994021117, + "ewc_loss": 0.0507030114531517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.535150633775629e-05, + "grad_norm": 31.698135375976562, + "learning_rate": 1e-06, + "loss": 0.43, + 
"mean_token_accuracy": 0.8662968873977661, + "num_tokens": 827858833.0, + "step": 21695 + }, + { + "epoch": 2.7599542042997074, + "ewc_loss": 0.05080650746822357, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5403252948308364e-05, + "grad_norm": 31.70731544494629, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8654128909111023, + "num_tokens": 827895069.0, + "step": 21696 + }, + { + "epoch": 2.760081414578298, + "ewc_loss": 0.050734180957078934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.536708961997647e-05, + "grad_norm": 31.65666389465332, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8861230611801147, + "num_tokens": 827932970.0, + "step": 21697 + }, + { + "epoch": 2.7602086248568884, + "ewc_loss": 0.05076116323471069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5380581064382568e-05, + "grad_norm": 31.712448120117188, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8833840489387512, + "num_tokens": 827981859.0, + "step": 21698 + }, + { + "epoch": 2.760335835135479, + "ewc_loss": 0.05075671151280403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537835644034203e-05, + "grad_norm": 31.714000701904297, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8658959865570068, + "num_tokens": 828022018.0, + "step": 21699 + }, + { + "epoch": 2.7604630454140695, + "ewc_loss": 0.050755273550748825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5377636120538227e-05, + "grad_norm": 31.707124710083008, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8674482703208923, + "num_tokens": 828061546.0, + "step": 21700 + }, + { + "epoch": 2.76059025569266, + "ewc_loss": 0.05077853798866272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5389268557773903e-05, + "grad_norm": 31.800214767456055, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8590390682220459, + "num_tokens": 828104464.0, + "step": 21701 + }, + { + "epoch": 
2.7607174659712506, + "ewc_loss": 0.050727542489767075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.53637717833044e-05, + "grad_norm": 31.701160430908203, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8616951107978821, + "num_tokens": 828142527.0, + "step": 21702 + }, + { + "epoch": 2.760844676249841, + "ewc_loss": 0.050721973180770874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5360986910527572e-05, + "grad_norm": 31.697052001953125, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8724633455276489, + "num_tokens": 828189670.0, + "step": 21703 + }, + { + "epoch": 2.7609718865284316, + "ewc_loss": 0.05072130262851715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5360650397487916e-05, + "grad_norm": 31.6865291595459, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8793428540229797, + "num_tokens": 828231511.0, + "step": 21704 + }, + { + "epoch": 2.761099096807022, + "ewc_loss": 0.05078946053981781, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539473098295275e-05, + "grad_norm": 31.656681060791016, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8729428052902222, + "num_tokens": 828274387.0, + "step": 21705 + }, + { + "epoch": 2.7612263070856127, + "ewc_loss": 0.050741519778966904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5370760340592824e-05, + "grad_norm": 31.70870018005371, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8656640648841858, + "num_tokens": 828312263.0, + "step": 21706 + }, + { + "epoch": 2.7613535173642028, + "ewc_loss": 0.05079098045825958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539548950153403e-05, + "grad_norm": 31.660486221313477, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8731868267059326, + "num_tokens": 828345550.0, + "step": 21707 + }, + { + "epoch": 2.7614807276427937, + "ewc_loss": 0.05064765736460686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5323828594991937e-05, + "grad_norm": 31.716686248779297, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8614038228988647, + "num_tokens": 828378633.0, + "step": 21708 + }, + { + "epoch": 2.761607937921384, + "ewc_loss": 0.05083218216896057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541609137551859e-05, + "grad_norm": 31.783920288085938, + "learning_rate": 1e-06, + "loss": 0.4397, + "mean_token_accuracy": 0.8663709163665771, + "num_tokens": 828414292.0, + "step": 21709 + }, + { + "epoch": 2.761735148199975, + "ewc_loss": 0.050711892545223236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5355946490890346e-05, + "grad_norm": 31.732513427734375, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8708871603012085, + "num_tokens": 828452682.0, + "step": 21710 + }, + { + "epoch": 2.761862358478565, + "ewc_loss": 0.050728101283311844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5364050088683143e-05, + "grad_norm": 31.599685668945312, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.8873095512390137, + "num_tokens": 828494162.0, + "step": 21711 + }, + { + "epoch": 2.761989568757156, + "ewc_loss": 0.050699833780527115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5349916541017592e-05, + "grad_norm": 31.553421020507812, + "learning_rate": 1e-06, + "loss": 0.405, + "mean_token_accuracy": 0.8764190673828125, + "num_tokens": 828525997.0, + "step": 21712 + }, + { + "epoch": 2.762116779035746, + "ewc_loss": 0.05082545429468155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541272624512203e-05, + "grad_norm": 31.762723922729492, + "learning_rate": 1e-06, + "loss": 0.4235, + "mean_token_accuracy": 0.8703652024269104, + "num_tokens": 828562488.0, + "step": 21713 + }, + { + "epoch": 2.7622439893143365, + "ewc_loss": 0.05087045952677727, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5435228963033296e-05, + "grad_norm": 31.752466201782227, + "learning_rate": 1e-06, + "loss": 0.3623, + 
"mean_token_accuracy": 0.8901841640472412, + "num_tokens": 828591818.0, + "step": 21714 + }, + { + "epoch": 2.762371199592927, + "ewc_loss": 0.0507986880838871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5399343940080144e-05, + "grad_norm": 31.618192672729492, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8612382411956787, + "num_tokens": 828628872.0, + "step": 21715 + }, + { + "epoch": 2.7624984098715175, + "ewc_loss": 0.050791334360837936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5395667762495577e-05, + "grad_norm": 31.72974395751953, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8817927837371826, + "num_tokens": 828670939.0, + "step": 21716 + }, + { + "epoch": 2.762625620150108, + "ewc_loss": 0.05080230161547661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5401150196557865e-05, + "grad_norm": 31.819503784179688, + "learning_rate": 1e-06, + "loss": 0.4709, + "mean_token_accuracy": 0.8555076718330383, + "num_tokens": 828708446.0, + "step": 21717 + }, + { + "epoch": 2.7627528304286986, + "ewc_loss": 0.05076518654823303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538259286666289e-05, + "grad_norm": 31.65415382385254, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8765245079994202, + "num_tokens": 828749873.0, + "step": 21718 + }, + { + "epoch": 2.762880040707289, + "ewc_loss": 0.050757504999637604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5378752980032004e-05, + "grad_norm": 31.671899795532227, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.8650677800178528, + "num_tokens": 828790066.0, + "step": 21719 + }, + { + "epoch": 2.7630072509858796, + "ewc_loss": 0.050754815340042114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.537740692787338e-05, + "grad_norm": 31.757793426513672, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8689622282981873, + "num_tokens": 828821911.0, + "step": 21720 + }, + { + "epoch": 
2.76313446126447, + "ewc_loss": 0.050797976553440094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539898741815705e-05, + "grad_norm": 31.783615112304688, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8673344850540161, + "num_tokens": 828856572.0, + "step": 21721 + }, + { + "epoch": 2.7632616715430607, + "ewc_loss": 0.0507659986615181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538299850129988e-05, + "grad_norm": 31.685813903808594, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.884528398513794, + "num_tokens": 828898865.0, + "step": 21722 + }, + { + "epoch": 2.763388881821651, + "ewc_loss": 0.05077735707163811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5388679205207154e-05, + "grad_norm": 31.82415008544922, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8801902532577515, + "num_tokens": 828933768.0, + "step": 21723 + }, + { + "epoch": 2.7635160921002417, + "ewc_loss": 0.05078554525971413, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5392771931365132e-05, + "grad_norm": 31.860904693603516, + "learning_rate": 1e-06, + "loss": 0.3724, + "mean_token_accuracy": 0.8886085748672485, + "num_tokens": 828968869.0, + "step": 21724 + }, + { + "epoch": 2.7636433023788323, + "ewc_loss": 0.05073083937168121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5365419787704013e-05, + "grad_norm": 31.82744789123535, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8561699986457825, + "num_tokens": 829005621.0, + "step": 21725 + }, + { + "epoch": 2.763770512657423, + "ewc_loss": 0.05083202198147774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5416011339984834e-05, + "grad_norm": 31.845447540283203, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.8611371517181396, + "num_tokens": 829045046.0, + "step": 21726 + }, + { + "epoch": 2.7638977229360133, + "ewc_loss": 0.05077935382723808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5389676011400297e-05, + "grad_norm": 31.759960174560547, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8849048614501953, + "num_tokens": 829084588.0, + "step": 21727 + }, + { + "epoch": 2.764024933214604, + "ewc_loss": 0.050763070583343506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538153603381943e-05, + "grad_norm": 31.86543846130371, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8564682006835938, + "num_tokens": 829121010.0, + "step": 21728 + }, + { + "epoch": 2.7641521434931944, + "ewc_loss": 0.05078444257378578, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5392220777575858e-05, + "grad_norm": 31.735633850097656, + "learning_rate": 1e-06, + "loss": 0.385, + "mean_token_accuracy": 0.8815816640853882, + "num_tokens": 829159618.0, + "step": 21729 + }, + { + "epoch": 2.764279353771785, + "ewc_loss": 0.05070478096604347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5352390366606414e-05, + "grad_norm": 31.760574340820312, + "learning_rate": 1e-06, + "loss": 0.4615, + "mean_token_accuracy": 0.8585787415504456, + "num_tokens": 829204826.0, + "step": 21730 + }, + { + "epoch": 2.7644065640503754, + "ewc_loss": 0.05080684274435043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5403422114322893e-05, + "grad_norm": 31.802488327026367, + "learning_rate": 1e-06, + "loss": 0.3717, + "mean_token_accuracy": 0.8833770751953125, + "num_tokens": 829240741.0, + "step": 21731 + }, + { + "epoch": 2.7645337743289655, + "ewc_loss": 0.050626348704099655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.531317477405537e-05, + "grad_norm": 31.637556076049805, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8912447690963745, + "num_tokens": 829278065.0, + "step": 21732 + }, + { + "epoch": 2.7646609846075565, + "ewc_loss": 0.050751712173223495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5375855329912156e-05, + "grad_norm": 31.740413665771484, + "learning_rate": 1e-06, + "loss": 0.4295, + 
"mean_token_accuracy": 0.8675072193145752, + "num_tokens": 829319438.0, + "step": 21733 + }, + { + "epoch": 2.7647881948861466, + "ewc_loss": 0.050789542496204376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539477100071963e-05, + "grad_norm": 31.755657196044922, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8826234340667725, + "num_tokens": 829357299.0, + "step": 21734 + }, + { + "epoch": 2.7649154051647375, + "ewc_loss": 0.0506700836122036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5335042664664797e-05, + "grad_norm": 31.630496978759766, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.872825026512146, + "num_tokens": 829398600.0, + "step": 21735 + }, + { + "epoch": 2.7650426154433276, + "ewc_loss": 0.05076229199767113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538114677008707e-05, + "grad_norm": 31.743711471557617, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8604780435562134, + "num_tokens": 829439214.0, + "step": 21736 + }, + { + "epoch": 2.7651698257219186, + "ewc_loss": 0.05076310411095619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538155240472406e-05, + "grad_norm": 31.63465690612793, + "learning_rate": 1e-06, + "loss": 0.4572, + "mean_token_accuracy": 0.8615727424621582, + "num_tokens": 829477634.0, + "step": 21737 + }, + { + "epoch": 2.7652970360005087, + "ewc_loss": 0.050848882645368576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5424440536880866e-05, + "grad_norm": 31.764219284057617, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.879077672958374, + "num_tokens": 829514527.0, + "step": 21738 + }, + { + "epoch": 2.765424246279099, + "ewc_loss": 0.05080448463559151, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5402241590199992e-05, + "grad_norm": 31.694772720336914, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8732094764709473, + "num_tokens": 829558531.0, + "step": 21739 + }, + { + "epoch": 
2.7655514565576897, + "ewc_loss": 0.05082227289676666, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541113644838333e-05, + "grad_norm": 31.713104248046875, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.875124454498291, + "num_tokens": 829600769.0, + "step": 21740 + }, + { + "epoch": 2.7656786668362803, + "ewc_loss": 0.05075569078326225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5377845304319635e-05, + "grad_norm": 31.730791091918945, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8766087889671326, + "num_tokens": 829635938.0, + "step": 21741 + }, + { + "epoch": 2.765805877114871, + "ewc_loss": 0.05082319304347038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541159665270243e-05, + "grad_norm": 31.7991943359375, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8724173903465271, + "num_tokens": 829672956.0, + "step": 21742 + }, + { + "epoch": 2.7659330873934613, + "ewc_loss": 0.05074068903923035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5370343792019412e-05, + "grad_norm": 31.64368438720703, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8709362149238586, + "num_tokens": 829712658.0, + "step": 21743 + }, + { + "epoch": 2.766060297672052, + "ewc_loss": 0.050806257873773575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5403129257028922e-05, + "grad_norm": 31.644195556640625, + "learning_rate": 1e-06, + "loss": 0.4063, + "mean_token_accuracy": 0.8751023411750793, + "num_tokens": 829755429.0, + "step": 21744 + }, + { + "epoch": 2.7661875079506424, + "ewc_loss": 0.050857432186603546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54287151619792e-05, + "grad_norm": 31.675676345825195, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.871330976486206, + "num_tokens": 829796373.0, + "step": 21745 + }, + { + "epoch": 2.766314718229233, + "ewc_loss": 0.050783392041921616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5391696908627637e-05, + "grad_norm": 31.620121002197266, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8677528500556946, + "num_tokens": 829834049.0, + "step": 21746 + }, + { + "epoch": 2.7664419285078234, + "ewc_loss": 0.050818972289562225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5409486624994315e-05, + "grad_norm": 31.7283935546875, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8726130723953247, + "num_tokens": 829870853.0, + "step": 21747 + }, + { + "epoch": 2.766569138786414, + "ewc_loss": 0.05079461634159088, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5397308490937576e-05, + "grad_norm": 31.584980010986328, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8656810522079468, + "num_tokens": 829907519.0, + "step": 21748 + }, + { + "epoch": 2.7666963490650045, + "ewc_loss": 0.0508187860250473, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5409393856534734e-05, + "grad_norm": 31.72101402282715, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8595890402793884, + "num_tokens": 829947162.0, + "step": 21749 + }, + { + "epoch": 2.766823559343595, + "ewc_loss": 0.050817664712667465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540883178880904e-05, + "grad_norm": 31.701581954956055, + "learning_rate": 1e-06, + "loss": 0.4347, + "mean_token_accuracy": 0.8692982196807861, + "num_tokens": 829991981.0, + "step": 21750 + }, + { + "epoch": 2.7669507696221856, + "ewc_loss": 0.0507957898080349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5397894205525517e-05, + "grad_norm": 31.822324752807617, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.877415657043457, + "num_tokens": 830032409.0, + "step": 21751 + }, + { + "epoch": 2.767077979900776, + "ewc_loss": 0.05078914761543274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5394574549864046e-05, + "grad_norm": 31.807357788085938, + "learning_rate": 1e-06, + "loss": 0.4096, + 
"mean_token_accuracy": 0.8753447532653809, + "num_tokens": 830072738.0, + "step": 21752 + }, + { + "epoch": 2.7672051901793666, + "ewc_loss": 0.05074967071413994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5374834876856767e-05, + "grad_norm": 31.677034378051758, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8683109283447266, + "num_tokens": 830110880.0, + "step": 21753 + }, + { + "epoch": 2.767332400457957, + "ewc_loss": 0.0507381446659565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5369072318426333e-05, + "grad_norm": 31.88225746154785, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8679062724113464, + "num_tokens": 830150148.0, + "step": 21754 + }, + { + "epoch": 2.7674596107365477, + "ewc_loss": 0.050765059888362885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5382529202033766e-05, + "grad_norm": 31.8001766204834, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8599616289138794, + "num_tokens": 830190717.0, + "step": 21755 + }, + { + "epoch": 2.767586821015138, + "ewc_loss": 0.050649575889110565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5324787202407606e-05, + "grad_norm": 31.686264038085938, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8727043867111206, + "num_tokens": 830226972.0, + "step": 21756 + }, + { + "epoch": 2.7677140312937283, + "ewc_loss": 0.050714436918497086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5357217964483425e-05, + "grad_norm": 31.692241668701172, + "learning_rate": 1e-06, + "loss": 0.3853, + "mean_token_accuracy": 0.8801463842391968, + "num_tokens": 830262612.0, + "step": 21757 + }, + { + "epoch": 2.7678412415723193, + "ewc_loss": 0.050720807164907455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5360403014929034e-05, + "grad_norm": 31.94217300415039, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8784393072128296, + "num_tokens": 830306856.0, + "step": 21758 + }, + { + "epoch": 
2.7679684518509093, + "ewc_loss": 0.050719164311885834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5359582650708035e-05, + "grad_norm": 31.615428924560547, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8711429834365845, + "num_tokens": 830340726.0, + "step": 21759 + }, + { + "epoch": 2.7680956621295003, + "ewc_loss": 0.05063967406749725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5319837732240558e-05, + "grad_norm": 31.946504592895508, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8723364472389221, + "num_tokens": 830380697.0, + "step": 21760 + }, + { + "epoch": 2.7682228724080904, + "ewc_loss": 0.05088471621274948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5442357582505792e-05, + "grad_norm": 31.742328643798828, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8492398858070374, + "num_tokens": 830414737.0, + "step": 21761 + }, + { + "epoch": 2.768350082686681, + "ewc_loss": 0.05061403661966324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5307017494924366e-05, + "grad_norm": 31.645292282104492, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8730741143226624, + "num_tokens": 830456518.0, + "step": 21762 + }, + { + "epoch": 2.7684772929652715, + "ewc_loss": 0.0508185550570488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5409277441212907e-05, + "grad_norm": 31.786985397338867, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8696186542510986, + "num_tokens": 830494192.0, + "step": 21763 + }, + { + "epoch": 2.768604503243862, + "ewc_loss": 0.050735995173454285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5367997295688838e-05, + "grad_norm": 31.554210662841797, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8650710582733154, + "num_tokens": 830533659.0, + "step": 21764 + }, + { + "epoch": 2.7687317135224525, + "ewc_loss": 0.050763241946697235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5381621526321396e-05, + "grad_norm": 31.823883056640625, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8661673069000244, + "num_tokens": 830569605.0, + "step": 21765 + }, + { + "epoch": 2.768858923801043, + "ewc_loss": 0.05084623768925667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5423118131584488e-05, + "grad_norm": 31.60813331604004, + "learning_rate": 1e-06, + "loss": 0.4842, + "mean_token_accuracy": 0.8504159450531006, + "num_tokens": 830610814.0, + "step": 21766 + }, + { + "epoch": 2.7689861340796336, + "ewc_loss": 0.05075791850686073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5378958525834605e-05, + "grad_norm": 31.762409210205078, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.871387243270874, + "num_tokens": 830646811.0, + "step": 21767 + }, + { + "epoch": 2.769113344358224, + "ewc_loss": 0.05086396634578705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.543198388593737e-05, + "grad_norm": 31.667997360229492, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8757592439651489, + "num_tokens": 830680984.0, + "step": 21768 + }, + { + "epoch": 2.7692405546368146, + "ewc_loss": 0.05078572407364845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539286288083531e-05, + "grad_norm": 31.72076416015625, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8714047074317932, + "num_tokens": 830713384.0, + "step": 21769 + }, + { + "epoch": 2.769367764915405, + "ewc_loss": 0.05075271800160408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5376359189976938e-05, + "grad_norm": 31.823280334472656, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8594210743904114, + "num_tokens": 830746922.0, + "step": 21770 + }, + { + "epoch": 2.7694949751939957, + "ewc_loss": 0.050778478384017944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5389239453943446e-05, + "grad_norm": 31.593955993652344, + "learning_rate": 1e-06, + "loss": 0.3945, + 
"mean_token_accuracy": 0.8785132169723511, + "num_tokens": 830789459.0, + "step": 21771 + }, + { + "epoch": 2.769622185472586, + "ewc_loss": 0.050781261175870895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.539063098083716e-05, + "grad_norm": 31.737499237060547, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8691577315330505, + "num_tokens": 830826736.0, + "step": 21772 + }, + { + "epoch": 2.7697493957511767, + "ewc_loss": 0.050881318747997284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.544065864640288e-05, + "grad_norm": 31.647722244262695, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8740023970603943, + "num_tokens": 830868118.0, + "step": 21773 + }, + { + "epoch": 2.7698766060297673, + "ewc_loss": 0.05080554634332657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5402772735105827e-05, + "grad_norm": 31.772171020507812, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8690778017044067, + "num_tokens": 830911658.0, + "step": 21774 + }, + { + "epoch": 2.770003816308358, + "ewc_loss": 0.05089869350194931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5449346139794216e-05, + "grad_norm": 31.633590698242188, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8730159997940063, + "num_tokens": 830948456.0, + "step": 21775 + }, + { + "epoch": 2.7701310265869483, + "ewc_loss": 0.050843048840761185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.542152469686698e-05, + "grad_norm": 31.82761573791504, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8812929391860962, + "num_tokens": 830988790.0, + "step": 21776 + }, + { + "epoch": 2.770258236865539, + "ewc_loss": 0.050976574420928955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5488287064945325e-05, + "grad_norm": 31.694400787353516, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8649181127548218, + "num_tokens": 831024551.0, + "step": 21777 + }, + { + "epoch": 
2.7703854471441294, + "ewc_loss": 0.05080629140138626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5403145627933554e-05, + "grad_norm": 31.837343215942383, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8695324659347534, + "num_tokens": 831060082.0, + "step": 21778 + }, + { + "epoch": 2.77051265742272, + "ewc_loss": 0.05088192597031593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5440962417633273e-05, + "grad_norm": 31.598276138305664, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.872840166091919, + "num_tokens": 831100067.0, + "step": 21779 + }, + { + "epoch": 2.77063986770131, + "ewc_loss": 0.050819527357816696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5409763111383654e-05, + "grad_norm": 31.79363441467285, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8825567364692688, + "num_tokens": 831131923.0, + "step": 21780 + }, + { + "epoch": 2.770767077979901, + "ewc_loss": 0.050883617252111435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.544180824770592e-05, + "grad_norm": 31.679424285888672, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8573915958404541, + "num_tokens": 831167985.0, + "step": 21781 + }, + { + "epoch": 2.770894288258491, + "ewc_loss": 0.05081940442323685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5409703084733337e-05, + "grad_norm": 31.731842041015625, + "learning_rate": 1e-06, + "loss": 0.4367, + "mean_token_accuracy": 0.8669561743736267, + "num_tokens": 831211737.0, + "step": 21782 + }, + { + "epoch": 2.771021498537082, + "ewc_loss": 0.05093642696738243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5468212697887793e-05, + "grad_norm": 31.79678726196289, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8610938787460327, + "num_tokens": 831248764.0, + "step": 21783 + }, + { + "epoch": 2.771148708815672, + "ewc_loss": 0.05086345225572586, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5431725589442067e-05, + "grad_norm": 31.72552490234375, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8778485059738159, + "num_tokens": 831284140.0, + "step": 21784 + }, + { + "epoch": 2.771275919094263, + "ewc_loss": 0.050771333277225494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5385666958754882e-05, + "grad_norm": 31.807157516479492, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8754218816757202, + "num_tokens": 831318567.0, + "step": 21785 + }, + { + "epoch": 2.771403129372853, + "ewc_loss": 0.05088651552796364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5443257982260548e-05, + "grad_norm": 31.743223190307617, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8651987314224243, + "num_tokens": 831357128.0, + "step": 21786 + }, + { + "epoch": 2.7715303396514437, + "ewc_loss": 0.05090902000665665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5454510250710882e-05, + "grad_norm": 31.70759391784668, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8524607419967651, + "num_tokens": 831391551.0, + "step": 21787 + }, + { + "epoch": 2.771657549930034, + "ewc_loss": 0.05088808014988899, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5444040147704072e-05, + "grad_norm": 31.6730899810791, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8676062822341919, + "num_tokens": 831430332.0, + "step": 21788 + }, + { + "epoch": 2.7717847602086247, + "ewc_loss": 0.05091431736946106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5457158699282445e-05, + "grad_norm": 31.640945434570312, + "learning_rate": 1e-06, + "loss": 0.398, + "mean_token_accuracy": 0.8766286373138428, + "num_tokens": 831471639.0, + "step": 21789 + }, + { + "epoch": 2.7719119704872153, + "ewc_loss": 0.050834473222494125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5417237338842824e-05, + "grad_norm": 31.556119918823242, + "learning_rate": 1e-06, + "loss": 0.3959, + 
"mean_token_accuracy": 0.8770884275436401, + "num_tokens": 831506470.0, + "step": 21790 + }, + { + "epoch": 2.772039180765806, + "ewc_loss": 0.050874974578619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54374881478725e-05, + "grad_norm": 31.61530303955078, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8647910356521606, + "num_tokens": 831547212.0, + "step": 21791 + }, + { + "epoch": 2.7721663910443963, + "ewc_loss": 0.05099361017346382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549680539232213e-05, + "grad_norm": 31.58571434020996, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8573951721191406, + "num_tokens": 831593181.0, + "step": 21792 + }, + { + "epoch": 2.772293601322987, + "ewc_loss": 0.05096793547272682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5483966965111904e-05, + "grad_norm": 31.605684280395508, + "learning_rate": 1e-06, + "loss": 0.3606, + "mean_token_accuracy": 0.8882092237472534, + "num_tokens": 831624889.0, + "step": 21793 + }, + { + "epoch": 2.7724208116015774, + "ewc_loss": 0.0509808212518692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5490409825579263e-05, + "grad_norm": 31.621232986450195, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.8779515027999878, + "num_tokens": 831662389.0, + "step": 21794 + }, + { + "epoch": 2.772548021880168, + "ewc_loss": 0.05091095715761185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.545547795307357e-05, + "grad_norm": 31.601465225219727, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8735665082931519, + "num_tokens": 831705282.0, + "step": 21795 + }, + { + "epoch": 2.7726752321587584, + "ewc_loss": 0.05096181482076645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548090742493514e-05, + "grad_norm": 31.66518783569336, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.879342794418335, + "num_tokens": 831741604.0, + "step": 21796 + }, + { + "epoch": 
2.772802442437349, + "ewc_loss": 0.05096650868654251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5483253921265714e-05, + "grad_norm": 31.642553329467773, + "learning_rate": 1e-06, + "loss": 0.4495, + "mean_token_accuracy": 0.8628822565078735, + "num_tokens": 831784165.0, + "step": 21797 + }, + { + "epoch": 2.7729296527159395, + "ewc_loss": 0.05103475973010063, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5517379981465638e-05, + "grad_norm": 31.698440551757812, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8754792809486389, + "num_tokens": 831821101.0, + "step": 21798 + }, + { + "epoch": 2.77305686299453, + "ewc_loss": 0.05092036351561546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5460181859671138e-05, + "grad_norm": 31.543899536132812, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8744982481002808, + "num_tokens": 831855766.0, + "step": 21799 + }, + { + "epoch": 2.7731840732731206, + "ewc_loss": 0.050954669713974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5477334929746576e-05, + "grad_norm": 31.734935760498047, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8781847953796387, + "num_tokens": 831895692.0, + "step": 21800 + }, + { + "epoch": 2.773311283551711, + "ewc_loss": 0.0510319322347641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5515966626699083e-05, + "grad_norm": 31.71247673034668, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8907163143157959, + "num_tokens": 831931079.0, + "step": 21801 + }, + { + "epoch": 2.7734384938303016, + "ewc_loss": 0.05096527934074402, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5482639102847315e-05, + "grad_norm": 31.72881507873535, + "learning_rate": 1e-06, + "loss": 0.4938, + "mean_token_accuracy": 0.8464582562446594, + "num_tokens": 831969742.0, + "step": 21802 + }, + { + "epoch": 2.773565704108892, + "ewc_loss": 0.0510200560092926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5510027626296505e-05, + "grad_norm": 31.663497924804688, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8608226776123047, + "num_tokens": 832011059.0, + "step": 21803 + }, + { + "epoch": 2.7736929143874827, + "ewc_loss": 0.050999730825424194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5499864932498895e-05, + "grad_norm": 31.693174362182617, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8816300630569458, + "num_tokens": 832042768.0, + "step": 21804 + }, + { + "epoch": 2.7738201246660728, + "ewc_loss": 0.05091722682118416, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5458613890805282e-05, + "grad_norm": 31.68490219116211, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8802322745323181, + "num_tokens": 832077699.0, + "step": 21805 + }, + { + "epoch": 2.7739473349446637, + "ewc_loss": 0.05094463750720024, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5472318156971596e-05, + "grad_norm": 31.522838592529297, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8791347742080688, + "num_tokens": 832113528.0, + "step": 21806 + }, + { + "epoch": 2.774074545223254, + "ewc_loss": 0.051044661551713943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552233127062209e-05, + "grad_norm": 31.8702449798584, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.87599778175354, + "num_tokens": 832149108.0, + "step": 21807 + }, + { + "epoch": 2.774201755501845, + "ewc_loss": 0.05108169838786125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5540848582750186e-05, + "grad_norm": 31.72361183166504, + "learning_rate": 1e-06, + "loss": 0.3715, + "mean_token_accuracy": 0.8879585862159729, + "num_tokens": 832189279.0, + "step": 21808 + }, + { + "epoch": 2.774328965780435, + "ewc_loss": 0.0509430468082428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5471523258602247e-05, + "grad_norm": 31.74207878112793, + "learning_rate": 1e-06, + "loss": 0.4143, + 
"mean_token_accuracy": 0.8733857870101929, + "num_tokens": 832224423.0, + "step": 21809 + }, + { + "epoch": 2.774456176059026, + "ewc_loss": 0.05103123560547829, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5515617380733602e-05, + "grad_norm": 31.726703643798828, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8657622337341309, + "num_tokens": 832260389.0, + "step": 21810 + }, + { + "epoch": 2.774583386337616, + "ewc_loss": 0.05092250928282738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.546125506341923e-05, + "grad_norm": 31.7641544342041, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8706613779067993, + "num_tokens": 832297591.0, + "step": 21811 + }, + { + "epoch": 2.7747105966162064, + "ewc_loss": 0.050940435379743576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54702172242105e-05, + "grad_norm": 31.72675323486328, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.877126157283783, + "num_tokens": 832330195.0, + "step": 21812 + }, + { + "epoch": 2.774837806894797, + "ewc_loss": 0.050885338336229324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54426686296938e-05, + "grad_norm": 31.672273635864258, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8778560161590576, + "num_tokens": 832361613.0, + "step": 21813 + }, + { + "epoch": 2.7749650171733875, + "ewc_loss": 0.050999827682971954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549991404521279e-05, + "grad_norm": 31.770288467407227, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8776412606239319, + "num_tokens": 832402800.0, + "step": 21814 + }, + { + "epoch": 2.775092227451978, + "ewc_loss": 0.05102067440748215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5510336854495108e-05, + "grad_norm": 31.74735450744629, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8741633892059326, + "num_tokens": 832445161.0, + "step": 21815 + }, + { + "epoch": 
2.7752194377305686, + "ewc_loss": 0.0509907528758049, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5495375666650943e-05, + "grad_norm": 31.69243049621582, + "learning_rate": 1e-06, + "loss": 0.4562, + "mean_token_accuracy": 0.8576256036758423, + "num_tokens": 832487905.0, + "step": 21816 + }, + { + "epoch": 2.775346648009159, + "ewc_loss": 0.05097293108701706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5486466256552376e-05, + "grad_norm": 31.735469818115234, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8734580278396606, + "num_tokens": 832521107.0, + "step": 21817 + }, + { + "epoch": 2.7754738582877496, + "ewc_loss": 0.05101793259382248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5508967155474238e-05, + "grad_norm": 31.680089950561523, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8682218790054321, + "num_tokens": 832557213.0, + "step": 21818 + }, + { + "epoch": 2.77560106856634, + "ewc_loss": 0.05100448429584503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550224235164933e-05, + "grad_norm": 31.772090911865234, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8595471382141113, + "num_tokens": 832600445.0, + "step": 21819 + }, + { + "epoch": 2.7757282788449307, + "ewc_loss": 0.051017191261053085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5508596081635915e-05, + "grad_norm": 31.766658782958984, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8718155026435852, + "num_tokens": 832635994.0, + "step": 21820 + }, + { + "epoch": 2.775855489123521, + "ewc_loss": 0.05105184391140938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552592195570469e-05, + "grad_norm": 31.77504539489746, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8833533525466919, + "num_tokens": 832677151.0, + "step": 21821 + }, + { + "epoch": 2.7759826994021117, + "ewc_loss": 0.05101938173174858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.550969111325685e-05, + "grad_norm": 31.65623664855957, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8709677457809448, + "num_tokens": 832713995.0, + "step": 21822 + }, + { + "epoch": 2.7761099096807023, + "ewc_loss": 0.050970032811164856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548501652199775e-05, + "grad_norm": 31.736452102661133, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8628648519515991, + "num_tokens": 832754201.0, + "step": 21823 + }, + { + "epoch": 2.776237119959293, + "ewc_loss": 0.05100024491548538, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5500123228994198e-05, + "grad_norm": 31.768341064453125, + "learning_rate": 1e-06, + "loss": 0.4342, + "mean_token_accuracy": 0.8692986965179443, + "num_tokens": 832796920.0, + "step": 21824 + }, + { + "epoch": 2.7763643302378833, + "ewc_loss": 0.05092089995741844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.546044925111346e-05, + "grad_norm": 31.65496826171875, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8683954477310181, + "num_tokens": 832835952.0, + "step": 21825 + }, + { + "epoch": 2.776491540516474, + "ewc_loss": 0.050993211567401886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549660530348774e-05, + "grad_norm": 31.822675704956055, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8756858110427856, + "num_tokens": 832878306.0, + "step": 21826 + }, + { + "epoch": 2.7766187507950644, + "ewc_loss": 0.05100684240460396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5503421056782827e-05, + "grad_norm": 31.78240966796875, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8646992444992065, + "num_tokens": 832914315.0, + "step": 21827 + }, + { + "epoch": 2.776745961073655, + "ewc_loss": 0.05088600143790245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5443001504754648e-05, + "grad_norm": 31.633745193481445, + "learning_rate": 1e-06, + "loss": 0.4642, + 
"mean_token_accuracy": 0.8689902424812317, + "num_tokens": 832956661.0, + "step": 21828 + }, + { + "epoch": 2.7768731713522454, + "ewc_loss": 0.05098726600408554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5493633074802347e-05, + "grad_norm": 31.904233932495117, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8777855634689331, + "num_tokens": 832999446.0, + "step": 21829 + }, + { + "epoch": 2.7770003816308355, + "ewc_loss": 0.05084697902202606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.542348920542281e-05, + "grad_norm": 31.58595848083496, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8453733325004578, + "num_tokens": 833039001.0, + "step": 21830 + }, + { + "epoch": 2.7771275919094265, + "ewc_loss": 0.05094039440155029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5470197215327062e-05, + "grad_norm": 31.754701614379883, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8710595965385437, + "num_tokens": 833074334.0, + "step": 21831 + }, + { + "epoch": 2.7772548021880166, + "ewc_loss": 0.050987228751182556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549361488490831e-05, + "grad_norm": 31.707157135009766, + "learning_rate": 1e-06, + "loss": 0.4595, + "mean_token_accuracy": 0.8601800203323364, + "num_tokens": 833113089.0, + "step": 21832 + }, + { + "epoch": 2.7773820124666075, + "ewc_loss": 0.05086047202348709, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5430235837120563e-05, + "grad_norm": 31.60696792602539, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8687845468521118, + "num_tokens": 833149755.0, + "step": 21833 + }, + { + "epoch": 2.7775092227451976, + "ewc_loss": 0.05098055303096771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5490277039352804e-05, + "grad_norm": 31.862112045288086, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8633728623390198, + "num_tokens": 833187718.0, + "step": 21834 + }, + { + "epoch": 
2.7776364330237886, + "ewc_loss": 0.05088061839342117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54403094004374e-05, + "grad_norm": 31.683055877685547, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8788213729858398, + "num_tokens": 833221416.0, + "step": 21835 + }, + { + "epoch": 2.7777636433023787, + "ewc_loss": 0.05084840953350067, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5424204068258405e-05, + "grad_norm": 31.692996978759766, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8846296668052673, + "num_tokens": 833261101.0, + "step": 21836 + }, + { + "epoch": 2.777890853580969, + "ewc_loss": 0.05093637481331825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5468187232036144e-05, + "grad_norm": 31.676912307739258, + "learning_rate": 1e-06, + "loss": 0.3953, + "mean_token_accuracy": 0.880077600479126, + "num_tokens": 833296804.0, + "step": 21837 + }, + { + "epoch": 2.7780180638595597, + "ewc_loss": 0.050980210304260254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549010605434887e-05, + "grad_norm": 31.83216667175293, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8723446726799011, + "num_tokens": 833332005.0, + "step": 21838 + }, + { + "epoch": 2.7781452741381503, + "ewc_loss": 0.05093011632561684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5465058570262045e-05, + "grad_norm": 31.747339248657227, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.868323802947998, + "num_tokens": 833371991.0, + "step": 21839 + }, + { + "epoch": 2.778272484416741, + "ewc_loss": 0.050812967121601105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540648347348906e-05, + "grad_norm": 31.638093948364258, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8836739659309387, + "num_tokens": 833407514.0, + "step": 21840 + }, + { + "epoch": 2.7783996946953313, + "ewc_loss": 0.05092562362551689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5462812118348666e-05, + "grad_norm": 31.660602569580078, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.869747519493103, + "num_tokens": 833444519.0, + "step": 21841 + }, + { + "epoch": 2.778526904973922, + "ewc_loss": 0.051074255257844925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.553712693043053e-05, + "grad_norm": 31.73590660095215, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8683019876480103, + "num_tokens": 833482252.0, + "step": 21842 + }, + { + "epoch": 2.7786541152525124, + "ewc_loss": 0.05099070444703102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5495352019788697e-05, + "grad_norm": 31.836740493774414, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.8702800273895264, + "num_tokens": 833515513.0, + "step": 21843 + }, + { + "epoch": 2.778781325531103, + "ewc_loss": 0.05103695020079613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5518475013086572e-05, + "grad_norm": 31.797443389892578, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.879650890827179, + "num_tokens": 833551188.0, + "step": 21844 + }, + { + "epoch": 2.7789085358096934, + "ewc_loss": 0.05094968155026436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5474841095274314e-05, + "grad_norm": 31.71029281616211, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8819400668144226, + "num_tokens": 833589159.0, + "step": 21845 + }, + { + "epoch": 2.779035746088284, + "ewc_loss": 0.050994906574487686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5497452952549793e-05, + "grad_norm": 31.734676361083984, + "learning_rate": 1e-06, + "loss": 0.4334, + "mean_token_accuracy": 0.8614289164543152, + "num_tokens": 833623581.0, + "step": 21846 + }, + { + "epoch": 2.7791629563668745, + "ewc_loss": 0.050979871302843094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5489935069344938e-05, + "grad_norm": 31.725370407104492, + "learning_rate": 1e-06, + "loss": 0.456, + 
"mean_token_accuracy": 0.8592138290405273, + "num_tokens": 833660466.0, + "step": 21847 + }, + { + "epoch": 2.779290166645465, + "ewc_loss": 0.050992030650377274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549601595092099e-05, + "grad_norm": 31.781604766845703, + "learning_rate": 1e-06, + "loss": 0.4598, + "mean_token_accuracy": 0.8612552285194397, + "num_tokens": 833697465.0, + "step": 21848 + }, + { + "epoch": 2.7794173769240555, + "ewc_loss": 0.05102391913533211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.551195939304307e-05, + "grad_norm": 31.788768768310547, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8692305088043213, + "num_tokens": 833735178.0, + "step": 21849 + }, + { + "epoch": 2.779544587202646, + "ewc_loss": 0.05100112780928612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5500563424429856e-05, + "grad_norm": 31.798887252807617, + "learning_rate": 1e-06, + "loss": 0.3659, + "mean_token_accuracy": 0.8888287544250488, + "num_tokens": 833770193.0, + "step": 21850 + }, + { + "epoch": 2.7796717974812366, + "ewc_loss": 0.05088316276669502, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5441580874030478e-05, + "grad_norm": 31.835227966308594, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8727516531944275, + "num_tokens": 833807836.0, + "step": 21851 + }, + { + "epoch": 2.779799007759827, + "ewc_loss": 0.0509415902197361, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5470795662840828e-05, + "grad_norm": 31.74224281311035, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8685773015022278, + "num_tokens": 833847605.0, + "step": 21852 + }, + { + "epoch": 2.7799262180384177, + "ewc_loss": 0.05096767097711563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548383599787485e-05, + "grad_norm": 31.782367706298828, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8753179311752319, + "num_tokens": 833884239.0, + "step": 21853 + }, + { + "epoch": 
2.780053428317008, + "ewc_loss": 0.05100290849804878, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5501454729237594e-05, + "grad_norm": 31.8709774017334, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8727966547012329, + "num_tokens": 833921305.0, + "step": 21854 + }, + { + "epoch": 2.7801806385955983, + "ewc_loss": 0.05096784606575966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548392330936622e-05, + "grad_norm": 31.73192596435547, + "learning_rate": 1e-06, + "loss": 0.4507, + "mean_token_accuracy": 0.8639703989028931, + "num_tokens": 833959297.0, + "step": 21855 + }, + { + "epoch": 2.7803078488741892, + "ewc_loss": 0.050919946283102036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.545997267588973e-05, + "grad_norm": 31.816604614257812, + "learning_rate": 1e-06, + "loss": 0.4944, + "mean_token_accuracy": 0.849929928779602, + "num_tokens": 833992135.0, + "step": 21856 + }, + { + "epoch": 2.7804350591527793, + "ewc_loss": 0.05099775642156601, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5498879040242173e-05, + "grad_norm": 31.71637725830078, + "learning_rate": 1e-06, + "loss": 0.4501, + "mean_token_accuracy": 0.8620516657829285, + "num_tokens": 834035933.0, + "step": 21857 + }, + { + "epoch": 2.7805622694313703, + "ewc_loss": 0.050931546837091446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5465773433097638e-05, + "grad_norm": 31.775365829467773, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8704761862754822, + "num_tokens": 834070187.0, + "step": 21858 + }, + { + "epoch": 2.7806894797099604, + "ewc_loss": 0.05104360729455948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552180376369506e-05, + "grad_norm": 31.6887264251709, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8728295564651489, + "num_tokens": 834107370.0, + "step": 21859 + }, + { + "epoch": 2.780816689988551, + "ewc_loss": 0.05091645196080208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5458226446062326e-05, + "grad_norm": 31.738506317138672, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8636079430580139, + "num_tokens": 834145357.0, + "step": 21860 + }, + { + "epoch": 2.7809439002671414, + "ewc_loss": 0.051057569682598114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552878504502587e-05, + "grad_norm": 31.828914642333984, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8703410625457764, + "num_tokens": 834187250.0, + "step": 21861 + }, + { + "epoch": 2.781071110545732, + "ewc_loss": 0.05087947100400925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.543973459978588e-05, + "grad_norm": 31.52181625366211, + "learning_rate": 1e-06, + "loss": 0.3851, + "mean_token_accuracy": 0.8820975422859192, + "num_tokens": 834227598.0, + "step": 21862 + }, + { + "epoch": 2.7811983208243225, + "ewc_loss": 0.050967443734407425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5483721401542425e-05, + "grad_norm": 31.816970825195312, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8857013583183289, + "num_tokens": 834267032.0, + "step": 21863 + }, + { + "epoch": 2.781325531102913, + "ewc_loss": 0.05113152787089348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5565763280610554e-05, + "grad_norm": 31.66851043701172, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8614724278450012, + "num_tokens": 834308479.0, + "step": 21864 + }, + { + "epoch": 2.7814527413815036, + "ewc_loss": 0.051002923399209976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5501462005195208e-05, + "grad_norm": 31.747678756713867, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.8804789185523987, + "num_tokens": 834346185.0, + "step": 21865 + }, + { + "epoch": 2.781579951660094, + "ewc_loss": 0.05111752077937126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5558760171406902e-05, + "grad_norm": 31.73177719116211, + "learning_rate": 1e-06, + "loss": 0.3955, + 
"mean_token_accuracy": 0.8790841102600098, + "num_tokens": 834384404.0, + "step": 21866 + }, + { + "epoch": 2.7817071619386846, + "ewc_loss": 0.05101240053772926, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5506200472591445e-05, + "grad_norm": 31.76275062561035, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.869867742061615, + "num_tokens": 834424833.0, + "step": 21867 + }, + { + "epoch": 2.781834372217275, + "ewc_loss": 0.051027823239564896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5513911168673076e-05, + "grad_norm": 31.584888458251953, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8730433583259583, + "num_tokens": 834464215.0, + "step": 21868 + }, + { + "epoch": 2.7819615824958657, + "ewc_loss": 0.05098859220743179, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549429700593464e-05, + "grad_norm": 31.803661346435547, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8488006591796875, + "num_tokens": 834502452.0, + "step": 21869 + }, + { + "epoch": 2.782088792774456, + "ewc_loss": 0.05104083567857742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552041769376956e-05, + "grad_norm": 31.698528289794922, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8723913431167603, + "num_tokens": 834539690.0, + "step": 21870 + }, + { + "epoch": 2.7822160030530467, + "ewc_loss": 0.050890784710645676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5445391656830907e-05, + "grad_norm": 31.668155670166016, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8615025281906128, + "num_tokens": 834583193.0, + "step": 21871 + }, + { + "epoch": 2.7823432133316373, + "ewc_loss": 0.0510583370923996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552916885179002e-05, + "grad_norm": 31.863927841186523, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.865465521812439, + "num_tokens": 834620028.0, + "step": 21872 + }, + { + "epoch": 
2.782470423610228, + "ewc_loss": 0.050927065312862396, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.546353243815247e-05, + "grad_norm": 31.64744758605957, + "learning_rate": 1e-06, + "loss": 0.4477, + "mean_token_accuracy": 0.8614301681518555, + "num_tokens": 834657351.0, + "step": 21873 + }, + { + "epoch": 2.7825976338888183, + "ewc_loss": 0.050959255546331406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5479628675384447e-05, + "grad_norm": 31.82718276977539, + "learning_rate": 1e-06, + "loss": 0.3575, + "mean_token_accuracy": 0.8862701058387756, + "num_tokens": 834693687.0, + "step": 21874 + }, + { + "epoch": 2.782724844167409, + "ewc_loss": 0.05100230872631073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550115459598601e-05, + "grad_norm": 31.698009490966797, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.870489239692688, + "num_tokens": 834736171.0, + "step": 21875 + }, + { + "epoch": 2.7828520544459994, + "ewc_loss": 0.05100660398602486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5503302822471596e-05, + "grad_norm": 31.8593807220459, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8855476379394531, + "num_tokens": 834775699.0, + "step": 21876 + }, + { + "epoch": 2.78297926472459, + "ewc_loss": 0.050882089883089066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5441044272156432e-05, + "grad_norm": 31.603967666625977, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8655858039855957, + "num_tokens": 834816053.0, + "step": 21877 + }, + { + "epoch": 2.78310647500318, + "ewc_loss": 0.05091163516044617, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5455818104092032e-05, + "grad_norm": 31.6270694732666, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.8669439554214478, + "num_tokens": 834857125.0, + "step": 21878 + }, + { + "epoch": 2.783233685281771, + "ewc_loss": 0.05092282593250275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.546141331549734e-05, + "grad_norm": 31.6911563873291, + "learning_rate": 1e-06, + "loss": 0.3886, + "mean_token_accuracy": 0.8847339153289795, + "num_tokens": 834895676.0, + "step": 21879 + }, + { + "epoch": 2.783360895560361, + "ewc_loss": 0.051045630127191544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5522815121803433e-05, + "grad_norm": 31.70456886291504, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8757350444793701, + "num_tokens": 834927748.0, + "step": 21880 + }, + { + "epoch": 2.783488105838952, + "ewc_loss": 0.05095597356557846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5477986127953045e-05, + "grad_norm": 31.72178840637207, + "learning_rate": 1e-06, + "loss": 0.44, + "mean_token_accuracy": 0.8677624464035034, + "num_tokens": 834966722.0, + "step": 21881 + }, + { + "epoch": 2.783615316117542, + "ewc_loss": 0.05107208341360092, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5536040993756615e-05, + "grad_norm": 31.76755142211914, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8725438117980957, + "num_tokens": 835003770.0, + "step": 21882 + }, + { + "epoch": 2.783742526396133, + "ewc_loss": 0.05098423734307289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5492117856629193e-05, + "grad_norm": 31.68393898010254, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8804668188095093, + "num_tokens": 835041504.0, + "step": 21883 + }, + { + "epoch": 2.783869736674723, + "ewc_loss": 0.050949301570653915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5474650101386942e-05, + "grad_norm": 31.601564407348633, + "learning_rate": 1e-06, + "loss": 0.4173, + "mean_token_accuracy": 0.8739656209945679, + "num_tokens": 835075179.0, + "step": 21884 + }, + { + "epoch": 2.7839969469533137, + "ewc_loss": 0.050999127328395844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5499562980257906e-05, + "grad_norm": 31.756088256835938, + "learning_rate": 1e-06, + "loss": 0.4829, + 
"mean_token_accuracy": 0.8604742288589478, + "num_tokens": 835113080.0, + "step": 21885 + }, + { + "epoch": 2.784124157231904, + "ewc_loss": 0.05105140060186386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5525700038997456e-05, + "grad_norm": 31.702198028564453, + "learning_rate": 1e-06, + "loss": 0.4209, + "mean_token_accuracy": 0.8709579110145569, + "num_tokens": 835150586.0, + "step": 21886 + }, + { + "epoch": 2.7842513675104947, + "ewc_loss": 0.05099258944392204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5496294256299734e-05, + "grad_norm": 31.727964401245117, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8693606853485107, + "num_tokens": 835184049.0, + "step": 21887 + }, + { + "epoch": 2.7843785777890853, + "ewc_loss": 0.05100727453827858, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550363751652185e-05, + "grad_norm": 31.661727905273438, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8665202856063843, + "num_tokens": 835222745.0, + "step": 21888 + }, + { + "epoch": 2.784505788067676, + "ewc_loss": 0.05102205276489258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5511026251479052e-05, + "grad_norm": 31.71979331970215, + "learning_rate": 1e-06, + "loss": 0.4504, + "mean_token_accuracy": 0.8594710826873779, + "num_tokens": 835263017.0, + "step": 21889 + }, + { + "epoch": 2.7846329983462663, + "ewc_loss": 0.05108968913555145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5544844902469777e-05, + "grad_norm": 31.750154495239258, + "learning_rate": 1e-06, + "loss": 0.3754, + "mean_token_accuracy": 0.8852263689041138, + "num_tokens": 835303916.0, + "step": 21890 + }, + { + "epoch": 2.784760208624857, + "ewc_loss": 0.05095365270972252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5476825612713583e-05, + "grad_norm": 31.74250602722168, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.886191725730896, + "num_tokens": 835343148.0, + "step": 21891 + }, + { + "epoch": 
2.7848874189034474, + "ewc_loss": 0.05106800049543381, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5534000087645836e-05, + "grad_norm": 31.747028350830078, + "learning_rate": 1e-06, + "loss": 0.429, + "mean_token_accuracy": 0.8684931993484497, + "num_tokens": 835382337.0, + "step": 21892 + }, + { + "epoch": 2.785014629182038, + "ewc_loss": 0.05100071430206299, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5500357878627256e-05, + "grad_norm": 31.757822036743164, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8651632070541382, + "num_tokens": 835423645.0, + "step": 21893 + }, + { + "epoch": 2.7851418394606284, + "ewc_loss": 0.05102203041315079, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.551101533754263e-05, + "grad_norm": 31.742212295532227, + "learning_rate": 1e-06, + "loss": 0.4981, + "mean_token_accuracy": 0.8437963128089905, + "num_tokens": 835464590.0, + "step": 21894 + }, + { + "epoch": 2.785269049739219, + "ewc_loss": 0.050998248159885406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549912460381165e-05, + "grad_norm": 31.879032135009766, + "learning_rate": 1e-06, + "loss": 0.3509, + "mean_token_accuracy": 0.8945444226264954, + "num_tokens": 835496376.0, + "step": 21895 + }, + { + "epoch": 2.7853962600178095, + "ewc_loss": 0.05096770450472832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548385236877948e-05, + "grad_norm": 31.601469039916992, + "learning_rate": 1e-06, + "loss": 0.4688, + "mean_token_accuracy": 0.8572870492935181, + "num_tokens": 835541231.0, + "step": 21896 + }, + { + "epoch": 2.7855234702964, + "ewc_loss": 0.050998058170080185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5499028197373264e-05, + "grad_norm": 31.878999710083008, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8746315240859985, + "num_tokens": 835584038.0, + "step": 21897 + }, + { + "epoch": 2.7856506805749905, + "ewc_loss": 0.051117192953825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5558596462360583e-05, + "grad_norm": 31.773286819458008, + "learning_rate": 1e-06, + "loss": 0.4425, + "mean_token_accuracy": 0.8655673265457153, + "num_tokens": 835627189.0, + "step": 21898 + }, + { + "epoch": 2.785777890853581, + "ewc_loss": 0.050861015915870667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5430508685531095e-05, + "grad_norm": 31.79324722290039, + "learning_rate": 1e-06, + "loss": 0.4077, + "mean_token_accuracy": 0.8716355562210083, + "num_tokens": 835659701.0, + "step": 21899 + }, + { + "epoch": 2.7859051011321716, + "ewc_loss": 0.05104896053671837, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5524479497107677e-05, + "grad_norm": 31.87693214416504, + "learning_rate": 1e-06, + "loss": 0.386, + "mean_token_accuracy": 0.87734454870224, + "num_tokens": 835694045.0, + "step": 21900 + }, + { + "epoch": 2.786032311410762, + "ewc_loss": 0.05089447647333145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5447237931075506e-05, + "grad_norm": 31.708988189697266, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8694919347763062, + "num_tokens": 835732118.0, + "step": 21901 + }, + { + "epoch": 2.7861595216893527, + "ewc_loss": 0.05097261816263199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548630982346367e-05, + "grad_norm": 31.79426383972168, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8799246549606323, + "num_tokens": 835772700.0, + "step": 21902 + }, + { + "epoch": 2.7862867319679427, + "ewc_loss": 0.05098903551697731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549451710365247e-05, + "grad_norm": 31.839088439941406, + "learning_rate": 1e-06, + "loss": 0.483, + "mean_token_accuracy": 0.8558111190795898, + "num_tokens": 835811206.0, + "step": 21903 + }, + { + "epoch": 2.7864139422465337, + "ewc_loss": 0.050959885120391846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.547994336055126e-05, + "grad_norm": 31.71131706237793, + "learning_rate": 1e-06, + "loss": 0.4009, + 
"mean_token_accuracy": 0.8774163722991943, + "num_tokens": 835844381.0, + "step": 21904 + }, + { + "epoch": 2.786541152525124, + "ewc_loss": 0.050985075533390045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5492538043181412e-05, + "grad_norm": 32.03272247314453, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8637874722480774, + "num_tokens": 835879926.0, + "step": 21905 + }, + { + "epoch": 2.7866683628037148, + "ewc_loss": 0.05092360079288483, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5461800760240294e-05, + "grad_norm": 31.706655502319336, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8611549139022827, + "num_tokens": 835924014.0, + "step": 21906 + }, + { + "epoch": 2.786795573082305, + "ewc_loss": 0.050819385796785355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5409692170796916e-05, + "grad_norm": 31.67519187927246, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8650784492492676, + "num_tokens": 835956054.0, + "step": 21907 + }, + { + "epoch": 2.786922783360896, + "ewc_loss": 0.051060739904642105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.553036938479636e-05, + "grad_norm": 31.9083251953125, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8776642084121704, + "num_tokens": 835988885.0, + "step": 21908 + }, + { + "epoch": 2.787049993639486, + "ewc_loss": 0.05099610239267349, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549805140006356e-05, + "grad_norm": 31.677457809448242, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8702701926231384, + "num_tokens": 836028885.0, + "step": 21909 + }, + { + "epoch": 2.7871772039180764, + "ewc_loss": 0.05082729831337929, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5413648472749628e-05, + "grad_norm": 31.783246994018555, + "learning_rate": 1e-06, + "loss": 0.3863, + "mean_token_accuracy": 0.8852641582489014, + "num_tokens": 836067340.0, + "step": 21910 + }, + { + "epoch": 
2.787304414196667, + "ewc_loss": 0.05101049691438675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550524914113339e-05, + "grad_norm": 31.730390548706055, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8707356452941895, + "num_tokens": 836104432.0, + "step": 21911 + }, + { + "epoch": 2.7874316244752575, + "ewc_loss": 0.05085011571645737, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5425057174288668e-05, + "grad_norm": 31.802181243896484, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.866416335105896, + "num_tokens": 836141197.0, + "step": 21912 + }, + { + "epoch": 2.787558834753848, + "ewc_loss": 0.050972480326890945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5486240701866336e-05, + "grad_norm": 31.782251358032227, + "learning_rate": 1e-06, + "loss": 0.3878, + "mean_token_accuracy": 0.8828559517860413, + "num_tokens": 836176949.0, + "step": 21913 + }, + { + "epoch": 2.7876860450324386, + "ewc_loss": 0.05093316733837128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5466582883382216e-05, + "grad_norm": 31.75846290588379, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8687086701393127, + "num_tokens": 836206484.0, + "step": 21914 + }, + { + "epoch": 2.787813255311029, + "ewc_loss": 0.05088286101818085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5441429897909984e-05, + "grad_norm": 31.767946243286133, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8766764402389526, + "num_tokens": 836243056.0, + "step": 21915 + }, + { + "epoch": 2.7879404655896196, + "ewc_loss": 0.050880178809165955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.544008930271957e-05, + "grad_norm": 31.64981460571289, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8664250373840332, + "num_tokens": 836281154.0, + "step": 21916 + }, + { + "epoch": 2.78806767586821, + "ewc_loss": 0.05092344805598259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5461724362685345e-05, + "grad_norm": 31.78310203552246, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8708473443984985, + "num_tokens": 836319895.0, + "step": 21917 + }, + { + "epoch": 2.7881948861468007, + "ewc_loss": 0.050973907113075256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5486953745712526e-05, + "grad_norm": 31.755788803100586, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.87646484375, + "num_tokens": 836357227.0, + "step": 21918 + }, + { + "epoch": 2.788322096425391, + "ewc_loss": 0.05098939314484596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549469718360342e-05, + "grad_norm": 31.763439178466797, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8644850850105286, + "num_tokens": 836398442.0, + "step": 21919 + }, + { + "epoch": 2.7884493067039817, + "ewc_loss": 0.05105087161064148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5525436285533942e-05, + "grad_norm": 31.87904930114746, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8794786930084229, + "num_tokens": 836436660.0, + "step": 21920 + }, + { + "epoch": 2.7885765169825723, + "ewc_loss": 0.05103220045566559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.551609941292554e-05, + "grad_norm": 31.817607879638672, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8659631609916687, + "num_tokens": 836479180.0, + "step": 21921 + }, + { + "epoch": 2.788703727261163, + "ewc_loss": 0.050971049815416336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548552402004134e-05, + "grad_norm": 31.864728927612305, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8773597478866577, + "num_tokens": 836518874.0, + "step": 21922 + }, + { + "epoch": 2.7888309375397533, + "ewc_loss": 0.05092747509479523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.546373798395507e-05, + "grad_norm": 31.818187713623047, + "learning_rate": 1e-06, + "loss": 0.4295, + 
"mean_token_accuracy": 0.8683240413665771, + "num_tokens": 836556910.0, + "step": 21923 + }, + { + "epoch": 2.788958147818344, + "ewc_loss": 0.050860390067100525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5430195819353685e-05, + "grad_norm": 31.711734771728516, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8833844661712646, + "num_tokens": 836598151.0, + "step": 21924 + }, + { + "epoch": 2.7890853580969344, + "ewc_loss": 0.05094650760293007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.547325311752502e-05, + "grad_norm": 31.98440170288086, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8640996217727661, + "num_tokens": 836637952.0, + "step": 21925 + }, + { + "epoch": 2.789212568375525, + "ewc_loss": 0.05101381242275238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550690624048002e-05, + "grad_norm": 31.730125427246094, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8854875564575195, + "num_tokens": 836677138.0, + "step": 21926 + }, + { + "epoch": 2.7893397786541154, + "ewc_loss": 0.050849609076976776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5424804334761575e-05, + "grad_norm": 31.879215240478516, + "learning_rate": 1e-06, + "loss": 0.4035, + "mean_token_accuracy": 0.8757998943328857, + "num_tokens": 836716880.0, + "step": 21927 + }, + { + "epoch": 2.7894669889327055, + "ewc_loss": 0.05097153037786484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548576594563201e-05, + "grad_norm": 31.737796783447266, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8851709365844727, + "num_tokens": 836752572.0, + "step": 21928 + }, + { + "epoch": 2.7895941992112965, + "ewc_loss": 0.0508764311671257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.543821574363392e-05, + "grad_norm": 31.867982864379883, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8682442903518677, + "num_tokens": 836794321.0, + "step": 21929 + }, + { + "epoch": 
2.7897214094898866, + "ewc_loss": 0.05097798630595207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54889928328339e-05, + "grad_norm": 31.94230842590332, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.885273814201355, + "num_tokens": 836829183.0, + "step": 21930 + }, + { + "epoch": 2.7898486197684775, + "ewc_loss": 0.050878822803497314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5439410819672048e-05, + "grad_norm": 31.816251754760742, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8612226247787476, + "num_tokens": 836864644.0, + "step": 21931 + }, + { + "epoch": 2.7899758300470676, + "ewc_loss": 0.050765134394168854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.538256740081124e-05, + "grad_norm": 31.77363395690918, + "learning_rate": 1e-06, + "loss": 0.3834, + "mean_token_accuracy": 0.8839675784111023, + "num_tokens": 836895384.0, + "step": 21932 + }, + { + "epoch": 2.7901030403256586, + "ewc_loss": 0.050957631319761276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5478815587121062e-05, + "grad_norm": 31.96517562866211, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8739251494407654, + "num_tokens": 836943336.0, + "step": 21933 + }, + { + "epoch": 2.7902302506042487, + "ewc_loss": 0.05089084804058075, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.544542439864017e-05, + "grad_norm": 31.919984817504883, + "learning_rate": 1e-06, + "loss": 0.3709, + "mean_token_accuracy": 0.8865724802017212, + "num_tokens": 836980708.0, + "step": 21934 + }, + { + "epoch": 2.790357460882839, + "ewc_loss": 0.05081060156226158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5405301130376756e-05, + "grad_norm": 31.880542755126953, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.8757623434066772, + "num_tokens": 837015738.0, + "step": 21935 + }, + { + "epoch": 2.7904846711614297, + "ewc_loss": 0.05081182345747948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5405912310816348e-05, + "grad_norm": 31.93044662475586, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8777374625205994, + "num_tokens": 837053014.0, + "step": 21936 + }, + { + "epoch": 2.7906118814400203, + "ewc_loss": 0.05075373128056526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5376864869031124e-05, + "grad_norm": 31.63728904724121, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.866523027420044, + "num_tokens": 837083579.0, + "step": 21937 + }, + { + "epoch": 2.790739091718611, + "ewc_loss": 0.050799231976270676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5399616788490675e-05, + "grad_norm": 31.80483055114746, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8799774646759033, + "num_tokens": 837126201.0, + "step": 21938 + }, + { + "epoch": 2.7908663019972013, + "ewc_loss": 0.05097734183073044, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5488670871709473e-05, + "grad_norm": 31.912918090820312, + "learning_rate": 1e-06, + "loss": 0.3899, + "mean_token_accuracy": 0.8814077377319336, + "num_tokens": 837162400.0, + "step": 21939 + }, + { + "epoch": 2.790993512275792, + "ewc_loss": 0.05080601945519447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.540300920372829e-05, + "grad_norm": 31.6708984375, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8753921985626221, + "num_tokens": 837205733.0, + "step": 21940 + }, + { + "epoch": 2.7911207225543824, + "ewc_loss": 0.050829771906137466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.541488538554404e-05, + "grad_norm": 31.786741256713867, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8607434034347534, + "num_tokens": 837243838.0, + "step": 21941 + }, + { + "epoch": 2.791247932832973, + "ewc_loss": 0.0510258786380291, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5512939828331582e-05, + "grad_norm": 31.838911056518555, + "learning_rate": 1e-06, + "loss": 0.4257, + 
"mean_token_accuracy": 0.868092954158783, + "num_tokens": 837280936.0, + "step": 21942 + }, + { + "epoch": 2.7913751431115634, + "ewc_loss": 0.0509498231112957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5474912035861053e-05, + "grad_norm": 31.80136489868164, + "learning_rate": 1e-06, + "loss": 0.412, + "mean_token_accuracy": 0.871575117111206, + "num_tokens": 837321790.0, + "step": 21943 + }, + { + "epoch": 2.791502353390154, + "ewc_loss": 0.050986118614673615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549306009314023e-05, + "grad_norm": 31.90705680847168, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8741543292999268, + "num_tokens": 837362023.0, + "step": 21944 + }, + { + "epoch": 2.7916295636687445, + "ewc_loss": 0.05077766254544258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5388832000317052e-05, + "grad_norm": 31.68136215209961, + "learning_rate": 1e-06, + "loss": 0.3907, + "mean_token_accuracy": 0.8788120150566101, + "num_tokens": 837404876.0, + "step": 21945 + }, + { + "epoch": 2.791756773947335, + "ewc_loss": 0.0508735328912735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5436766009079292e-05, + "grad_norm": 31.804948806762695, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8648576140403748, + "num_tokens": 837442899.0, + "step": 21946 + }, + { + "epoch": 2.7918839842259255, + "ewc_loss": 0.05082887411117554, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5414437914150767e-05, + "grad_norm": 31.66672134399414, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8822009563446045, + "num_tokens": 837477611.0, + "step": 21947 + }, + { + "epoch": 2.792011194504516, + "ewc_loss": 0.050904449075460434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5452223781030625e-05, + "grad_norm": 31.74237060546875, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.8589377999305725, + "num_tokens": 837515928.0, + "step": 21948 + }, + { + "epoch": 
2.7921384047831066, + "ewc_loss": 0.050938114523887634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.546905670897104e-05, + "grad_norm": 31.74095916748047, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8725159764289856, + "num_tokens": 837553042.0, + "step": 21949 + }, + { + "epoch": 2.792265615061697, + "ewc_loss": 0.05090733990073204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5453669877606444e-05, + "grad_norm": 31.621273040771484, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8776376247406006, + "num_tokens": 837589749.0, + "step": 21950 + }, + { + "epoch": 2.7923928253402877, + "ewc_loss": 0.05093582347035408, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5467912564636208e-05, + "grad_norm": 31.717788696289062, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8900927305221558, + "num_tokens": 837631104.0, + "step": 21951 + }, + { + "epoch": 2.792520035618878, + "ewc_loss": 0.05099208280444145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549604141677264e-05, + "grad_norm": 31.741262435913086, + "learning_rate": 1e-06, + "loss": 0.3879, + "mean_token_accuracy": 0.8813521862030029, + "num_tokens": 837668173.0, + "step": 21952 + }, + { + "epoch": 2.7926472458974683, + "ewc_loss": 0.05087386816740036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.543693335610442e-05, + "grad_norm": 31.703895568847656, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8787466883659363, + "num_tokens": 837704282.0, + "step": 21953 + }, + { + "epoch": 2.7927744561760592, + "ewc_loss": 0.050988297909498215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5494149667792954e-05, + "grad_norm": 31.786296844482422, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.863470196723938, + "num_tokens": 837750790.0, + "step": 21954 + }, + { + "epoch": 2.7929016664546493, + "ewc_loss": 0.05101318657398224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.550659337430261e-05, + "grad_norm": 31.78260040283203, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8695698380470276, + "num_tokens": 837792041.0, + "step": 21955 + }, + { + "epoch": 2.7930288767332403, + "ewc_loss": 0.0509486123919487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5474306312389672e-05, + "grad_norm": 31.74264144897461, + "learning_rate": 1e-06, + "loss": 0.3908, + "mean_token_accuracy": 0.8785343170166016, + "num_tokens": 837818645.0, + "step": 21956 + }, + { + "epoch": 2.7931560870118304, + "ewc_loss": 0.050940774381160736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.547038639022503e-05, + "grad_norm": 31.66042709350586, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8818659782409668, + "num_tokens": 837859816.0, + "step": 21957 + }, + { + "epoch": 2.793283297290421, + "ewc_loss": 0.05100541189312935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5502706193947233e-05, + "grad_norm": 31.758119583129883, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8676347732543945, + "num_tokens": 837903970.0, + "step": 21958 + }, + { + "epoch": 2.7934105075690114, + "ewc_loss": 0.0510491207242012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5524559532641433e-05, + "grad_norm": 31.72295379638672, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8722456097602844, + "num_tokens": 837941158.0, + "step": 21959 + }, + { + "epoch": 2.793537717847602, + "ewc_loss": 0.05099250003695488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549625060055405e-05, + "grad_norm": 31.807287216186523, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8783202171325684, + "num_tokens": 837978157.0, + "step": 21960 + }, + { + "epoch": 2.7936649281261925, + "ewc_loss": 0.05107702687382698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5538513000356033e-05, + "grad_norm": 31.8492431640625, + "learning_rate": 1e-06, + "loss": 0.3771, + 
"mean_token_accuracy": 0.8838373422622681, + "num_tokens": 838017871.0, + "step": 21961 + }, + { + "epoch": 2.793792138404783, + "ewc_loss": 0.05099129676818848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5495648515061475e-05, + "grad_norm": 31.82317352294922, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.883872926235199, + "num_tokens": 838050521.0, + "step": 21962 + }, + { + "epoch": 2.7939193486833735, + "ewc_loss": 0.05094568431377411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5472842025919817e-05, + "grad_norm": 31.69982147216797, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8807135224342346, + "num_tokens": 838084450.0, + "step": 21963 + }, + { + "epoch": 2.794046558961964, + "ewc_loss": 0.050964660942554474, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5482329874648713e-05, + "grad_norm": 31.800073623657227, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.867082953453064, + "num_tokens": 838121505.0, + "step": 21964 + }, + { + "epoch": 2.7941737692405546, + "ewc_loss": 0.05101090297102928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5505451048957184e-05, + "grad_norm": 31.85354232788086, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8713600635528564, + "num_tokens": 838166148.0, + "step": 21965 + }, + { + "epoch": 2.794300979519145, + "ewc_loss": 0.05095933377742767, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.547966687416192e-05, + "grad_norm": 31.789823532104492, + "learning_rate": 1e-06, + "loss": 0.4037, + "mean_token_accuracy": 0.8774399757385254, + "num_tokens": 838203578.0, + "step": 21966 + }, + { + "epoch": 2.7944281897977357, + "ewc_loss": 0.05094131827354431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5470659238635562e-05, + "grad_norm": 31.803035736083984, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8770685195922852, + "num_tokens": 838240702.0, + "step": 21967 + }, + { + "epoch": 
2.794555400076326, + "ewc_loss": 0.05101088434457779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5505441954010166e-05, + "grad_norm": 31.71798324584961, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8796239495277405, + "num_tokens": 838275639.0, + "step": 21968 + }, + { + "epoch": 2.7946826103549167, + "ewc_loss": 0.05092758312821388, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5463790734647773e-05, + "grad_norm": 31.73977279663086, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8736050724983215, + "num_tokens": 838310045.0, + "step": 21969 + }, + { + "epoch": 2.7948098206335072, + "ewc_loss": 0.05106709152460098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.553354534029495e-05, + "grad_norm": 31.736011505126953, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8735794425010681, + "num_tokens": 838351571.0, + "step": 21970 + }, + { + "epoch": 2.7949370309120978, + "ewc_loss": 0.05100776255130768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5503881261101924e-05, + "grad_norm": 31.7789363861084, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8844283819198608, + "num_tokens": 838393804.0, + "step": 21971 + }, + { + "epoch": 2.7950642411906883, + "ewc_loss": 0.05085158720612526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5425793864997104e-05, + "grad_norm": 31.57878303527832, + "learning_rate": 1e-06, + "loss": 0.421, + "mean_token_accuracy": 0.8721258640289307, + "num_tokens": 838435516.0, + "step": 21972 + }, + { + "epoch": 2.795191451469279, + "ewc_loss": 0.0509444959461689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5472247216384858e-05, + "grad_norm": 31.799644470214844, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8806886672973633, + "num_tokens": 838465425.0, + "step": 21973 + }, + { + "epoch": 2.7953186617478694, + "ewc_loss": 0.05107692629098892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5538463887642138e-05, + "grad_norm": 31.831239700317383, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8702278733253479, + "num_tokens": 838512832.0, + "step": 21974 + }, + { + "epoch": 2.79544587202646, + "ewc_loss": 0.05102404206991196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.551202123868279e-05, + "grad_norm": 31.84307098388672, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8786935210227966, + "num_tokens": 838552806.0, + "step": 21975 + }, + { + "epoch": 2.79557308230505, + "ewc_loss": 0.05100617557764053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5503088181721978e-05, + "grad_norm": 31.79582405090332, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8705147504806519, + "num_tokens": 838588592.0, + "step": 21976 + }, + { + "epoch": 2.795700292583641, + "ewc_loss": 0.05103833228349686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.551916622905992e-05, + "grad_norm": 31.872900009155273, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8706492781639099, + "num_tokens": 838629402.0, + "step": 21977 + }, + { + "epoch": 2.795827502862231, + "ewc_loss": 0.05093150585889816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54657534242142e-05, + "grad_norm": 31.57497787475586, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8636324405670166, + "num_tokens": 838676580.0, + "step": 21978 + }, + { + "epoch": 2.795954713140822, + "ewc_loss": 0.051014695316553116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5507348254905082e-05, + "grad_norm": 31.900667190551758, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8672010898590088, + "num_tokens": 838715773.0, + "step": 21979 + }, + { + "epoch": 2.796081923419412, + "ewc_loss": 0.05101598799228668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550799399614334e-05, + "grad_norm": 31.72518539428711, + "learning_rate": 1e-06, + "loss": 0.4533, + 
"mean_token_accuracy": 0.8623020052909851, + "num_tokens": 838750270.0, + "step": 21980 + }, + { + "epoch": 2.796209133698003, + "ewc_loss": 0.05089356377720833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5446781364735216e-05, + "grad_norm": 31.859773635864258, + "learning_rate": 1e-06, + "loss": 0.4816, + "mean_token_accuracy": 0.8510856032371521, + "num_tokens": 838794823.0, + "step": 21981 + }, + { + "epoch": 2.796336343976593, + "ewc_loss": 0.05105172097682953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552586011006497e-05, + "grad_norm": 31.73715591430664, + "learning_rate": 1e-06, + "loss": 0.4644, + "mean_token_accuracy": 0.8648220896720886, + "num_tokens": 838828901.0, + "step": 21982 + }, + { + "epoch": 2.7964635542551837, + "ewc_loss": 0.05083190277218819, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5415951313334517e-05, + "grad_norm": 31.74753189086914, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8635761737823486, + "num_tokens": 838872648.0, + "step": 21983 + }, + { + "epoch": 2.796590764533774, + "ewc_loss": 0.0510086789727211, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5504339646431617e-05, + "grad_norm": 31.76681900024414, + "learning_rate": 1e-06, + "loss": 0.3749, + "mean_token_accuracy": 0.8855623006820679, + "num_tokens": 838905150.0, + "step": 21984 + }, + { + "epoch": 2.7967179748123647, + "ewc_loss": 0.051035601645708084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5517800168017857e-05, + "grad_norm": 31.840965270996094, + "learning_rate": 1e-06, + "loss": 0.4239, + "mean_token_accuracy": 0.8723715543746948, + "num_tokens": 838949027.0, + "step": 21985 + }, + { + "epoch": 2.7968451850909553, + "ewc_loss": 0.050977952778339386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548897646192927e-05, + "grad_norm": 31.827341079711914, + "learning_rate": 1e-06, + "loss": 0.4114, + "mean_token_accuracy": 0.8735560178756714, + "num_tokens": 838987320.0, + "step": 21986 + }, + { + "epoch": 
2.796972395369546, + "ewc_loss": 0.05093313381075859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5466566512477584e-05, + "grad_norm": 31.807132720947266, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.867827296257019, + "num_tokens": 839027396.0, + "step": 21987 + }, + { + "epoch": 2.7970996056481363, + "ewc_loss": 0.050884004682302475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.54420028795721e-05, + "grad_norm": 31.726787567138672, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8790320158004761, + "num_tokens": 839069779.0, + "step": 21988 + }, + { + "epoch": 2.797226815926727, + "ewc_loss": 0.05093108117580414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5465540602453984e-05, + "grad_norm": 31.74822235107422, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8790042996406555, + "num_tokens": 839110919.0, + "step": 21989 + }, + { + "epoch": 2.7973540262053174, + "ewc_loss": 0.050959520041942596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5479759642621502e-05, + "grad_norm": 31.79268455505371, + "learning_rate": 1e-06, + "loss": 0.3797, + "mean_token_accuracy": 0.8866968750953674, + "num_tokens": 839143689.0, + "step": 21990 + }, + { + "epoch": 2.797481236483908, + "ewc_loss": 0.050968676805496216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5484338038950227e-05, + "grad_norm": 31.866910934448242, + "learning_rate": 1e-06, + "loss": 0.479, + "mean_token_accuracy": 0.8537396192550659, + "num_tokens": 839181274.0, + "step": 21991 + }, + { + "epoch": 2.7976084467624984, + "ewc_loss": 0.050958555191755295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5479277610429563e-05, + "grad_norm": 31.670400619506836, + "learning_rate": 1e-06, + "loss": 0.4699, + "mean_token_accuracy": 0.8564577698707581, + "num_tokens": 839217038.0, + "step": 21992 + }, + { + "epoch": 2.797735657041089, + "ewc_loss": 0.0508674755692482, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.543373739172239e-05, + "grad_norm": 31.71748161315918, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.872086226940155, + "num_tokens": 839260104.0, + "step": 21993 + }, + { + "epoch": 2.7978628673196795, + "ewc_loss": 0.05102936923503876, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5514684239169583e-05, + "grad_norm": 31.902559280395508, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8758971095085144, + "num_tokens": 839294328.0, + "step": 21994 + }, + { + "epoch": 2.79799007759827, + "ewc_loss": 0.050970133394002914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548506745370105e-05, + "grad_norm": 31.727903366088867, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8714972734451294, + "num_tokens": 839326662.0, + "step": 21995 + }, + { + "epoch": 2.7981172878768605, + "ewc_loss": 0.05091381072998047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5456905859755352e-05, + "grad_norm": 31.970348358154297, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8733523488044739, + "num_tokens": 839355997.0, + "step": 21996 + }, + { + "epoch": 2.798244498155451, + "ewc_loss": 0.05099966377019882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549983219068963e-05, + "grad_norm": 31.770965576171875, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8779544830322266, + "num_tokens": 839396596.0, + "step": 21997 + }, + { + "epoch": 2.7983717084340416, + "ewc_loss": 0.05097741261124611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548870543250814e-05, + "grad_norm": 31.873950958251953, + "learning_rate": 1e-06, + "loss": 0.4193, + "mean_token_accuracy": 0.8736705780029297, + "num_tokens": 839432251.0, + "step": 21998 + }, + { + "epoch": 2.798498918712632, + "ewc_loss": 0.051042523235082626, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5521261704852805e-05, + "grad_norm": 31.86823272705078, + "learning_rate": 1e-06, + "loss": 0.3847, + 
"mean_token_accuracy": 0.8807923197746277, + "num_tokens": 839465987.0, + "step": 21999 + }, + { + "epoch": 2.7986261289912227, + "ewc_loss": 0.050904158502817154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5452080080867745e-05, + "grad_norm": 31.834028244018555, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8740723133087158, + "num_tokens": 839506147.0, + "step": 22000 + }, + { + "epoch": 2.7987533392698127, + "ewc_loss": 0.05094088986515999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5470444597885944e-05, + "grad_norm": 31.769554138183594, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8648924231529236, + "num_tokens": 839544620.0, + "step": 22001 + }, + { + "epoch": 2.7988805495484037, + "ewc_loss": 0.051059093326330185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5529547201585956e-05, + "grad_norm": 32.031959533691406, + "learning_rate": 1e-06, + "loss": 0.4556, + "mean_token_accuracy": 0.8644925951957703, + "num_tokens": 839583217.0, + "step": 22002 + }, + { + "epoch": 2.799007759826994, + "ewc_loss": 0.050986506044864655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5493252906017005e-05, + "grad_norm": 31.67320442199707, + "learning_rate": 1e-06, + "loss": 0.3703, + "mean_token_accuracy": 0.8906221389770508, + "num_tokens": 839621283.0, + "step": 22003 + }, + { + "epoch": 2.7991349701055848, + "ewc_loss": 0.050875697284936905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5437848307774402e-05, + "grad_norm": 31.764404296875, + "learning_rate": 1e-06, + "loss": 0.4314, + "mean_token_accuracy": 0.8682501316070557, + "num_tokens": 839658072.0, + "step": 22004 + }, + { + "epoch": 2.799262180384175, + "ewc_loss": 0.051091987639665604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5545994503772818e-05, + "grad_norm": 31.79143524169922, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8544602990150452, + "num_tokens": 839700926.0, + "step": 22005 + }, + { + "epoch": 
2.799389390662766, + "ewc_loss": 0.05103164166212082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55158211075468e-05, + "grad_norm": 31.821422576904297, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8820902109146118, + "num_tokens": 839736160.0, + "step": 22006 + }, + { + "epoch": 2.799516600941356, + "ewc_loss": 0.05101082846522331, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5505414669169113e-05, + "grad_norm": 31.789318084716797, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8866064548492432, + "num_tokens": 839776834.0, + "step": 22007 + }, + { + "epoch": 2.7996438112199464, + "ewc_loss": 0.05099114030599594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5495570298517123e-05, + "grad_norm": 31.73357391357422, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8647564649581909, + "num_tokens": 839818156.0, + "step": 22008 + }, + { + "epoch": 2.799771021498537, + "ewc_loss": 0.051057688891887665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5528845071676187e-05, + "grad_norm": 31.835756301879883, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.865994393825531, + "num_tokens": 839857133.0, + "step": 22009 + }, + { + "epoch": 2.7998982317771275, + "ewc_loss": 0.050953418016433716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5476709197391756e-05, + "grad_norm": 31.770610809326172, + "learning_rate": 1e-06, + "loss": 0.4589, + "mean_token_accuracy": 0.8569145798683167, + "num_tokens": 839893129.0, + "step": 22010 + }, + { + "epoch": 2.800025442055718, + "ewc_loss": 0.05096517130732536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5482586352154613e-05, + "grad_norm": 31.919052124023438, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8756452798843384, + "num_tokens": 839924622.0, + "step": 22011 + }, + { + "epoch": 2.8001526523343085, + "ewc_loss": 0.05102693289518356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5513465516269207e-05, + "grad_norm": 31.69502067565918, + "learning_rate": 1e-06, + "loss": 0.3613, + "mean_token_accuracy": 0.8913713097572327, + "num_tokens": 839961276.0, + "step": 22012 + }, + { + "epoch": 2.800279862612899, + "ewc_loss": 0.05083770677447319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5418852601433173e-05, + "grad_norm": 31.772186279296875, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8618766069412231, + "num_tokens": 840002554.0, + "step": 22013 + }, + { + "epoch": 2.8004070728914896, + "ewc_loss": 0.051112253218889236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5556126274750568e-05, + "grad_norm": 31.874317169189453, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8715305328369141, + "num_tokens": 840041642.0, + "step": 22014 + }, + { + "epoch": 2.80053428317008, + "ewc_loss": 0.0508505180478096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.542525908211246e-05, + "grad_norm": 31.67338752746582, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8811430931091309, + "num_tokens": 840083532.0, + "step": 22015 + }, + { + "epoch": 2.8006614934486707, + "ewc_loss": 0.05102058872580528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5510295017738827e-05, + "grad_norm": 31.853614807128906, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8682656288146973, + "num_tokens": 840124221.0, + "step": 22016 + }, + { + "epoch": 2.800788703727261, + "ewc_loss": 0.051010265946388245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5505132725811563e-05, + "grad_norm": 31.87656021118164, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8772661685943604, + "num_tokens": 840162309.0, + "step": 22017 + }, + { + "epoch": 2.8009159140058517, + "ewc_loss": 0.05095828324556351, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5479141186224297e-05, + "grad_norm": 31.906448364257812, + "learning_rate": 1e-06, + "loss": 0.4091, + 
"mean_token_accuracy": 0.8737192749977112, + "num_tokens": 840194778.0, + "step": 22018 + }, + { + "epoch": 2.8010431242844422, + "ewc_loss": 0.05095498636364937, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5477493181824684e-05, + "grad_norm": 31.704092025756836, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8905528783798218, + "num_tokens": 840225081.0, + "step": 22019 + }, + { + "epoch": 2.8011703345630328, + "ewc_loss": 0.05099533870816231, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5497669412288815e-05, + "grad_norm": 31.860811233520508, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.871422529220581, + "num_tokens": 840267698.0, + "step": 22020 + }, + { + "epoch": 2.8012975448416233, + "ewc_loss": 0.05103573575615883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5517867470625788e-05, + "grad_norm": 31.8077392578125, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8652746677398682, + "num_tokens": 840303255.0, + "step": 22021 + }, + { + "epoch": 2.801424755120214, + "ewc_loss": 0.05088141933083534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.544070957810618e-05, + "grad_norm": 31.823692321777344, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8709297180175781, + "num_tokens": 840344616.0, + "step": 22022 + }, + { + "epoch": 2.8015519653988044, + "ewc_loss": 0.051025308668613434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5512654246995226e-05, + "grad_norm": 31.754270553588867, + "learning_rate": 1e-06, + "loss": 0.4055, + "mean_token_accuracy": 0.8731985688209534, + "num_tokens": 840383293.0, + "step": 22023 + }, + { + "epoch": 2.801679175677395, + "ewc_loss": 0.05103336647152901, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.551668330852408e-05, + "grad_norm": 31.792987823486328, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8633386492729187, + "num_tokens": 840424583.0, + "step": 22024 + }, + { + "epoch": 
2.8018063859559854, + "ewc_loss": 0.05108669027686119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5543346055201255e-05, + "grad_norm": 31.870710372924805, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8663069605827332, + "num_tokens": 840458752.0, + "step": 22025 + }, + { + "epoch": 2.8019335962345755, + "ewc_loss": 0.05104035511612892, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552017758716829e-05, + "grad_norm": 31.870315551757812, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8780055642127991, + "num_tokens": 840498441.0, + "step": 22026 + }, + { + "epoch": 2.8020608065131665, + "ewc_loss": 0.05100104585289955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5500523406662978e-05, + "grad_norm": 31.750410079956055, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8829410672187805, + "num_tokens": 840537838.0, + "step": 22027 + }, + { + "epoch": 2.8021880167917566, + "ewc_loss": 0.05101737380027771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5508687031106092e-05, + "grad_norm": 31.92880630493164, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8611990213394165, + "num_tokens": 840571770.0, + "step": 22028 + }, + { + "epoch": 2.8023152270703475, + "ewc_loss": 0.051047392189502716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552369551267475e-05, + "grad_norm": 31.80877685546875, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8673114776611328, + "num_tokens": 840607559.0, + "step": 22029 + }, + { + "epoch": 2.8024424373489376, + "ewc_loss": 0.050972871482372284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5486435333732516e-05, + "grad_norm": 31.856521606445312, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8574711084365845, + "num_tokens": 840650156.0, + "step": 22030 + }, + { + "epoch": 2.8025696476275286, + "ewc_loss": 0.050973955541849136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5486977392574772e-05, + "grad_norm": 31.7436466217041, + "learning_rate": 1e-06, + "loss": 0.4538, + "mean_token_accuracy": 0.8642791509628296, + "num_tokens": 840686590.0, + "step": 22031 + }, + { + "epoch": 2.8026968579061187, + "ewc_loss": 0.05096273124217987, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5481365810264833e-05, + "grad_norm": 31.711782455444336, + "learning_rate": 1e-06, + "loss": 0.45, + "mean_token_accuracy": 0.8647401332855225, + "num_tokens": 840724445.0, + "step": 22032 + }, + { + "epoch": 2.802824068184709, + "ewc_loss": 0.051130421459674835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5565210307831876e-05, + "grad_norm": 31.988954544067383, + "learning_rate": 1e-06, + "loss": 0.3691, + "mean_token_accuracy": 0.8870474696159363, + "num_tokens": 840765047.0, + "step": 22033 + }, + { + "epoch": 2.8029512784632997, + "ewc_loss": 0.05101119354367256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5505596568109468e-05, + "grad_norm": 31.72432518005371, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8595811724662781, + "num_tokens": 840802375.0, + "step": 22034 + }, + { + "epoch": 2.8030784887418903, + "ewc_loss": 0.05103949084877968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5519744667690247e-05, + "grad_norm": 31.93330955505371, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8710540533065796, + "num_tokens": 840839994.0, + "step": 22035 + }, + { + "epoch": 2.803205699020481, + "ewc_loss": 0.05104156956076622, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5520785129629076e-05, + "grad_norm": 31.659862518310547, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8702144622802734, + "num_tokens": 840877144.0, + "step": 22036 + }, + { + "epoch": 2.8033329092990713, + "ewc_loss": 0.05100364610552788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550182216509711e-05, + "grad_norm": 31.8776912689209, + "learning_rate": 1e-06, + "loss": 0.4514, + 
"mean_token_accuracy": 0.8632024526596069, + "num_tokens": 840918478.0, + "step": 22037 + }, + { + "epoch": 2.803460119577662, + "ewc_loss": 0.05115596577525139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5577983251423575e-05, + "grad_norm": 31.756559371948242, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8810646533966064, + "num_tokens": 840958484.0, + "step": 22038 + }, + { + "epoch": 2.8035873298562524, + "ewc_loss": 0.050997696816921234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5498848117422312e-05, + "grad_norm": 31.78712272644043, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.874891996383667, + "num_tokens": 841000035.0, + "step": 22039 + }, + { + "epoch": 2.803714540134843, + "ewc_loss": 0.051076389849185944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5538194677210413e-05, + "grad_norm": 31.76284408569336, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.8668732643127441, + "num_tokens": 841035836.0, + "step": 22040 + }, + { + "epoch": 2.8038417504134334, + "ewc_loss": 0.05101153254508972, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5505765734123997e-05, + "grad_norm": 31.81251335144043, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8731933832168579, + "num_tokens": 841069295.0, + "step": 22041 + }, + { + "epoch": 2.803968960692024, + "ewc_loss": 0.0511111356317997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555556784500368e-05, + "grad_norm": 31.740633010864258, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.8679710626602173, + "num_tokens": 841108341.0, + "step": 22042 + }, + { + "epoch": 2.8040961709706145, + "ewc_loss": 0.05106135085225105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5530674975016154e-05, + "grad_norm": 31.788654327392578, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8816321492195129, + "num_tokens": 841142945.0, + "step": 22043 + }, + { + "epoch": 
2.804223381249205, + "ewc_loss": 0.051068954169750214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5534476662869565e-05, + "grad_norm": 31.72439193725586, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8849323987960815, + "num_tokens": 841174137.0, + "step": 22044 + }, + { + "epoch": 2.8043505915277955, + "ewc_loss": 0.05109017342329025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5545086828060448e-05, + "grad_norm": 31.888349533081055, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8820275068283081, + "num_tokens": 841215914.0, + "step": 22045 + }, + { + "epoch": 2.804477801806386, + "ewc_loss": 0.05115539953112602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5577699489076622e-05, + "grad_norm": 31.84654426574707, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8736817836761475, + "num_tokens": 841256097.0, + "step": 22046 + }, + { + "epoch": 2.8046050120849766, + "ewc_loss": 0.050937097519636154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.546854921092745e-05, + "grad_norm": 31.770505905151367, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8760918378829956, + "num_tokens": 841288997.0, + "step": 22047 + }, + { + "epoch": 2.804732222363567, + "ewc_loss": 0.05099353566765785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5496767193544656e-05, + "grad_norm": 31.79527473449707, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8667998909950256, + "num_tokens": 841328053.0, + "step": 22048 + }, + { + "epoch": 2.8048594326421576, + "ewc_loss": 0.0511280819773674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5564040697645396e-05, + "grad_norm": 31.848953247070312, + "learning_rate": 1e-06, + "loss": 0.3646, + "mean_token_accuracy": 0.8906557559967041, + "num_tokens": 841366848.0, + "step": 22049 + }, + { + "epoch": 2.804986642920748, + "ewc_loss": 0.051043253391981125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5521627321722917e-05, + "grad_norm": 31.79374885559082, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.8808176517486572, + "num_tokens": 841398568.0, + "step": 22050 + }, + { + "epoch": 2.8051138531993383, + "ewc_loss": 0.05097370967268944, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5486855520284735e-05, + "grad_norm": 31.758007049560547, + "learning_rate": 1e-06, + "loss": 0.419, + "mean_token_accuracy": 0.8700836896896362, + "num_tokens": 841435722.0, + "step": 22051 + }, + { + "epoch": 2.8052410634779292, + "ewc_loss": 0.0510568842291832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55284412560286e-05, + "grad_norm": 31.748207092285156, + "learning_rate": 1e-06, + "loss": 0.4362, + "mean_token_accuracy": 0.868481457233429, + "num_tokens": 841473635.0, + "step": 22052 + }, + { + "epoch": 2.8053682737565193, + "ewc_loss": 0.051059506833553314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5529752747388557e-05, + "grad_norm": 31.677738189697266, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8759446144104004, + "num_tokens": 841512383.0, + "step": 22053 + }, + { + "epoch": 2.8054954840351103, + "ewc_loss": 0.05110969766974449, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555484934418928e-05, + "grad_norm": 31.866384506225586, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8669819831848145, + "num_tokens": 841543174.0, + "step": 22054 + }, + { + "epoch": 2.8056226943137004, + "ewc_loss": 0.05109637230634689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.554818638600409e-05, + "grad_norm": 31.761688232421875, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8779382109642029, + "num_tokens": 841582185.0, + "step": 22055 + }, + { + "epoch": 2.805749904592291, + "ewc_loss": 0.05106806755065918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5534034648444504e-05, + "grad_norm": 31.883380889892578, + "learning_rate": 1e-06, + "loss": 0.4343, + 
"mean_token_accuracy": 0.8670892715454102, + "num_tokens": 841623179.0, + "step": 22056 + }, + { + "epoch": 2.8058771148708814, + "ewc_loss": 0.05113595724105835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5567978809704073e-05, + "grad_norm": 31.655778884887695, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8807659149169922, + "num_tokens": 841661062.0, + "step": 22057 + }, + { + "epoch": 2.806004325149472, + "ewc_loss": 0.051012273877859116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550613680796232e-05, + "grad_norm": 31.845951080322266, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8698196411132812, + "num_tokens": 841704259.0, + "step": 22058 + }, + { + "epoch": 2.8061315354280625, + "ewc_loss": 0.051142748445272446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5571374862920493e-05, + "grad_norm": 31.758325576782227, + "learning_rate": 1e-06, + "loss": 0.4276, + "mean_token_accuracy": 0.870389997959137, + "num_tokens": 841748396.0, + "step": 22059 + }, + { + "epoch": 2.806258745706653, + "ewc_loss": 0.051062047481536865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5531024220981635e-05, + "grad_norm": 31.777612686157227, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8911524415016174, + "num_tokens": 841787809.0, + "step": 22060 + }, + { + "epoch": 2.8063859559852435, + "ewc_loss": 0.051144275814294815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.557213701948058e-05, + "grad_norm": 31.79204750061035, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8721893429756165, + "num_tokens": 841824046.0, + "step": 22061 + }, + { + "epoch": 2.806513166263834, + "ewc_loss": 0.0510929636657238, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5546481992932968e-05, + "grad_norm": 31.72573471069336, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8811878561973572, + "num_tokens": 841856949.0, + "step": 22062 + }, + { + "epoch": 
2.8066403765424246, + "ewc_loss": 0.05114258825778961, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5571294827386737e-05, + "grad_norm": 31.84536361694336, + "learning_rate": 1e-06, + "loss": 0.4655, + "mean_token_accuracy": 0.8595246076583862, + "num_tokens": 841898644.0, + "step": 22063 + }, + { + "epoch": 2.806767586821015, + "ewc_loss": 0.051082540303468704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.554127058829181e-05, + "grad_norm": 31.634950637817383, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.87557452917099, + "num_tokens": 841934795.0, + "step": 22064 + }, + { + "epoch": 2.8068947970996057, + "ewc_loss": 0.05107850208878517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5539251510053873e-05, + "grad_norm": 31.91588592529297, + "learning_rate": 1e-06, + "loss": 0.4719, + "mean_token_accuracy": 0.8577867150306702, + "num_tokens": 841976878.0, + "step": 22065 + }, + { + "epoch": 2.807022007378196, + "ewc_loss": 0.05120551213622093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5602756068110466e-05, + "grad_norm": 31.769140243530273, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8730697631835938, + "num_tokens": 842010998.0, + "step": 22066 + }, + { + "epoch": 2.8071492176567867, + "ewc_loss": 0.051082149147987366, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5541074137436226e-05, + "grad_norm": 31.877832412719727, + "learning_rate": 1e-06, + "loss": 0.3403, + "mean_token_accuracy": 0.8966224789619446, + "num_tokens": 842051410.0, + "step": 22067 + }, + { + "epoch": 2.8072764279353772, + "ewc_loss": 0.05108533054590225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.554266575316433e-05, + "grad_norm": 31.7255916595459, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8758388757705688, + "num_tokens": 842087954.0, + "step": 22068 + }, + { + "epoch": 2.8074036382139678, + "ewc_loss": 0.050970856100320816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.548542761360295e-05, + "grad_norm": 31.80387306213379, + "learning_rate": 1e-06, + "loss": 0.4353, + "mean_token_accuracy": 0.8691819906234741, + "num_tokens": 842128326.0, + "step": 22069 + }, + { + "epoch": 2.8075308484925583, + "ewc_loss": 0.05116504803299904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5582523448974825e-05, + "grad_norm": 31.934839248657227, + "learning_rate": 1e-06, + "loss": 0.4874, + "mean_token_accuracy": 0.852760374546051, + "num_tokens": 842171143.0, + "step": 22070 + }, + { + "epoch": 2.807658058771149, + "ewc_loss": 0.05101171135902405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550585486460477e-05, + "grad_norm": 31.648269653320312, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8599370121955872, + "num_tokens": 842212167.0, + "step": 22071 + }, + { + "epoch": 2.8077852690497394, + "ewc_loss": 0.05104270204901695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5521350835333578e-05, + "grad_norm": 31.810922622680664, + "learning_rate": 1e-06, + "loss": 0.4456, + "mean_token_accuracy": 0.8623506426811218, + "num_tokens": 842252576.0, + "step": 22072 + }, + { + "epoch": 2.80791247932833, + "ewc_loss": 0.05111759901046753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555880018917378e-05, + "grad_norm": 31.731664657592773, + "learning_rate": 1e-06, + "loss": 0.4242, + "mean_token_accuracy": 0.8680585622787476, + "num_tokens": 842287812.0, + "step": 22073 + }, + { + "epoch": 2.80803968960692, + "ewc_loss": 0.05099800229072571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549900091253221e-05, + "grad_norm": 31.768762588500977, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8692160844802856, + "num_tokens": 842328289.0, + "step": 22074 + }, + { + "epoch": 2.808166899885511, + "ewc_loss": 0.05111690238118172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55584509432083e-05, + "grad_norm": 31.75027847290039, + "learning_rate": 1e-06, + "loss": 0.407, + 
"mean_token_accuracy": 0.876728355884552, + "num_tokens": 842359182.0, + "step": 22075 + }, + { + "epoch": 2.808294110164101, + "ewc_loss": 0.05116627737879753, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5583138267393224e-05, + "grad_norm": 31.882047653198242, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.863288164138794, + "num_tokens": 842399820.0, + "step": 22076 + }, + { + "epoch": 2.808421320442692, + "ewc_loss": 0.05108477920293808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.554238926677499e-05, + "grad_norm": 31.639019012451172, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.870599627494812, + "num_tokens": 842437852.0, + "step": 22077 + }, + { + "epoch": 2.808548530721282, + "ewc_loss": 0.051063619554042816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5531810024403967e-05, + "grad_norm": 31.793277740478516, + "learning_rate": 1e-06, + "loss": 0.4581, + "mean_token_accuracy": 0.8586184978485107, + "num_tokens": 842476729.0, + "step": 22078 + }, + { + "epoch": 2.808675740999873, + "ewc_loss": 0.051168814301490784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55844079219969e-05, + "grad_norm": 31.777761459350586, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8595724105834961, + "num_tokens": 842514590.0, + "step": 22079 + }, + { + "epoch": 2.808802951278463, + "ewc_loss": 0.051003336906433105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550166755099781e-05, + "grad_norm": 31.749786376953125, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8707135319709778, + "num_tokens": 842556502.0, + "step": 22080 + }, + { + "epoch": 2.8089301615570537, + "ewc_loss": 0.05116167291998863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558083724579774e-05, + "grad_norm": 31.770488739013672, + "learning_rate": 1e-06, + "loss": 0.4531, + "mean_token_accuracy": 0.8602163791656494, + "num_tokens": 842594299.0, + "step": 22081 + }, + { + "epoch": 
2.809057371835644, + "ewc_loss": 0.05101611092686653, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550805584178306e-05, + "grad_norm": 31.69972801208496, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8694347739219666, + "num_tokens": 842633189.0, + "step": 22082 + }, + { + "epoch": 2.8091845821142347, + "ewc_loss": 0.05111799016594887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555899482103996e-05, + "grad_norm": 31.788105010986328, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8742299675941467, + "num_tokens": 842673626.0, + "step": 22083 + }, + { + "epoch": 2.8093117923928252, + "ewc_loss": 0.05112500861287117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55625036515994e-05, + "grad_norm": 31.771095275878906, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8656294345855713, + "num_tokens": 842712574.0, + "step": 22084 + }, + { + "epoch": 2.8094390026714158, + "ewc_loss": 0.051142413169145584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5571205696905963e-05, + "grad_norm": 31.728374481201172, + "learning_rate": 1e-06, + "loss": 0.4271, + "mean_token_accuracy": 0.8696504831314087, + "num_tokens": 842749534.0, + "step": 22085 + }, + { + "epoch": 2.8095662129500063, + "ewc_loss": 0.05117291584610939, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5586457923054695e-05, + "grad_norm": 31.853673934936523, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8625609278678894, + "num_tokens": 842784989.0, + "step": 22086 + }, + { + "epoch": 2.809693423228597, + "ewc_loss": 0.05121494084596634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5607470888644457e-05, + "grad_norm": 31.860572814941406, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8721611499786377, + "num_tokens": 842823858.0, + "step": 22087 + }, + { + "epoch": 2.8098206335071874, + "ewc_loss": 0.05107446387410164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5537232431815937e-05, + "grad_norm": 31.86402130126953, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8730936646461487, + "num_tokens": 842858849.0, + "step": 22088 + }, + { + "epoch": 2.809947843785778, + "ewc_loss": 0.05109527334570885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.554763705120422e-05, + "grad_norm": 31.836416244506836, + "learning_rate": 1e-06, + "loss": 0.3592, + "mean_token_accuracy": 0.8916873931884766, + "num_tokens": 842895809.0, + "step": 22089 + }, + { + "epoch": 2.8100750540643684, + "ewc_loss": 0.05104588344693184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552294245106168e-05, + "grad_norm": 31.709192276000977, + "learning_rate": 1e-06, + "loss": 0.3332, + "mean_token_accuracy": 0.8987412452697754, + "num_tokens": 842933650.0, + "step": 22090 + }, + { + "epoch": 2.810202264342959, + "ewc_loss": 0.051076870411634445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.553843478381168e-05, + "grad_norm": 31.807764053344727, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8772591352462769, + "num_tokens": 842967200.0, + "step": 22091 + }, + { + "epoch": 2.8103294746215495, + "ewc_loss": 0.0510239414870739, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5511970306979492e-05, + "grad_norm": 31.73751449584961, + "learning_rate": 1e-06, + "loss": 0.4371, + "mean_token_accuracy": 0.8642803430557251, + "num_tokens": 843002498.0, + "step": 22092 + }, + { + "epoch": 2.81045668490014, + "ewc_loss": 0.05110076069831848, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5550380087224767e-05, + "grad_norm": 31.72335433959961, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8765398263931274, + "num_tokens": 843042627.0, + "step": 22093 + }, + { + "epoch": 2.8105838951787305, + "ewc_loss": 0.05115564540028572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5577823180356063e-05, + "grad_norm": 31.806011199951172, + "learning_rate": 1e-06, + "loss": 0.4202, + 
"mean_token_accuracy": 0.8683689832687378, + "num_tokens": 843076862.0, + "step": 22094 + }, + { + "epoch": 2.810711105457321, + "ewc_loss": 0.05101071298122406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55053564615082e-05, + "grad_norm": 31.590055465698242, + "learning_rate": 1e-06, + "loss": 0.4196, + "mean_token_accuracy": 0.8712449669837952, + "num_tokens": 843115000.0, + "step": 22095 + }, + { + "epoch": 2.8108383157359116, + "ewc_loss": 0.051188837736845016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5594419639674015e-05, + "grad_norm": 31.95065689086914, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8793370723724365, + "num_tokens": 843152408.0, + "step": 22096 + }, + { + "epoch": 2.810965526014502, + "ewc_loss": 0.05116739124059677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558369487815071e-05, + "grad_norm": 31.769140243530273, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8736074566841125, + "num_tokens": 843189269.0, + "step": 22097 + }, + { + "epoch": 2.8110927362930926, + "ewc_loss": 0.05112144723534584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.556072286097333e-05, + "grad_norm": 31.87506675720215, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.8673787117004395, + "num_tokens": 843227148.0, + "step": 22098 + }, + { + "epoch": 2.8112199465716827, + "ewc_loss": 0.05117972567677498, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5589863071218133e-05, + "grad_norm": 31.736011505126953, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8632553815841675, + "num_tokens": 843259736.0, + "step": 22099 + }, + { + "epoch": 2.8113471568502737, + "ewc_loss": 0.05114204064011574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55710201599868e-05, + "grad_norm": 31.84517478942871, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.867821455001831, + "num_tokens": 843302598.0, + "step": 22100 + }, + { + "epoch": 
2.811474367128864, + "ewc_loss": 0.05117100477218628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5585502953617834e-05, + "grad_norm": 31.799110412597656, + "learning_rate": 1e-06, + "loss": 0.3743, + "mean_token_accuracy": 0.8876299858093262, + "num_tokens": 843337615.0, + "step": 22101 + }, + { + "epoch": 2.8116015774074548, + "ewc_loss": 0.0511016845703125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5550842110533267e-05, + "grad_norm": 31.817550659179688, + "learning_rate": 1e-06, + "loss": 0.3753, + "mean_token_accuracy": 0.8876451253890991, + "num_tokens": 843378845.0, + "step": 22102 + }, + { + "epoch": 2.811728787686045, + "ewc_loss": 0.051245324313640594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5622663088142872e-05, + "grad_norm": 31.910417556762695, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8619445562362671, + "num_tokens": 843414128.0, + "step": 22103 + }, + { + "epoch": 2.811855997964636, + "ewc_loss": 0.05110105127096176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555052560637705e-05, + "grad_norm": 31.855907440185547, + "learning_rate": 1e-06, + "loss": 0.3764, + "mean_token_accuracy": 0.8868000507354736, + "num_tokens": 843454255.0, + "step": 22104 + }, + { + "epoch": 2.811983208243226, + "ewc_loss": 0.05108553543686867, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5542767616570927e-05, + "grad_norm": 31.872194290161133, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8732993602752686, + "num_tokens": 843498332.0, + "step": 22105 + }, + { + "epoch": 2.8121104185218164, + "ewc_loss": 0.051005881279706955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550294084358029e-05, + "grad_norm": 31.699893951416016, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8624551296234131, + "num_tokens": 843539196.0, + "step": 22106 + }, + { + "epoch": 2.812237628800407, + "ewc_loss": 0.051065195351839066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5532597646815702e-05, + "grad_norm": 31.80214500427246, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8768283128738403, + "num_tokens": 843573767.0, + "step": 22107 + }, + { + "epoch": 2.8123648390789975, + "ewc_loss": 0.05116990953683853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5584955437807366e-05, + "grad_norm": 31.83424186706543, + "learning_rate": 1e-06, + "loss": 0.4558, + "mean_token_accuracy": 0.8599296808242798, + "num_tokens": 843611156.0, + "step": 22108 + }, + { + "epoch": 2.812492049357588, + "ewc_loss": 0.051112279295921326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5556139007676393e-05, + "grad_norm": 31.864999771118164, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8778587579727173, + "num_tokens": 843644026.0, + "step": 22109 + }, + { + "epoch": 2.8126192596361785, + "ewc_loss": 0.051167286932468414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558364394644741e-05, + "grad_norm": 31.879737854003906, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8751546144485474, + "num_tokens": 843683477.0, + "step": 22110 + }, + { + "epoch": 2.812746469914769, + "ewc_loss": 0.05104296654462814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5521483621560037e-05, + "grad_norm": 31.705371856689453, + "learning_rate": 1e-06, + "loss": 0.4617, + "mean_token_accuracy": 0.8588729500770569, + "num_tokens": 843722815.0, + "step": 22111 + }, + { + "epoch": 2.8128736801933596, + "ewc_loss": 0.05109289288520813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5546445613144897e-05, + "grad_norm": 32.008934020996094, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8732625842094421, + "num_tokens": 843763843.0, + "step": 22112 + }, + { + "epoch": 2.81300089047195, + "ewc_loss": 0.05113521218299866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5567605916876346e-05, + "grad_norm": 31.719816207885742, + "learning_rate": 1e-06, + "loss": 0.4486, + 
"mean_token_accuracy": 0.8650060892105103, + "num_tokens": 843805327.0, + "step": 22113 + }, + { + "epoch": 2.8131281007505406, + "ewc_loss": 0.0510513000190258, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552565092628356e-05, + "grad_norm": 31.819290161132812, + "learning_rate": 1e-06, + "loss": 0.4162, + "mean_token_accuracy": 0.8735882639884949, + "num_tokens": 843850132.0, + "step": 22114 + }, + { + "epoch": 2.813255311029131, + "ewc_loss": 0.05107058212161064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5535291570122354e-05, + "grad_norm": 31.7640323638916, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.871991753578186, + "num_tokens": 843887425.0, + "step": 22115 + }, + { + "epoch": 2.8133825213077217, + "ewc_loss": 0.0511120967566967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5556048058206216e-05, + "grad_norm": 31.72894287109375, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8805968761444092, + "num_tokens": 843920728.0, + "step": 22116 + }, + { + "epoch": 2.8135097315863122, + "ewc_loss": 0.05105912685394287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5529563572490588e-05, + "grad_norm": 31.875131607055664, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8709783554077148, + "num_tokens": 843959733.0, + "step": 22117 + }, + { + "epoch": 2.8136369418649028, + "ewc_loss": 0.051179978996515274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5589988581486978e-05, + "grad_norm": 31.72648048400879, + "learning_rate": 1e-06, + "loss": 0.4198, + "mean_token_accuracy": 0.8739991188049316, + "num_tokens": 844005616.0, + "step": 22118 + }, + { + "epoch": 2.8137641521434933, + "ewc_loss": 0.05101960152387619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5509800252621062e-05, + "grad_norm": 31.8837890625, + "learning_rate": 1e-06, + "loss": 0.3605, + "mean_token_accuracy": 0.88688063621521, + "num_tokens": 844034216.0, + "step": 22119 + }, + { + "epoch": 
2.813891362422084, + "ewc_loss": 0.051121607422828674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5560802896507084e-05, + "grad_norm": 31.74860191345215, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8691921234130859, + "num_tokens": 844071173.0, + "step": 22120 + }, + { + "epoch": 2.8140185727006743, + "ewc_loss": 0.05105026438832283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552513251430355e-05, + "grad_norm": 31.859174728393555, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8701744079589844, + "num_tokens": 844113665.0, + "step": 22121 + }, + { + "epoch": 2.814145782979265, + "ewc_loss": 0.05110204964876175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5551024009473622e-05, + "grad_norm": 31.650638580322266, + "learning_rate": 1e-06, + "loss": 0.4213, + "mean_token_accuracy": 0.8723166584968567, + "num_tokens": 844154063.0, + "step": 22122 + }, + { + "epoch": 2.8142729932578554, + "ewc_loss": 0.05106402933597565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5532015570206568e-05, + "grad_norm": 32.04425048828125, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8907607197761536, + "num_tokens": 844185634.0, + "step": 22123 + }, + { + "epoch": 2.8144002035364455, + "ewc_loss": 0.05108795687556267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.554397906351369e-05, + "grad_norm": 31.85896110534668, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8623107671737671, + "num_tokens": 844221755.0, + "step": 22124 + }, + { + "epoch": 2.8145274138150365, + "ewc_loss": 0.050968680530786514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.548433985793963e-05, + "grad_norm": 31.891807556152344, + "learning_rate": 1e-06, + "loss": 0.3997, + "mean_token_accuracy": 0.8774064183235168, + "num_tokens": 844256230.0, + "step": 22125 + }, + { + "epoch": 2.8146546240936265, + "ewc_loss": 0.051063716411590576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5531859137117863e-05, + "grad_norm": 31.784025192260742, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8651063442230225, + "num_tokens": 844292823.0, + "step": 22126 + }, + { + "epoch": 2.8147818343722175, + "ewc_loss": 0.05100870504975319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5504352379357442e-05, + "grad_norm": 31.771848678588867, + "learning_rate": 1e-06, + "loss": 0.4853, + "mean_token_accuracy": 0.8504664897918701, + "num_tokens": 844331843.0, + "step": 22127 + }, + { + "epoch": 2.8149090446508076, + "ewc_loss": 0.051013875752687454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550693716329988e-05, + "grad_norm": 31.88826560974121, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8823076486587524, + "num_tokens": 844375823.0, + "step": 22128 + }, + { + "epoch": 2.815036254929398, + "ewc_loss": 0.05112282931804657, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5561414076946676e-05, + "grad_norm": 31.89670181274414, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8728011846542358, + "num_tokens": 844415187.0, + "step": 22129 + }, + { + "epoch": 2.8151634652079887, + "ewc_loss": 0.05111071839928627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555535866122227e-05, + "grad_norm": 31.886123657226562, + "learning_rate": 1e-06, + "loss": 0.4092, + "mean_token_accuracy": 0.875144362449646, + "num_tokens": 844454986.0, + "step": 22130 + }, + { + "epoch": 2.815290675486579, + "ewc_loss": 0.05102336034178734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5511679268674925e-05, + "grad_norm": 31.832355499267578, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8783152103424072, + "num_tokens": 844495514.0, + "step": 22131 + }, + { + "epoch": 2.8154178857651697, + "ewc_loss": 0.050990719348192215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.549535929574631e-05, + "grad_norm": 31.759801864624023, + "learning_rate": 1e-06, + "loss": 0.425, + 
"mean_token_accuracy": 0.8716332912445068, + "num_tokens": 844532406.0, + "step": 22132 + }, + { + "epoch": 2.8155450960437602, + "ewc_loss": 0.05104907602071762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552453770476859e-05, + "grad_norm": 31.887065887451172, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8709050416946411, + "num_tokens": 844569561.0, + "step": 22133 + }, + { + "epoch": 2.8156723063223508, + "ewc_loss": 0.05100943148136139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.550471617723815e-05, + "grad_norm": 31.7435359954834, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8779410123825073, + "num_tokens": 844613504.0, + "step": 22134 + }, + { + "epoch": 2.8157995166009413, + "ewc_loss": 0.05106170102953911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5530849597998895e-05, + "grad_norm": 31.824893951416016, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.869955837726593, + "num_tokens": 844649255.0, + "step": 22135 + }, + { + "epoch": 2.815926726879532, + "ewc_loss": 0.051059234887361526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5529618142172694e-05, + "grad_norm": 31.79173469543457, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8749407529830933, + "num_tokens": 844687202.0, + "step": 22136 + }, + { + "epoch": 2.8160539371581224, + "ewc_loss": 0.05102260783314705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5511304556857795e-05, + "grad_norm": 31.76409912109375, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.8743555545806885, + "num_tokens": 844723096.0, + "step": 22137 + }, + { + "epoch": 2.816181147436713, + "ewc_loss": 0.05119405686855316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55970280704787e-05, + "grad_norm": 31.838613510131836, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8737287521362305, + "num_tokens": 844759754.0, + "step": 22138 + }, + { + "epoch": 
2.8163083577153034, + "ewc_loss": 0.05113913491368294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.556956678745337e-05, + "grad_norm": 31.8077335357666, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8756930828094482, + "num_tokens": 844798372.0, + "step": 22139 + }, + { + "epoch": 2.816435567993894, + "ewc_loss": 0.05111340805888176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5556704713380896e-05, + "grad_norm": 31.82245445251465, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.867681086063385, + "num_tokens": 844832738.0, + "step": 22140 + }, + { + "epoch": 2.8165627782724845, + "ewc_loss": 0.0511736162006855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558680898800958e-05, + "grad_norm": 31.865915298461914, + "learning_rate": 1e-06, + "loss": 0.4057, + "mean_token_accuracy": 0.873707115650177, + "num_tokens": 844871684.0, + "step": 22141 + }, + { + "epoch": 2.816689988551075, + "ewc_loss": 0.05110449343919754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5552246370352805e-05, + "grad_norm": 31.80385971069336, + "learning_rate": 1e-06, + "loss": 0.4395, + "mean_token_accuracy": 0.8652845025062561, + "num_tokens": 844912140.0, + "step": 22142 + }, + { + "epoch": 2.8168171988296655, + "ewc_loss": 0.05104714259505272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5523571821395308e-05, + "grad_norm": 31.692289352416992, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8605682849884033, + "num_tokens": 844948256.0, + "step": 22143 + }, + { + "epoch": 2.816944409108256, + "ewc_loss": 0.05112016201019287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5560080757713877e-05, + "grad_norm": 31.866395950317383, + "learning_rate": 1e-06, + "loss": 0.3653, + "mean_token_accuracy": 0.8885380029678345, + "num_tokens": 844983821.0, + "step": 22144 + }, + { + "epoch": 2.8170716193868466, + "ewc_loss": 0.05118902400135994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5594512408133596e-05, + "grad_norm": 31.707876205444336, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8767654895782471, + "num_tokens": 845020622.0, + "step": 22145 + }, + { + "epoch": 2.817198829665437, + "ewc_loss": 0.05110124498605728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5550622012815438e-05, + "grad_norm": 31.80942726135254, + "learning_rate": 1e-06, + "loss": 0.4436, + "mean_token_accuracy": 0.8648535013198853, + "num_tokens": 845056748.0, + "step": 22146 + }, + { + "epoch": 2.8173260399440276, + "ewc_loss": 0.05118533968925476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5592669771867804e-05, + "grad_norm": 31.767208099365234, + "learning_rate": 1e-06, + "loss": 0.4746, + "mean_token_accuracy": 0.8540550470352173, + "num_tokens": 845096294.0, + "step": 22147 + }, + { + "epoch": 2.817453250222618, + "ewc_loss": 0.05116788670420647, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5583944079698995e-05, + "grad_norm": 31.8048038482666, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8875136971473694, + "num_tokens": 845135132.0, + "step": 22148 + }, + { + "epoch": 2.8175804605012083, + "ewc_loss": 0.051189132034778595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55945651588263e-05, + "grad_norm": 31.900310516357422, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8809906244277954, + "num_tokens": 845172164.0, + "step": 22149 + }, + { + "epoch": 2.8177076707797992, + "ewc_loss": 0.051117703318595886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555885112087708e-05, + "grad_norm": 31.77867317199707, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8730611205101013, + "num_tokens": 845210270.0, + "step": 22150 + }, + { + "epoch": 2.8178348810583893, + "ewc_loss": 0.051201045513153076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.560052234912291e-05, + "grad_norm": 31.82856559753418, + "learning_rate": 1e-06, + "loss": 0.4191, + 
"mean_token_accuracy": 0.8764995336532593, + "num_tokens": 845251863.0, + "step": 22151 + }, + { + "epoch": 2.8179620913369803, + "ewc_loss": 0.05114784836769104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.557392326707486e-05, + "grad_norm": 31.877391815185547, + "learning_rate": 1e-06, + "loss": 0.4064, + "mean_token_accuracy": 0.8772410154342651, + "num_tokens": 845290262.0, + "step": 22152 + }, + { + "epoch": 2.8180893016155704, + "ewc_loss": 0.05116485059261322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5582425223547034e-05, + "grad_norm": 31.79477310180664, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8794949650764465, + "num_tokens": 845326894.0, + "step": 22153 + }, + { + "epoch": 2.818216511894161, + "ewc_loss": 0.051042232662439346, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552111618570052e-05, + "grad_norm": 31.705472946166992, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8814682960510254, + "num_tokens": 845367545.0, + "step": 22154 + }, + { + "epoch": 2.8183437221727514, + "ewc_loss": 0.05117517337203026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5587587515474297e-05, + "grad_norm": 31.906715393066406, + "learning_rate": 1e-06, + "loss": 0.4731, + "mean_token_accuracy": 0.8588088750839233, + "num_tokens": 845400920.0, + "step": 22155 + }, + { + "epoch": 2.818470932451342, + "ewc_loss": 0.05121822655200958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.560911343607586e-05, + "grad_norm": 31.92921257019043, + "learning_rate": 1e-06, + "loss": 0.4237, + "mean_token_accuracy": 0.867957353591919, + "num_tokens": 845436279.0, + "step": 22156 + }, + { + "epoch": 2.8185981427299325, + "ewc_loss": 0.05105279013514519, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552639489294961e-05, + "grad_norm": 31.741851806640625, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8771453499794006, + "num_tokens": 845476308.0, + "step": 22157 + }, + { + "epoch": 
2.818725353008523, + "ewc_loss": 0.05118411034345627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5592054953449406e-05, + "grad_norm": 31.782575607299805, + "learning_rate": 1e-06, + "loss": 0.4359, + "mean_token_accuracy": 0.8688654899597168, + "num_tokens": 845516624.0, + "step": 22158 + }, + { + "epoch": 2.8188525632871135, + "ewc_loss": 0.05122126638889313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.561063229222782e-05, + "grad_norm": 31.864456176757812, + "learning_rate": 1e-06, + "loss": 0.3574, + "mean_token_accuracy": 0.8909718990325928, + "num_tokens": 845552385.0, + "step": 22159 + }, + { + "epoch": 2.818979773565704, + "ewc_loss": 0.051240187138319016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.562009285611566e-05, + "grad_norm": 31.84398651123047, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8820961117744446, + "num_tokens": 845592600.0, + "step": 22160 + }, + { + "epoch": 2.8191069838442946, + "ewc_loss": 0.05116383731365204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5581917725503445e-05, + "grad_norm": 31.887359619140625, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8822462558746338, + "num_tokens": 845630028.0, + "step": 22161 + }, + { + "epoch": 2.819234194122885, + "ewc_loss": 0.05123131349682808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5615656340960413e-05, + "grad_norm": 31.860740661621094, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8684672117233276, + "num_tokens": 845665874.0, + "step": 22162 + }, + { + "epoch": 2.8193614044014756, + "ewc_loss": 0.05116032063961029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558016058173962e-05, + "grad_norm": 31.81568145751953, + "learning_rate": 1e-06, + "loss": 0.4551, + "mean_token_accuracy": 0.8608753085136414, + "num_tokens": 845702738.0, + "step": 22163 + }, + { + "epoch": 2.819488614680066, + "ewc_loss": 0.051108624786138535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5554312742315233e-05, + "grad_norm": 31.878129959106445, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.8759443759918213, + "num_tokens": 845744785.0, + "step": 22164 + }, + { + "epoch": 2.8196158249586567, + "ewc_loss": 0.05121127888560295, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5605639166315086e-05, + "grad_norm": 31.675769805908203, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8727696537971497, + "num_tokens": 845779196.0, + "step": 22165 + }, + { + "epoch": 2.8197430352372472, + "ewc_loss": 0.051032550632953644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5516275854897685e-05, + "grad_norm": 31.782569885253906, + "learning_rate": 1e-06, + "loss": 0.4404, + "mean_token_accuracy": 0.8629322648048401, + "num_tokens": 845819016.0, + "step": 22166 + }, + { + "epoch": 2.8198702455158378, + "ewc_loss": 0.05126730725169182, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5633653422119096e-05, + "grad_norm": 31.815750122070312, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8745185136795044, + "num_tokens": 845855014.0, + "step": 22167 + }, + { + "epoch": 2.8199974557944283, + "ewc_loss": 0.051127661019563675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5563829694874585e-05, + "grad_norm": 31.778854370117188, + "learning_rate": 1e-06, + "loss": 0.39, + "mean_token_accuracy": 0.8818137645721436, + "num_tokens": 845895491.0, + "step": 22168 + }, + { + "epoch": 2.820124666073019, + "ewc_loss": 0.051144085824489594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5572042432031594e-05, + "grad_norm": 31.843494415283203, + "learning_rate": 1e-06, + "loss": 0.4486, + "mean_token_accuracy": 0.8638640642166138, + "num_tokens": 845931560.0, + "step": 22169 + }, + { + "epoch": 2.8202518763516093, + "ewc_loss": 0.05126215144991875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563107591413427e-05, + "grad_norm": 31.92796516418457, + "learning_rate": 1e-06, + "loss": 0.4113, + 
"mean_token_accuracy": 0.8734799027442932, + "num_tokens": 845966701.0, + "step": 22170 + }, + { + "epoch": 2.8203790866302, + "ewc_loss": 0.05116596072912216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5582980015315115e-05, + "grad_norm": 31.824922561645508, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8898687958717346, + "num_tokens": 846002327.0, + "step": 22171 + }, + { + "epoch": 2.82050629690879, + "ewc_loss": 0.05111389234662056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5556946638971567e-05, + "grad_norm": 31.91985321044922, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8734430074691772, + "num_tokens": 846042157.0, + "step": 22172 + }, + { + "epoch": 2.820633507187381, + "ewc_loss": 0.05123010277748108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5615050617489032e-05, + "grad_norm": 31.833114624023438, + "learning_rate": 1e-06, + "loss": 0.4833, + "mean_token_accuracy": 0.8578819632530212, + "num_tokens": 846082851.0, + "step": 22173 + }, + { + "epoch": 2.820760717465971, + "ewc_loss": 0.051086798310279846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5543398805893958e-05, + "grad_norm": 31.885700225830078, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8714333772659302, + "num_tokens": 846116846.0, + "step": 22174 + }, + { + "epoch": 2.820887927744562, + "ewc_loss": 0.05118972808122635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.559486347308848e-05, + "grad_norm": 31.687335968017578, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8752959370613098, + "num_tokens": 846154373.0, + "step": 22175 + }, + { + "epoch": 2.821015138023152, + "ewc_loss": 0.05104753375053406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5523766453261487e-05, + "grad_norm": 31.83330726623535, + "learning_rate": 1e-06, + "loss": 0.3546, + "mean_token_accuracy": 0.8898143768310547, + "num_tokens": 846195329.0, + "step": 22176 + }, + { + "epoch": 
2.821142348301743, + "ewc_loss": 0.05119149386882782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5595747501938604e-05, + "grad_norm": 31.719345092773438, + "learning_rate": 1e-06, + "loss": 0.3773, + "mean_token_accuracy": 0.8842025399208069, + "num_tokens": 846233078.0, + "step": 22177 + }, + { + "epoch": 2.821269558580333, + "ewc_loss": 0.05116697773337364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5583489332348108e-05, + "grad_norm": 31.93191146850586, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8707858324050903, + "num_tokens": 846269030.0, + "step": 22178 + }, + { + "epoch": 2.8213967688589237, + "ewc_loss": 0.05136484652757645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5682422347017564e-05, + "grad_norm": 31.75808334350586, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.868760883808136, + "num_tokens": 846307269.0, + "step": 22179 + }, + { + "epoch": 2.821523979137514, + "ewc_loss": 0.051161203533411026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558060259616468e-05, + "grad_norm": 31.904321670532227, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8639023303985596, + "num_tokens": 846339116.0, + "step": 22180 + }, + { + "epoch": 2.8216511894161047, + "ewc_loss": 0.05127331614494324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5636658392613754e-05, + "grad_norm": 31.777780532836914, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8652147650718689, + "num_tokens": 846384968.0, + "step": 22181 + }, + { + "epoch": 2.8217783996946952, + "ewc_loss": 0.051166072487831116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5583036403986625e-05, + "grad_norm": 31.81633186340332, + "learning_rate": 1e-06, + "loss": 0.464, + "mean_token_accuracy": 0.857112467288971, + "num_tokens": 846426235.0, + "step": 22182 + }, + { + "epoch": 2.8219056099732858, + "ewc_loss": 0.05132514238357544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5662571715656668e-05, + "grad_norm": 31.825687408447266, + "learning_rate": 1e-06, + "loss": 0.4549, + "mean_token_accuracy": 0.8598719835281372, + "num_tokens": 846465715.0, + "step": 22183 + }, + { + "epoch": 2.8220328202518763, + "ewc_loss": 0.051147665828466415, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5573832317604683e-05, + "grad_norm": 31.7471923828125, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8591915369033813, + "num_tokens": 846499972.0, + "step": 22184 + }, + { + "epoch": 2.822160030530467, + "ewc_loss": 0.05117449164390564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558724554546643e-05, + "grad_norm": 31.855329513549805, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8716434240341187, + "num_tokens": 846535906.0, + "step": 22185 + }, + { + "epoch": 2.8222872408090574, + "ewc_loss": 0.051278673112392426, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5639335945015773e-05, + "grad_norm": 31.899808883666992, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8685219883918762, + "num_tokens": 846572723.0, + "step": 22186 + }, + { + "epoch": 2.822414451087648, + "ewc_loss": 0.05121790990233421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.560895518399775e-05, + "grad_norm": 31.822093963623047, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8767811059951782, + "num_tokens": 846613501.0, + "step": 22187 + }, + { + "epoch": 2.8225416613662384, + "ewc_loss": 0.05119182541966438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5595913029974326e-05, + "grad_norm": 31.851459503173828, + "learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8942461013793945, + "num_tokens": 846650001.0, + "step": 22188 + }, + { + "epoch": 2.822668871644829, + "ewc_loss": 0.0512283630669117, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5614181140554138e-05, + "grad_norm": 31.91639518737793, + "learning_rate": 1e-06, + "loss": 0.451, + 
"mean_token_accuracy": 0.8601819276809692, + "num_tokens": 846690636.0, + "step": 22189 + }, + { + "epoch": 2.8227960819234195, + "ewc_loss": 0.0512680858373642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5634042685851455e-05, + "grad_norm": 31.97661018371582, + "learning_rate": 1e-06, + "loss": 0.4882, + "mean_token_accuracy": 0.8511596918106079, + "num_tokens": 846728513.0, + "step": 22190 + }, + { + "epoch": 2.82292329220201, + "ewc_loss": 0.05117279663681984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5586397896404378e-05, + "grad_norm": 31.86671257019043, + "learning_rate": 1e-06, + "loss": 0.4652, + "mean_token_accuracy": 0.8587419390678406, + "num_tokens": 846771989.0, + "step": 22191 + }, + { + "epoch": 2.8230505024806005, + "ewc_loss": 0.05123262479901314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5616312996135093e-05, + "grad_norm": 32.05727767944336, + "learning_rate": 1e-06, + "loss": 0.3633, + "mean_token_accuracy": 0.8898580074310303, + "num_tokens": 846808393.0, + "step": 22192 + }, + { + "epoch": 2.823177712759191, + "ewc_loss": 0.05115872249007225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5579362045391463e-05, + "grad_norm": 31.87066650390625, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8628426194190979, + "num_tokens": 846846928.0, + "step": 22193 + }, + { + "epoch": 2.8233049230377816, + "ewc_loss": 0.05105815455317497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552907790231984e-05, + "grad_norm": 31.892780303955078, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.863450288772583, + "num_tokens": 846884637.0, + "step": 22194 + }, + { + "epoch": 2.823432133316372, + "ewc_loss": 0.05114723742008209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.557361949584447e-05, + "grad_norm": 31.773550033569336, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.8767693042755127, + "num_tokens": 846925749.0, + "step": 22195 + }, + { + "epoch": 
2.8235593435949626, + "ewc_loss": 0.051150254905223846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5575127438060008e-05, + "grad_norm": 31.937881469726562, + "learning_rate": 1e-06, + "loss": 0.4715, + "mean_token_accuracy": 0.8578206300735474, + "num_tokens": 846967191.0, + "step": 22196 + }, + { + "epoch": 2.8236865538735527, + "ewc_loss": 0.051146432757377625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.557321568019688e-05, + "grad_norm": 31.855464935302734, + "learning_rate": 1e-06, + "loss": 0.3963, + "mean_token_accuracy": 0.8781051635742188, + "num_tokens": 847003280.0, + "step": 22197 + }, + { + "epoch": 2.8238137641521437, + "ewc_loss": 0.05113731697201729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5568659111741e-05, + "grad_norm": 31.876331329345703, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8606417775154114, + "num_tokens": 847042179.0, + "step": 22198 + }, + { + "epoch": 2.8239409744307338, + "ewc_loss": 0.051169369369745255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5584684408386238e-05, + "grad_norm": 31.717226028442383, + "learning_rate": 1e-06, + "loss": 0.4218, + "mean_token_accuracy": 0.8679239153862, + "num_tokens": 847083008.0, + "step": 22199 + }, + { + "epoch": 2.8240681847093247, + "ewc_loss": 0.05110782012343407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555391074565705e-05, + "grad_norm": 31.900920867919922, + "learning_rate": 1e-06, + "loss": 0.4828, + "mean_token_accuracy": 0.8527147769927979, + "num_tokens": 847124009.0, + "step": 22200 + }, + { + "epoch": 2.824195394987915, + "ewc_loss": 0.05123455449938774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5617277060518973e-05, + "grad_norm": 31.77693748474121, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8667247891426086, + "num_tokens": 847163526.0, + "step": 22201 + }, + { + "epoch": 2.824322605266506, + "ewc_loss": 0.05115031078457832, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.557515472290106e-05, + "grad_norm": 31.745223999023438, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.8704703450202942, + "num_tokens": 847201374.0, + "step": 22202 + }, + { + "epoch": 2.824449815545096, + "ewc_loss": 0.0512760728597641, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563803718658164e-05, + "grad_norm": 31.81378173828125, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8579888343811035, + "num_tokens": 847239469.0, + "step": 22203 + }, + { + "epoch": 2.8245770258236864, + "ewc_loss": 0.05126286298036575, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5631430617067963e-05, + "grad_norm": 31.98248291015625, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8701700568199158, + "num_tokens": 847275964.0, + "step": 22204 + }, + { + "epoch": 2.824704236102277, + "ewc_loss": 0.05125748738646507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5628743969718926e-05, + "grad_norm": 31.78006935119629, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8731058835983276, + "num_tokens": 847316815.0, + "step": 22205 + }, + { + "epoch": 2.8248314463808675, + "ewc_loss": 0.051192209124565125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55961040238617e-05, + "grad_norm": 31.87578582763672, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8761954307556152, + "num_tokens": 847350949.0, + "step": 22206 + }, + { + "epoch": 2.824958656659458, + "ewc_loss": 0.05123180150985718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.561590008554049e-05, + "grad_norm": 31.858858108520508, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8756768703460693, + "num_tokens": 847388945.0, + "step": 22207 + }, + { + "epoch": 2.8250858669380485, + "ewc_loss": 0.05124863609671593, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56243183685001e-05, + "grad_norm": 31.95879364013672, + "learning_rate": 1e-06, + "loss": 0.3831, + 
"mean_token_accuracy": 0.8838009238243103, + "num_tokens": 847435114.0, + "step": 22208 + }, + { + "epoch": 2.825213077216639, + "ewc_loss": 0.05121801048517227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.560900611570105e-05, + "grad_norm": 31.795303344726562, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8644216060638428, + "num_tokens": 847470815.0, + "step": 22209 + }, + { + "epoch": 2.8253402874952296, + "ewc_loss": 0.0511397160589695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5569857825757936e-05, + "grad_norm": 31.704816818237305, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8694026470184326, + "num_tokens": 847509379.0, + "step": 22210 + }, + { + "epoch": 2.82546749777382, + "ewc_loss": 0.05110575631260872, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5552877559675835e-05, + "grad_norm": 31.8167781829834, + "learning_rate": 1e-06, + "loss": 0.4125, + "mean_token_accuracy": 0.8749849796295166, + "num_tokens": 847547427.0, + "step": 22211 + }, + { + "epoch": 2.8255947080524106, + "ewc_loss": 0.051230017095804214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.561500878073275e-05, + "grad_norm": 31.991735458374023, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8558980226516724, + "num_tokens": 847584740.0, + "step": 22212 + }, + { + "epoch": 2.825721918331001, + "ewc_loss": 0.05122264847159386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5611323508201167e-05, + "grad_norm": 31.758092880249023, + "learning_rate": 1e-06, + "loss": 0.4516, + "mean_token_accuracy": 0.8605583310127258, + "num_tokens": 847626201.0, + "step": 22213 + }, + { + "epoch": 2.8258491286095917, + "ewc_loss": 0.051080673933029175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.554033744672779e-05, + "grad_norm": 31.909229278564453, + "learning_rate": 1e-06, + "loss": 0.3768, + "mean_token_accuracy": 0.8854212760925293, + "num_tokens": 847661311.0, + "step": 22214 + }, + { + "epoch": 
2.8259763388881822, + "ewc_loss": 0.051296062767505646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5648030714364722e-05, + "grad_norm": 31.817123413085938, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8699232339859009, + "num_tokens": 847695234.0, + "step": 22215 + }, + { + "epoch": 2.8261035491667728, + "ewc_loss": 0.05110343173146248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.555171522544697e-05, + "grad_norm": 31.82423973083496, + "learning_rate": 1e-06, + "loss": 0.3616, + "mean_token_accuracy": 0.8886805772781372, + "num_tokens": 847738848.0, + "step": 22216 + }, + { + "epoch": 2.8262307594453633, + "ewc_loss": 0.051303572952747345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5651786927483045e-05, + "grad_norm": 31.871826171875, + "learning_rate": 1e-06, + "loss": 0.4029, + "mean_token_accuracy": 0.8746711015701294, + "num_tokens": 847775952.0, + "step": 22217 + }, + { + "epoch": 2.826357969723954, + "ewc_loss": 0.05115712434053421, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5578561690053903e-05, + "grad_norm": 31.839420318603516, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.888117790222168, + "num_tokens": 847809381.0, + "step": 22218 + }, + { + "epoch": 2.8264851800025443, + "ewc_loss": 0.051204025745391846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5602012101444416e-05, + "grad_norm": 31.819684982299805, + "learning_rate": 1e-06, + "loss": 0.4364, + "mean_token_accuracy": 0.8680035471916199, + "num_tokens": 847849835.0, + "step": 22219 + }, + { + "epoch": 2.826612390281135, + "ewc_loss": 0.05117505043745041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5587525669834577e-05, + "grad_norm": 31.834062576293945, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8689528703689575, + "num_tokens": 847882726.0, + "step": 22220 + }, + { + "epoch": 2.8267396005597254, + "ewc_loss": 0.051223598420619965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5611800083424896e-05, + "grad_norm": 31.782712936401367, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8720722794532776, + "num_tokens": 847920663.0, + "step": 22221 + }, + { + "epoch": 2.8268668108383155, + "ewc_loss": 0.05129669979214668, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5648349037510343e-05, + "grad_norm": 31.9830265045166, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8799059391021729, + "num_tokens": 847962204.0, + "step": 22222 + }, + { + "epoch": 2.8269940211169065, + "ewc_loss": 0.051247891038656235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.562394547567237e-05, + "grad_norm": 31.70902442932129, + "learning_rate": 1e-06, + "loss": 0.4391, + "mean_token_accuracy": 0.8651809096336365, + "num_tokens": 848003722.0, + "step": 22223 + }, + { + "epoch": 2.8271212313954965, + "ewc_loss": 0.05120141804218292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5600709705031477e-05, + "grad_norm": 31.95370101928711, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8798263669013977, + "num_tokens": 848045335.0, + "step": 22224 + }, + { + "epoch": 2.8272484416740875, + "ewc_loss": 0.0512167327105999, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5608365831431e-05, + "grad_norm": 31.73093605041504, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8651089668273926, + "num_tokens": 848088817.0, + "step": 22225 + }, + { + "epoch": 2.8273756519526776, + "ewc_loss": 0.05111205205321312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5556026230333373e-05, + "grad_norm": 31.777544021606445, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8632421493530273, + "num_tokens": 848120993.0, + "step": 22226 + }, + { + "epoch": 2.827502862231268, + "ewc_loss": 0.05134489759802818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5672448828117922e-05, + "grad_norm": 31.901521682739258, + "learning_rate": 1e-06, + "loss": 0.4578, + 
"mean_token_accuracy": 0.8637051582336426, + "num_tokens": 848162542.0, + "step": 22227 + }, + { + "epoch": 2.8276300725098586, + "ewc_loss": 0.051177963614463806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5588982680346817e-05, + "grad_norm": 31.766515731811523, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8853869438171387, + "num_tokens": 848198194.0, + "step": 22228 + }, + { + "epoch": 2.827757282788449, + "ewc_loss": 0.05120028182864189, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5600140361348167e-05, + "grad_norm": 31.888294219970703, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8666402101516724, + "num_tokens": 848240575.0, + "step": 22229 + }, + { + "epoch": 2.8278844930670397, + "ewc_loss": 0.05115998908877373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5579995053703897e-05, + "grad_norm": 31.813695907592773, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.8716021180152893, + "num_tokens": 848289026.0, + "step": 22230 + }, + { + "epoch": 2.8280117033456302, + "ewc_loss": 0.051162660121917725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.55813301919261e-05, + "grad_norm": 31.937870025634766, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.874255359172821, + "num_tokens": 848326438.0, + "step": 22231 + }, + { + "epoch": 2.8281389136242208, + "ewc_loss": 0.05120287463068962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5601437300792895e-05, + "grad_norm": 31.877199172973633, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8790121078491211, + "num_tokens": 848368106.0, + "step": 22232 + }, + { + "epoch": 2.8282661239028113, + "ewc_loss": 0.05105878785252571, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5529394406476058e-05, + "grad_norm": 31.72917938232422, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8686150312423706, + "num_tokens": 848409775.0, + "step": 22233 + }, + { + "epoch": 
2.828393334181402, + "ewc_loss": 0.0511692576110363, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5584628019714728e-05, + "grad_norm": 31.722793579101562, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8724485039710999, + "num_tokens": 848442149.0, + "step": 22234 + }, + { + "epoch": 2.8285205444599923, + "ewc_loss": 0.05117623880505562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5588118660380132e-05, + "grad_norm": 31.889060974121094, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8660649657249451, + "num_tokens": 848475468.0, + "step": 22235 + }, + { + "epoch": 2.828647754738583, + "ewc_loss": 0.05121179297566414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5605895643820986e-05, + "grad_norm": 31.782058715820312, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8724140524864197, + "num_tokens": 848509565.0, + "step": 22236 + }, + { + "epoch": 2.8287749650171734, + "ewc_loss": 0.05119992047548294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5599960281397216e-05, + "grad_norm": 31.845996856689453, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8730657696723938, + "num_tokens": 848546357.0, + "step": 22237 + }, + { + "epoch": 2.828902175295764, + "ewc_loss": 0.051177505403757095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558875348768197e-05, + "grad_norm": 31.78580665588379, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8597285747528076, + "num_tokens": 848587661.0, + "step": 22238 + }, + { + "epoch": 2.8290293855743545, + "ewc_loss": 0.051266707479953766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563335328886751e-05, + "grad_norm": 31.781288146972656, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8767179846763611, + "num_tokens": 848628250.0, + "step": 22239 + }, + { + "epoch": 2.829156595852945, + "ewc_loss": 0.05120381712913513, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5601908419048414e-05, + "grad_norm": 31.84156608581543, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8738718628883362, + "num_tokens": 848670452.0, + "step": 22240 + }, + { + "epoch": 2.8292838061315355, + "ewc_loss": 0.05125289410352707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5626446586102247e-05, + "grad_norm": 31.792219161987305, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8735106587409973, + "num_tokens": 848703592.0, + "step": 22241 + }, + { + "epoch": 2.829411016410126, + "ewc_loss": 0.05131535977125168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5657680453150533e-05, + "grad_norm": 31.824344635009766, + "learning_rate": 1e-06, + "loss": 0.4722, + "mean_token_accuracy": 0.8560206890106201, + "num_tokens": 848739291.0, + "step": 22242 + }, + { + "epoch": 2.8295382266887166, + "ewc_loss": 0.05122394487261772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5611972887418233e-05, + "grad_norm": 31.83342170715332, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8662014007568359, + "num_tokens": 848776039.0, + "step": 22243 + }, + { + "epoch": 2.829665436967307, + "ewc_loss": 0.051276642829179764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5638320948928595e-05, + "grad_norm": 31.80152702331543, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8746390342712402, + "num_tokens": 848814799.0, + "step": 22244 + }, + { + "epoch": 2.8297926472458976, + "ewc_loss": 0.051283080130815506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.564154056017287e-05, + "grad_norm": 31.88714027404785, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.873190701007843, + "num_tokens": 848860123.0, + "step": 22245 + }, + { + "epoch": 2.829919857524488, + "ewc_loss": 0.05128804221749306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5644021661719307e-05, + "grad_norm": 31.74910545349121, + "learning_rate": 1e-06, + "loss": 0.3586, + 
"mean_token_accuracy": 0.8892173767089844, + "num_tokens": 848900447.0, + "step": 22246 + }, + { + "epoch": 2.8300470678030782, + "ewc_loss": 0.05124172568321228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.562086228863336e-05, + "grad_norm": 31.812393188476562, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8664695024490356, + "num_tokens": 848945385.0, + "step": 22247 + }, + { + "epoch": 2.830174278081669, + "ewc_loss": 0.051307663321495056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.565383147157263e-05, + "grad_norm": 31.60470962524414, + "learning_rate": 1e-06, + "loss": 0.441, + "mean_token_accuracy": 0.8620359897613525, + "num_tokens": 848978783.0, + "step": 22248 + }, + { + "epoch": 2.8303014883602593, + "ewc_loss": 0.051280707120895386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5640352760092355e-05, + "grad_norm": 31.89647674560547, + "learning_rate": 1e-06, + "loss": 0.4575, + "mean_token_accuracy": 0.860508918762207, + "num_tokens": 849017324.0, + "step": 22249 + }, + { + "epoch": 2.8304286986388503, + "ewc_loss": 0.0514049269258976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5702463972265832e-05, + "grad_norm": 31.831085205078125, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8919245004653931, + "num_tokens": 849045928.0, + "step": 22250 + }, + { + "epoch": 2.8305559089174404, + "ewc_loss": 0.05120854824781418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5604274924262427e-05, + "grad_norm": 31.754669189453125, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8718844652175903, + "num_tokens": 849080855.0, + "step": 22251 + }, + { + "epoch": 2.830683119196031, + "ewc_loss": 0.05126996338367462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5634981284383684e-05, + "grad_norm": 31.767742156982422, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8753555417060852, + "num_tokens": 849114835.0, + "step": 22252 + }, + { + "epoch": 
2.8308103294746214, + "ewc_loss": 0.05118633806705475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5593168174964376e-05, + "grad_norm": 31.747081756591797, + "learning_rate": 1e-06, + "loss": 0.457, + "mean_token_accuracy": 0.8614477515220642, + "num_tokens": 849152632.0, + "step": 22253 + }, + { + "epoch": 2.830937539753212, + "ewc_loss": 0.05125485360622406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.562742702139076e-05, + "grad_norm": 31.84520149230957, + "learning_rate": 1e-06, + "loss": 0.4019, + "mean_token_accuracy": 0.8807905912399292, + "num_tokens": 849189911.0, + "step": 22254 + }, + { + "epoch": 2.8310647500318025, + "ewc_loss": 0.05135934427380562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5679672035039403e-05, + "grad_norm": 31.777759552001953, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8702915906906128, + "num_tokens": 849227847.0, + "step": 22255 + }, + { + "epoch": 2.831191960310393, + "ewc_loss": 0.05125952512025833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.562976260378491e-05, + "grad_norm": 31.867238998413086, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8773660063743591, + "num_tokens": 849265778.0, + "step": 22256 + }, + { + "epoch": 2.8313191705889835, + "ewc_loss": 0.0513920783996582, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5696039301692508e-05, + "grad_norm": 31.825468063354492, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8722149729728699, + "num_tokens": 849302488.0, + "step": 22257 + }, + { + "epoch": 2.831446380867574, + "ewc_loss": 0.051246870309114456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5623434339649975e-05, + "grad_norm": 31.88370704650879, + "learning_rate": 1e-06, + "loss": 0.3675, + "mean_token_accuracy": 0.8861522674560547, + "num_tokens": 849337483.0, + "step": 22258 + }, + { + "epoch": 2.8315735911461646, + "ewc_loss": 0.05126119405031204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.563059751992114e-05, + "grad_norm": 31.674394607543945, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8780966997146606, + "num_tokens": 849378491.0, + "step": 22259 + }, + { + "epoch": 2.831700801424755, + "ewc_loss": 0.05129446089267731, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5647230359027162e-05, + "grad_norm": 32.01166915893555, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8789734840393066, + "num_tokens": 849418897.0, + "step": 22260 + }, + { + "epoch": 2.8318280117033456, + "ewc_loss": 0.05126360431313515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5631801690906286e-05, + "grad_norm": 31.700077056884766, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.876162588596344, + "num_tokens": 849457057.0, + "step": 22261 + }, + { + "epoch": 2.831955221981936, + "ewc_loss": 0.05126441642642021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5632207325543277e-05, + "grad_norm": 32.00247573852539, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.875976026058197, + "num_tokens": 849488825.0, + "step": 22262 + }, + { + "epoch": 2.8320824322605267, + "ewc_loss": 0.05129831284284592, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5649156668805517e-05, + "grad_norm": 31.757734298706055, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8796597123146057, + "num_tokens": 849528336.0, + "step": 22263 + }, + { + "epoch": 2.8322096425391172, + "ewc_loss": 0.05116112902760506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5580564397387207e-05, + "grad_norm": 31.863540649414062, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8854742050170898, + "num_tokens": 849565373.0, + "step": 22264 + }, + { + "epoch": 2.8323368528177078, + "ewc_loss": 0.051439352333545685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5719675250002183e-05, + "grad_norm": 31.87784767150879, + "learning_rate": 1e-06, + "loss": 0.373, + 
"mean_token_accuracy": 0.8834843039512634, + "num_tokens": 849600909.0, + "step": 22265 + }, + { + "epoch": 2.8324640630962983, + "ewc_loss": 0.051141634583473206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5570816433173604e-05, + "grad_norm": 31.634214401245117, + "learning_rate": 1e-06, + "loss": 0.4461, + "mean_token_accuracy": 0.8633168935775757, + "num_tokens": 849639499.0, + "step": 22266 + }, + { + "epoch": 2.832591273374889, + "ewc_loss": 0.051289550960063934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5644774723332375e-05, + "grad_norm": 31.905487060546875, + "learning_rate": 1e-06, + "loss": 0.3781, + "mean_token_accuracy": 0.8830085396766663, + "num_tokens": 849677232.0, + "step": 22267 + }, + { + "epoch": 2.8327184836534793, + "ewc_loss": 0.05141391605138779, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706958695081994e-05, + "grad_norm": 31.841480255126953, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8776235580444336, + "num_tokens": 849717187.0, + "step": 22268 + }, + { + "epoch": 2.83284569393207, + "ewc_loss": 0.05121799558401108, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5608997020754032e-05, + "grad_norm": 31.918603897094727, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8821882009506226, + "num_tokens": 849756545.0, + "step": 22269 + }, + { + "epoch": 2.83297290421066, + "ewc_loss": 0.05125308781862259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5626542992540635e-05, + "grad_norm": 31.826799392700195, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8763449788093567, + "num_tokens": 849788670.0, + "step": 22270 + }, + { + "epoch": 2.833100114489251, + "ewc_loss": 0.05120375007390976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.560187567723915e-05, + "grad_norm": 31.914047241210938, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8718211054801941, + "num_tokens": 849825715.0, + "step": 22271 + }, + { + "epoch": 
2.833227324767841, + "ewc_loss": 0.051276665180921555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5638331862865016e-05, + "grad_norm": 31.822389602661133, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8779851198196411, + "num_tokens": 849856435.0, + "step": 22272 + }, + { + "epoch": 2.833354535046432, + "ewc_loss": 0.05118144676089287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.559072345320601e-05, + "grad_norm": 31.837505340576172, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8875908255577087, + "num_tokens": 849895634.0, + "step": 22273 + }, + { + "epoch": 2.833481745325022, + "ewc_loss": 0.05131036415696144, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5655182980699465e-05, + "grad_norm": 31.780338287353516, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.8523712158203125, + "num_tokens": 849938646.0, + "step": 22274 + }, + { + "epoch": 2.833608955603613, + "ewc_loss": 0.05133712664246559, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5668563466751948e-05, + "grad_norm": 31.842296600341797, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.8694159984588623, + "num_tokens": 849978642.0, + "step": 22275 + }, + { + "epoch": 2.833736165882203, + "ewc_loss": 0.05132317170500755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5661585823399946e-05, + "grad_norm": 31.822093963623047, + "learning_rate": 1e-06, + "loss": 0.4204, + "mean_token_accuracy": 0.8719087839126587, + "num_tokens": 850017820.0, + "step": 22276 + }, + { + "epoch": 2.8338633761607936, + "ewc_loss": 0.05125607177615166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5628036382840946e-05, + "grad_norm": 31.808286666870117, + "learning_rate": 1e-06, + "loss": 0.4784, + "mean_token_accuracy": 0.8540827035903931, + "num_tokens": 850056619.0, + "step": 22277 + }, + { + "epoch": 2.833990586439384, + "ewc_loss": 0.051276832818984985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.563841553637758e-05, + "grad_norm": 31.82421112060547, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8704814910888672, + "num_tokens": 850092456.0, + "step": 22278 + }, + { + "epoch": 2.8341177967179747, + "ewc_loss": 0.0513058565557003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5652927433839068e-05, + "grad_norm": 31.80845832824707, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8697464466094971, + "num_tokens": 850131598.0, + "step": 22279 + }, + { + "epoch": 2.8342450069965652, + "ewc_loss": 0.051309164613485336, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5654582714196295e-05, + "grad_norm": 31.870946884155273, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8621072173118591, + "num_tokens": 850170884.0, + "step": 22280 + }, + { + "epoch": 2.8343722172751558, + "ewc_loss": 0.05123182758688927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5615914637455717e-05, + "grad_norm": 31.70049476623535, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.871005654335022, + "num_tokens": 850206989.0, + "step": 22281 + }, + { + "epoch": 2.8344994275537463, + "ewc_loss": 0.05132884904742241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.566442526585888e-05, + "grad_norm": 31.900615692138672, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8688005805015564, + "num_tokens": 850248602.0, + "step": 22282 + }, + { + "epoch": 2.834626637832337, + "ewc_loss": 0.051413003355264664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706502128741704e-05, + "grad_norm": 31.75872230529785, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8647930026054382, + "num_tokens": 850289790.0, + "step": 22283 + }, + { + "epoch": 2.8347538481109273, + "ewc_loss": 0.05132735148072243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.566367584222462e-05, + "grad_norm": 31.908618927001953, + "learning_rate": 1e-06, + "loss": 0.388, + 
"mean_token_accuracy": 0.8808358907699585, + "num_tokens": 850326688.0, + "step": 22284 + }, + { + "epoch": 2.834881058389518, + "ewc_loss": 0.051358483731746674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5679242753540166e-05, + "grad_norm": 31.722705841064453, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.883884608745575, + "num_tokens": 850366460.0, + "step": 22285 + }, + { + "epoch": 2.8350082686681084, + "ewc_loss": 0.051230911165475845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5615456252126023e-05, + "grad_norm": 31.707616806030273, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8784145712852478, + "num_tokens": 850400046.0, + "step": 22286 + }, + { + "epoch": 2.835135478946699, + "ewc_loss": 0.05142137408256531, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5710687623359263e-05, + "grad_norm": 31.957483291625977, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8639873266220093, + "num_tokens": 850434374.0, + "step": 22287 + }, + { + "epoch": 2.8352626892252895, + "ewc_loss": 0.0513552688062191, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.567763476690743e-05, + "grad_norm": 31.913429260253906, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8745748400688171, + "num_tokens": 850469056.0, + "step": 22288 + }, + { + "epoch": 2.83538989950388, + "ewc_loss": 0.05121493339538574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.560746725066565e-05, + "grad_norm": 31.766521453857422, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8700389862060547, + "num_tokens": 850505805.0, + "step": 22289 + }, + { + "epoch": 2.8355171097824705, + "ewc_loss": 0.05133572593331337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5667863155831583e-05, + "grad_norm": 31.914772033691406, + "learning_rate": 1e-06, + "loss": 0.4582, + "mean_token_accuracy": 0.8605080842971802, + "num_tokens": 850550056.0, + "step": 22290 + }, + { + "epoch": 
2.835644320061061, + "ewc_loss": 0.05137992650270462, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5689963877084665e-05, + "grad_norm": 31.85358428955078, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8797778487205505, + "num_tokens": 850585075.0, + "step": 22291 + }, + { + "epoch": 2.8357715303396516, + "ewc_loss": 0.0512089729309082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5604485927033238e-05, + "grad_norm": 31.778169631958008, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.883059024810791, + "num_tokens": 850625758.0, + "step": 22292 + }, + { + "epoch": 2.835898740618242, + "ewc_loss": 0.05128667503595352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5643337721703574e-05, + "grad_norm": 31.805322647094727, + "learning_rate": 1e-06, + "loss": 0.3664, + "mean_token_accuracy": 0.8864330053329468, + "num_tokens": 850664608.0, + "step": 22293 + }, + { + "epoch": 2.8360259508968326, + "ewc_loss": 0.05140020698308945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570010292402003e-05, + "grad_norm": 31.87875747680664, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8805710673332214, + "num_tokens": 850705897.0, + "step": 22294 + }, + { + "epoch": 2.8361531611754227, + "ewc_loss": 0.051285866647958755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5642933906055987e-05, + "grad_norm": 31.76953125, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.8683513402938843, + "num_tokens": 850746195.0, + "step": 22295 + }, + { + "epoch": 2.8362803714540137, + "ewc_loss": 0.05130526050925255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.565263093856629e-05, + "grad_norm": 31.827301025390625, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8795207142829895, + "num_tokens": 850785183.0, + "step": 22296 + }, + { + "epoch": 2.8364075817326038, + "ewc_loss": 0.051404304802417755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5702152925077826e-05, + "grad_norm": 31.875835418701172, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8767608404159546, + "num_tokens": 850819558.0, + "step": 22297 + }, + { + "epoch": 2.8365347920111947, + "ewc_loss": 0.051295045763254166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5647523216321133e-05, + "grad_norm": 31.7330265045166, + "learning_rate": 1e-06, + "loss": 0.4479, + "mean_token_accuracy": 0.8631643056869507, + "num_tokens": 850857385.0, + "step": 22298 + }, + { + "epoch": 2.836662002289785, + "ewc_loss": 0.051329370588064194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5664685381343588e-05, + "grad_norm": 31.883228302001953, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8674951791763306, + "num_tokens": 850895297.0, + "step": 22299 + }, + { + "epoch": 2.836789212568376, + "ewc_loss": 0.05138299986720085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5691499104141258e-05, + "grad_norm": 31.734508514404297, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.8721799850463867, + "num_tokens": 850930982.0, + "step": 22300 + }, + { + "epoch": 2.836916422846966, + "ewc_loss": 0.05125709995627403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5628549337852746e-05, + "grad_norm": 31.838932037353516, + "learning_rate": 1e-06, + "loss": 0.4418, + "mean_token_accuracy": 0.8683345317840576, + "num_tokens": 850969032.0, + "step": 22301 + }, + { + "epoch": 2.8370436331255564, + "ewc_loss": 0.05140308663249016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5701543563627638e-05, + "grad_norm": 31.834861755371094, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8758230209350586, + "num_tokens": 851008325.0, + "step": 22302 + }, + { + "epoch": 2.837170843404147, + "ewc_loss": 0.05124284699559212, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5621424356359057e-05, + "grad_norm": 31.69514274597168, + "learning_rate": 1e-06, + "loss": 0.4147, + 
"mean_token_accuracy": 0.874333381652832, + "num_tokens": 851044289.0, + "step": 22303 + }, + { + "epoch": 2.8372980536827375, + "ewc_loss": 0.051389917731285095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5694958821986802e-05, + "grad_norm": 31.928218841552734, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8786357641220093, + "num_tokens": 851079887.0, + "step": 22304 + }, + { + "epoch": 2.837425263961328, + "ewc_loss": 0.0513647124171257, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5682356863399036e-05, + "grad_norm": 31.766143798828125, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8762502670288086, + "num_tokens": 851124535.0, + "step": 22305 + }, + { + "epoch": 2.8375524742399185, + "ewc_loss": 0.051344163715839386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5672081392258406e-05, + "grad_norm": 31.83367347717285, + "learning_rate": 1e-06, + "loss": 0.3813, + "mean_token_accuracy": 0.8817098736763, + "num_tokens": 851162982.0, + "step": 22306 + }, + { + "epoch": 2.837679684518509, + "ewc_loss": 0.05126917362213135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5634586563683115e-05, + "grad_norm": 31.770614624023438, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8805780410766602, + "num_tokens": 851201864.0, + "step": 22307 + }, + { + "epoch": 2.8378068947970996, + "ewc_loss": 0.05139138549566269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569569187471643e-05, + "grad_norm": 31.732646942138672, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8760581016540527, + "num_tokens": 851241659.0, + "step": 22308 + }, + { + "epoch": 2.83793410507569, + "ewc_loss": 0.05127359554171562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5636798454797827e-05, + "grad_norm": 31.898733139038086, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8651204705238342, + "num_tokens": 851278861.0, + "step": 22309 + }, + { + "epoch": 
2.8380613153542806, + "ewc_loss": 0.05132756009697914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5663779524620622e-05, + "grad_norm": 31.80643653869629, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8756020665168762, + "num_tokens": 851314442.0, + "step": 22310 + }, + { + "epoch": 2.838188525632871, + "ewc_loss": 0.05126883462071419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5634417397668585e-05, + "grad_norm": 31.934057235717773, + "learning_rate": 1e-06, + "loss": 0.4262, + "mean_token_accuracy": 0.868604838848114, + "num_tokens": 851356322.0, + "step": 22311 + }, + { + "epoch": 2.8383157359114617, + "ewc_loss": 0.05129021778702736, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5645109417382628e-05, + "grad_norm": 31.750696182250977, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8612478971481323, + "num_tokens": 851395374.0, + "step": 22312 + }, + { + "epoch": 2.838442946190052, + "ewc_loss": 0.05123211443424225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5616056518629193e-05, + "grad_norm": 31.86165428161621, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.871645450592041, + "num_tokens": 851430957.0, + "step": 22313 + }, + { + "epoch": 2.8385701564686427, + "ewc_loss": 0.051350224763154984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5675111828604713e-05, + "grad_norm": 31.877784729003906, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8670357465744019, + "num_tokens": 851474357.0, + "step": 22314 + }, + { + "epoch": 2.8386973667472333, + "ewc_loss": 0.05120919644832611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5604598704376258e-05, + "grad_norm": 31.836517333984375, + "learning_rate": 1e-06, + "loss": 0.3737, + "mean_token_accuracy": 0.8858344554901123, + "num_tokens": 851510346.0, + "step": 22315 + }, + { + "epoch": 2.838824577025824, + "ewc_loss": 0.051285650581121445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5642824766691774e-05, + "grad_norm": 31.995040893554688, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8744646906852722, + "num_tokens": 851546798.0, + "step": 22316 + }, + { + "epoch": 2.8389517873044143, + "ewc_loss": 0.051239561289548874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.561977998993825e-05, + "grad_norm": 31.804359436035156, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8777179718017578, + "num_tokens": 851587819.0, + "step": 22317 + }, + { + "epoch": 2.839078997583005, + "ewc_loss": 0.05123025178909302, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5615125196054578e-05, + "grad_norm": 31.94859504699707, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8780487775802612, + "num_tokens": 851628764.0, + "step": 22318 + }, + { + "epoch": 2.8392062078615954, + "ewc_loss": 0.05127763748168945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5638819352025166e-05, + "grad_norm": 31.939821243286133, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8590419292449951, + "num_tokens": 851672206.0, + "step": 22319 + }, + { + "epoch": 2.8393334181401855, + "ewc_loss": 0.051276493817567825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563824637036305e-05, + "grad_norm": 31.81368637084961, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8671676516532898, + "num_tokens": 851709969.0, + "step": 22320 + }, + { + "epoch": 2.8394606284187764, + "ewc_loss": 0.051152054220438004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5576027837814763e-05, + "grad_norm": 31.81877899169922, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8664153814315796, + "num_tokens": 851749015.0, + "step": 22321 + }, + { + "epoch": 2.8395878386973665, + "ewc_loss": 0.051148671656847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5574336177669466e-05, + "grad_norm": 31.849905014038086, + "learning_rate": 1e-06, + "loss": 0.4007, + 
"mean_token_accuracy": 0.8765753507614136, + "num_tokens": 851787210.0, + "step": 22322 + }, + { + "epoch": 2.8397150489759575, + "ewc_loss": 0.051224544644355774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5612273020669818e-05, + "grad_norm": 31.90741729736328, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8674739599227905, + "num_tokens": 851824677.0, + "step": 22323 + }, + { + "epoch": 2.8398422592545476, + "ewc_loss": 0.05124606937170029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5623034161981195e-05, + "grad_norm": 31.776424407958984, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8773126602172852, + "num_tokens": 851863614.0, + "step": 22324 + }, + { + "epoch": 2.839969469533138, + "ewc_loss": 0.05116087570786476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558043706812896e-05, + "grad_norm": 31.873096466064453, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8665713667869568, + "num_tokens": 851908153.0, + "step": 22325 + }, + { + "epoch": 2.8400966798117286, + "ewc_loss": 0.05127460137009621, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5637300495873205e-05, + "grad_norm": 31.84671401977539, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8686350584030151, + "num_tokens": 851946473.0, + "step": 22326 + }, + { + "epoch": 2.840223890090319, + "ewc_loss": 0.05130494013428688, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5652470867498778e-05, + "grad_norm": 32.12796401977539, + "learning_rate": 1e-06, + "loss": 0.4476, + "mean_token_accuracy": 0.8621503710746765, + "num_tokens": 851991600.0, + "step": 22327 + }, + { + "epoch": 2.8403511003689097, + "ewc_loss": 0.05117378011345863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558689084253274e-05, + "grad_norm": 31.776832580566406, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8740482330322266, + "num_tokens": 852029468.0, + "step": 22328 + }, + { + "epoch": 
2.8404783106475002, + "ewc_loss": 0.051158905029296875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.557945299486164e-05, + "grad_norm": 31.951505661010742, + "learning_rate": 1e-06, + "loss": 0.3956, + "mean_token_accuracy": 0.8787667751312256, + "num_tokens": 852061400.0, + "step": 22329 + }, + { + "epoch": 2.8406055209260908, + "ewc_loss": 0.05129765719175339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.564882925071288e-05, + "grad_norm": 31.98822784423828, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8717703223228455, + "num_tokens": 852100596.0, + "step": 22330 + }, + { + "epoch": 2.8407327312046813, + "ewc_loss": 0.05120058357715607, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.560029133746866e-05, + "grad_norm": 32.00067901611328, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8760639429092407, + "num_tokens": 852140320.0, + "step": 22331 + }, + { + "epoch": 2.840859941483272, + "ewc_loss": 0.05117548629641533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5587743948563002e-05, + "grad_norm": 31.944913864135742, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8718981742858887, + "num_tokens": 852175969.0, + "step": 22332 + }, + { + "epoch": 2.8409871517618623, + "ewc_loss": 0.05114053934812546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5570268917363137e-05, + "grad_norm": 31.811861038208008, + "learning_rate": 1e-06, + "loss": 0.46, + "mean_token_accuracy": 0.8596140146255493, + "num_tokens": 852210816.0, + "step": 22333 + }, + { + "epoch": 2.841114362040453, + "ewc_loss": 0.05119702219963074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.559851054684259e-05, + "grad_norm": 31.977828979492188, + "learning_rate": 1e-06, + "loss": 0.4203, + "mean_token_accuracy": 0.8708279728889465, + "num_tokens": 852246684.0, + "step": 22334 + }, + { + "epoch": 2.8412415723190434, + "ewc_loss": 0.05117591470479965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5587956770323217e-05, + "grad_norm": 31.88187026977539, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8754616379737854, + "num_tokens": 852285283.0, + "step": 22335 + }, + { + "epoch": 2.841368782597634, + "ewc_loss": 0.05115964636206627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5579824068699963e-05, + "grad_norm": 31.896326065063477, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8809805512428284, + "num_tokens": 852323310.0, + "step": 22336 + }, + { + "epoch": 2.8414959928762245, + "ewc_loss": 0.05124271288514137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5621357053751126e-05, + "grad_norm": 31.906217575073242, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8761848211288452, + "num_tokens": 852360850.0, + "step": 22337 + }, + { + "epoch": 2.841623203154815, + "ewc_loss": 0.05117049068212509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558524465712253e-05, + "grad_norm": 32.13286209106445, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8655548691749573, + "num_tokens": 852399320.0, + "step": 22338 + }, + { + "epoch": 2.8417504134334055, + "ewc_loss": 0.05121209844946861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5606048438930884e-05, + "grad_norm": 31.83753204345703, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8696644902229309, + "num_tokens": 852430980.0, + "step": 22339 + }, + { + "epoch": 2.841877623711996, + "ewc_loss": 0.05122271925210953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5611359887989238e-05, + "grad_norm": 32.116127014160156, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8838500380516052, + "num_tokens": 852465688.0, + "step": 22340 + }, + { + "epoch": 2.8420048339905866, + "ewc_loss": 0.05116531625390053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5582658054190688e-05, + "grad_norm": 31.857948303222656, + "learning_rate": 1e-06, + "loss": 0.4222, + 
"mean_token_accuracy": 0.8713748455047607, + "num_tokens": 852504372.0, + "step": 22341 + }, + { + "epoch": 2.842132044269177, + "ewc_loss": 0.0510481521487236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.552407568146009e-05, + "grad_norm": 31.905561447143555, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8777685165405273, + "num_tokens": 852542232.0, + "step": 22342 + }, + { + "epoch": 2.8422592545477676, + "ewc_loss": 0.05118900164961815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5594501494197175e-05, + "grad_norm": 31.94040298461914, + "learning_rate": 1e-06, + "loss": 0.4542, + "mean_token_accuracy": 0.863778829574585, + "num_tokens": 852580005.0, + "step": 22343 + }, + { + "epoch": 2.842386464826358, + "ewc_loss": 0.0511881448328495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5594072212697938e-05, + "grad_norm": 31.917062759399414, + "learning_rate": 1e-06, + "loss": 0.3769, + "mean_token_accuracy": 0.8845853805541992, + "num_tokens": 852619912.0, + "step": 22344 + }, + { + "epoch": 2.8425136751049482, + "ewc_loss": 0.05127454176545143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563727139204275e-05, + "grad_norm": 31.899789810180664, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8756108283996582, + "num_tokens": 852654349.0, + "step": 22345 + }, + { + "epoch": 2.842640885383539, + "ewc_loss": 0.05114128440618515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5570641810190864e-05, + "grad_norm": 31.91536521911621, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8648453950881958, + "num_tokens": 852691300.0, + "step": 22346 + }, + { + "epoch": 2.8427680956621293, + "ewc_loss": 0.051238421350717545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5619210646254942e-05, + "grad_norm": 31.896146774291992, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8718206286430359, + "num_tokens": 852731228.0, + "step": 22347 + }, + { + "epoch": 
2.8428953059407203, + "ewc_loss": 0.051186881959438324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5593441023374908e-05, + "grad_norm": 31.96042823791504, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8718310594558716, + "num_tokens": 852768333.0, + "step": 22348 + }, + { + "epoch": 2.8430225162193103, + "ewc_loss": 0.05110107734799385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5550538339302875e-05, + "grad_norm": 31.793228149414062, + "learning_rate": 1e-06, + "loss": 0.5147, + "mean_token_accuracy": 0.8416249752044678, + "num_tokens": 852810962.0, + "step": 22349 + }, + { + "epoch": 2.843149726497901, + "ewc_loss": 0.05124478414654732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5622392058721744e-05, + "grad_norm": 31.95456314086914, + "learning_rate": 1e-06, + "loss": 0.3739, + "mean_token_accuracy": 0.8879935145378113, + "num_tokens": 852844326.0, + "step": 22350 + }, + { + "epoch": 2.8432769367764914, + "ewc_loss": 0.05119870603084564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5599352738936432e-05, + "grad_norm": 31.78081512451172, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8666422367095947, + "num_tokens": 852877737.0, + "step": 22351 + }, + { + "epoch": 2.843404147055082, + "ewc_loss": 0.05121053010225296, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5605264454497956e-05, + "grad_norm": 32.007991790771484, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.879818320274353, + "num_tokens": 852922085.0, + "step": 22352 + }, + { + "epoch": 2.8435313573336725, + "ewc_loss": 0.05123457685112953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5617287974455394e-05, + "grad_norm": 31.752763748168945, + "learning_rate": 1e-06, + "loss": 0.3604, + "mean_token_accuracy": 0.8886831998825073, + "num_tokens": 852960742.0, + "step": 22353 + }, + { + "epoch": 2.843658567612263, + "ewc_loss": 0.05119093507528305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5595467377570458e-05, + "grad_norm": 31.993412017822266, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.877548098564148, + "num_tokens": 852993502.0, + "step": 22354 + }, + { + "epoch": 2.8437857778908535, + "ewc_loss": 0.051297180354595184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5648590963101014e-05, + "grad_norm": 31.813011169433594, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8776522278785706, + "num_tokens": 853036844.0, + "step": 22355 + }, + { + "epoch": 2.843912988169444, + "ewc_loss": 0.05123931169509888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.561965629865881e-05, + "grad_norm": 31.906461715698242, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8719834089279175, + "num_tokens": 853070851.0, + "step": 22356 + }, + { + "epoch": 2.8440401984480346, + "ewc_loss": 0.05128628760576248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5643143089837395e-05, + "grad_norm": 31.890777587890625, + "learning_rate": 1e-06, + "loss": 0.4224, + "mean_token_accuracy": 0.8734139800071716, + "num_tokens": 853106116.0, + "step": 22357 + }, + { + "epoch": 2.844167408726625, + "ewc_loss": 0.05120376870036125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5601884772186168e-05, + "grad_norm": 31.792259216308594, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.870771050453186, + "num_tokens": 853152776.0, + "step": 22358 + }, + { + "epoch": 2.8442946190052156, + "ewc_loss": 0.051369860768318176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5684930733405054e-05, + "grad_norm": 32.099754333496094, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8711867332458496, + "num_tokens": 853194176.0, + "step": 22359 + }, + { + "epoch": 2.844421829283806, + "ewc_loss": 0.05125690996646881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5628454750403762e-05, + "grad_norm": 31.88140869140625, + "learning_rate": 1e-06, + "loss": 0.4265, + 
"mean_token_accuracy": 0.8707168102264404, + "num_tokens": 853236834.0, + "step": 22360 + }, + { + "epoch": 2.8445490395623967, + "ewc_loss": 0.05123305320739746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5616525817895308e-05, + "grad_norm": 31.94474220275879, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8641413450241089, + "num_tokens": 853274145.0, + "step": 22361 + }, + { + "epoch": 2.844676249840987, + "ewc_loss": 0.051194362342357635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5597180865588598e-05, + "grad_norm": 31.787363052368164, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8751978278160095, + "num_tokens": 853312569.0, + "step": 22362 + }, + { + "epoch": 2.8448034601195777, + "ewc_loss": 0.0512433797121048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5621689928811975e-05, + "grad_norm": 31.891359329223633, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8776965141296387, + "num_tokens": 853351742.0, + "step": 22363 + }, + { + "epoch": 2.8449306703981683, + "ewc_loss": 0.05126054212450981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5630271920817904e-05, + "grad_norm": 31.875537872314453, + "learning_rate": 1e-06, + "loss": 0.41, + "mean_token_accuracy": 0.8739301562309265, + "num_tokens": 853389607.0, + "step": 22364 + }, + { + "epoch": 2.845057880676759, + "ewc_loss": 0.05126979574561119, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563489761087112e-05, + "grad_norm": 31.907915115356445, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.879380464553833, + "num_tokens": 853427218.0, + "step": 22365 + }, + { + "epoch": 2.8451850909553493, + "ewc_loss": 0.05122623220086098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.561311521276366e-05, + "grad_norm": 31.863332748413086, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8771375417709351, + "num_tokens": 853464773.0, + "step": 22366 + }, + { + "epoch": 
2.84531230123394, + "ewc_loss": 0.05128597468137741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.564298665674869e-05, + "grad_norm": 31.943410873413086, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8724381923675537, + "num_tokens": 853507250.0, + "step": 22367 + }, + { + "epoch": 2.84543951151253, + "ewc_loss": 0.05131790041923523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5658950107754208e-05, + "grad_norm": 31.923864364624023, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8739678859710693, + "num_tokens": 853551947.0, + "step": 22368 + }, + { + "epoch": 2.845566721791121, + "ewc_loss": 0.05116719752550125, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.558359847171232e-05, + "grad_norm": 31.88767433166504, + "learning_rate": 1e-06, + "loss": 0.4378, + "mean_token_accuracy": 0.8641488552093506, + "num_tokens": 853590557.0, + "step": 22369 + }, + { + "epoch": 2.845693932069711, + "ewc_loss": 0.05126231163740158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5631155949668027e-05, + "grad_norm": 31.88526153564453, + "learning_rate": 1e-06, + "loss": 0.4439, + "mean_token_accuracy": 0.8678406476974487, + "num_tokens": 853630870.0, + "step": 22370 + }, + { + "epoch": 2.845821142348302, + "ewc_loss": 0.05129237845540047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5646189897088334e-05, + "grad_norm": 31.790353775024414, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.870032548904419, + "num_tokens": 853662360.0, + "step": 22371 + }, + { + "epoch": 2.845948352626892, + "ewc_loss": 0.05130496993660927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5652485419414006e-05, + "grad_norm": 31.90370750427246, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8668609261512756, + "num_tokens": 853703661.0, + "step": 22372 + }, + { + "epoch": 2.846075562905483, + "ewc_loss": 0.05137927085161209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5689634640002623e-05, + "grad_norm": 31.878231048583984, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8625880479812622, + "num_tokens": 853742398.0, + "step": 22373 + }, + { + "epoch": 2.846202773184073, + "ewc_loss": 0.051268480718135834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563424095569644e-05, + "grad_norm": 31.942699432373047, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8795368671417236, + "num_tokens": 853781479.0, + "step": 22374 + }, + { + "epoch": 2.8463299834626636, + "ewc_loss": 0.05132786184549332, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5663930500741117e-05, + "grad_norm": 31.82088851928711, + "learning_rate": 1e-06, + "loss": 0.4937, + "mean_token_accuracy": 0.8476060628890991, + "num_tokens": 853822330.0, + "step": 22375 + }, + { + "epoch": 2.846457193741254, + "ewc_loss": 0.05131727457046509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5658637241576798e-05, + "grad_norm": 31.935380935668945, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8600916266441345, + "num_tokens": 853862747.0, + "step": 22376 + }, + { + "epoch": 2.8465844040198447, + "ewc_loss": 0.05123266577720642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5616333005018532e-05, + "grad_norm": 31.77043914794922, + "learning_rate": 1e-06, + "loss": 0.3451, + "mean_token_accuracy": 0.8953502774238586, + "num_tokens": 853897124.0, + "step": 22377 + }, + { + "epoch": 2.8467116142984352, + "ewc_loss": 0.0513044074177742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5652203476056457e-05, + "grad_norm": 31.839521408081055, + "learning_rate": 1e-06, + "loss": 0.4648, + "mean_token_accuracy": 0.8598422408103943, + "num_tokens": 853930516.0, + "step": 22378 + }, + { + "epoch": 2.8468388245770258, + "ewc_loss": 0.051370736211538315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5685367290861905e-05, + "grad_norm": 31.94603729248047, + "learning_rate": 1e-06, + "loss": 0.3915, + 
"mean_token_accuracy": 0.8771904706954956, + "num_tokens": 853967016.0, + "step": 22379 + }, + { + "epoch": 2.8469660348556163, + "ewc_loss": 0.05137442424893379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56872117461171e-05, + "grad_norm": 31.80423355102539, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8554174304008484, + "num_tokens": 854008804.0, + "step": 22380 + }, + { + "epoch": 2.847093245134207, + "ewc_loss": 0.0512770339846611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5638517399784178e-05, + "grad_norm": 31.839319229125977, + "learning_rate": 1e-06, + "loss": 0.4606, + "mean_token_accuracy": 0.8601822853088379, + "num_tokens": 854048393.0, + "step": 22381 + }, + { + "epoch": 2.8472204554127973, + "ewc_loss": 0.051384907215833664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569245407357812e-05, + "grad_norm": 31.84524917602539, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8777835965156555, + "num_tokens": 854089480.0, + "step": 22382 + }, + { + "epoch": 2.847347665691388, + "ewc_loss": 0.05135878175497055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5679390091681853e-05, + "grad_norm": 31.896207809448242, + "learning_rate": 1e-06, + "loss": 0.3816, + "mean_token_accuracy": 0.8829017877578735, + "num_tokens": 854125677.0, + "step": 22383 + }, + { + "epoch": 2.8474748759699784, + "ewc_loss": 0.05140607804059982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5703038772917353e-05, + "grad_norm": 31.999420166015625, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8684452772140503, + "num_tokens": 854160485.0, + "step": 22384 + }, + { + "epoch": 2.847602086248569, + "ewc_loss": 0.05137200281023979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5686002118163742e-05, + "grad_norm": 31.835193634033203, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8650103807449341, + "num_tokens": 854200635.0, + "step": 22385 + }, + { + "epoch": 
2.8477292965271594, + "ewc_loss": 0.051276687532663345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563834459579084e-05, + "grad_norm": 31.794902801513672, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8765714764595032, + "num_tokens": 854240046.0, + "step": 22386 + }, + { + "epoch": 2.84785650680575, + "ewc_loss": 0.05139683559536934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5698418539832346e-05, + "grad_norm": 31.732866287231445, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8827016353607178, + "num_tokens": 854275880.0, + "step": 22387 + }, + { + "epoch": 2.8479837170843405, + "ewc_loss": 0.05138925835490227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569462958490476e-05, + "grad_norm": 31.96446990966797, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8738296031951904, + "num_tokens": 854311204.0, + "step": 22388 + }, + { + "epoch": 2.848110927362931, + "ewc_loss": 0.05142040178179741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5710200134199113e-05, + "grad_norm": 31.787796020507812, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8692359328269958, + "num_tokens": 854346076.0, + "step": 22389 + }, + { + "epoch": 2.8482381376415216, + "ewc_loss": 0.051357585936784744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.567879346315749e-05, + "grad_norm": 31.821165084838867, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8789311647415161, + "num_tokens": 854383075.0, + "step": 22390 + }, + { + "epoch": 2.848365347920112, + "ewc_loss": 0.05148867145180702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5744335289346054e-05, + "grad_norm": 32.03394317626953, + "learning_rate": 1e-06, + "loss": 0.4503, + "mean_token_accuracy": 0.8652102947235107, + "num_tokens": 854416011.0, + "step": 22391 + }, + { + "epoch": 2.8484925581987026, + "ewc_loss": 0.05143480747938156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5717403332237154e-05, + "grad_norm": 31.906051635742188, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8738740086555481, + "num_tokens": 854449520.0, + "step": 22392 + }, + { + "epoch": 2.8486197684772927, + "ewc_loss": 0.05126715451478958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5633577024564147e-05, + "grad_norm": 31.858625411987305, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8714411854743958, + "num_tokens": 854484355.0, + "step": 22393 + }, + { + "epoch": 2.8487469787558837, + "ewc_loss": 0.051482588052749634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5741293939063326e-05, + "grad_norm": 31.93218421936035, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8584654331207275, + "num_tokens": 854526238.0, + "step": 22394 + }, + { + "epoch": 2.8488741890344738, + "ewc_loss": 0.05134714022278786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5673569325590506e-05, + "grad_norm": 31.94107437133789, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8745459318161011, + "num_tokens": 854562619.0, + "step": 22395 + }, + { + "epoch": 2.8490013993130647, + "ewc_loss": 0.05138273164629936, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56913663179148e-05, + "grad_norm": 31.9136962890625, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.877288281917572, + "num_tokens": 854604974.0, + "step": 22396 + }, + { + "epoch": 2.849128609591655, + "ewc_loss": 0.051298901438713074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.564945134508889e-05, + "grad_norm": 31.73592758178711, + "learning_rate": 1e-06, + "loss": 0.3966, + "mean_token_accuracy": 0.8787713646888733, + "num_tokens": 854644066.0, + "step": 22397 + }, + { + "epoch": 2.849255819870246, + "ewc_loss": 0.05137384310364723, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5686920707812533e-05, + "grad_norm": 31.86595916748047, + "learning_rate": 1e-06, + "loss": 0.4103, + 
"mean_token_accuracy": 0.8751000165939331, + "num_tokens": 854682723.0, + "step": 22398 + }, + { + "epoch": 2.849383030148836, + "ewc_loss": 0.05142444744706154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5712222850415856e-05, + "grad_norm": 31.94173812866211, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8825451731681824, + "num_tokens": 854720292.0, + "step": 22399 + }, + { + "epoch": 2.8495102404274264, + "ewc_loss": 0.05140519514679909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570259675849229e-05, + "grad_norm": 31.80021858215332, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8687388896942139, + "num_tokens": 854756917.0, + "step": 22400 + }, + { + "epoch": 2.849637450706017, + "ewc_loss": 0.051366448402404785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5683224521344528e-05, + "grad_norm": 31.836441040039062, + "learning_rate": 1e-06, + "loss": 0.4188, + "mean_token_accuracy": 0.8738458752632141, + "num_tokens": 854798771.0, + "step": 22401 + }, + { + "epoch": 2.8497646609846075, + "ewc_loss": 0.05156636983156204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5783185265026987e-05, + "grad_norm": 31.981958389282227, + "learning_rate": 1e-06, + "loss": 0.4119, + "mean_token_accuracy": 0.8698086738586426, + "num_tokens": 854838193.0, + "step": 22402 + }, + { + "epoch": 2.849891871263198, + "ewc_loss": 0.05133119225502014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5665596695034765e-05, + "grad_norm": 31.825912475585938, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8668973445892334, + "num_tokens": 854871650.0, + "step": 22403 + }, + { + "epoch": 2.8500190815417885, + "ewc_loss": 0.05139586701989174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56979328696616e-05, + "grad_norm": 31.83296775817871, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8712825775146484, + "num_tokens": 854910757.0, + "step": 22404 + }, + { + "epoch": 
2.850146291820379, + "ewc_loss": 0.051400601863861084, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5700301193865016e-05, + "grad_norm": 31.79034423828125, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8609005212783813, + "num_tokens": 854945018.0, + "step": 22405 + }, + { + "epoch": 2.8502735020989696, + "ewc_loss": 0.051430828869342804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.571541517681908e-05, + "grad_norm": 31.74901580810547, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8652333617210388, + "num_tokens": 854988369.0, + "step": 22406 + }, + { + "epoch": 2.85040071237756, + "ewc_loss": 0.051442380994558334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5721190468175337e-05, + "grad_norm": 31.83914566040039, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8738465309143066, + "num_tokens": 855023935.0, + "step": 22407 + }, + { + "epoch": 2.8505279226561506, + "ewc_loss": 0.051482152193784714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57410756603349e-05, + "grad_norm": 31.85653305053711, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8716496229171753, + "num_tokens": 855062002.0, + "step": 22408 + }, + { + "epoch": 2.850655132934741, + "ewc_loss": 0.05140286311507225, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5701430786284618e-05, + "grad_norm": 31.806333541870117, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8757424354553223, + "num_tokens": 855101257.0, + "step": 22409 + }, + { + "epoch": 2.8507823432133317, + "ewc_loss": 0.0514441654086113, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5722081772983074e-05, + "grad_norm": 31.82145881652832, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.875560998916626, + "num_tokens": 855135834.0, + "step": 22410 + }, + { + "epoch": 2.850909553491922, + "ewc_loss": 0.051415592432022095, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5707795430207625e-05, + "grad_norm": 31.99602699279785, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8746618032455444, + "num_tokens": 855174156.0, + "step": 22411 + }, + { + "epoch": 2.8510367637705127, + "ewc_loss": 0.05136626586318016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.568313357187435e-05, + "grad_norm": 31.82732582092285, + "learning_rate": 1e-06, + "loss": 0.3867, + "mean_token_accuracy": 0.8820225596427917, + "num_tokens": 855207698.0, + "step": 22412 + }, + { + "epoch": 2.8511639740491033, + "ewc_loss": 0.0513409748673439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56704879575409e-05, + "grad_norm": 31.908723831176758, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.872078537940979, + "num_tokens": 855240636.0, + "step": 22413 + }, + { + "epoch": 2.851291184327694, + "ewc_loss": 0.05143570154905319, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5717850803630427e-05, + "grad_norm": 31.866901397705078, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8751978874206543, + "num_tokens": 855279923.0, + "step": 22414 + }, + { + "epoch": 2.8514183946062843, + "ewc_loss": 0.05135027691721916, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5675139113445766e-05, + "grad_norm": 31.908418655395508, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8608223795890808, + "num_tokens": 855315460.0, + "step": 22415 + }, + { + "epoch": 2.851545604884875, + "ewc_loss": 0.051463302224874496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.573165147623513e-05, + "grad_norm": 31.854520797729492, + "learning_rate": 1e-06, + "loss": 0.3819, + "mean_token_accuracy": 0.8810800313949585, + "num_tokens": 855351548.0, + "step": 22416 + }, + { + "epoch": 2.8516728151634654, + "ewc_loss": 0.051260147243738174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5630073650972918e-05, + "grad_norm": 31.927539825439453, + "learning_rate": 1e-06, + "loss": 0.471, + 
"mean_token_accuracy": 0.8542709350585938, + "num_tokens": 855390715.0, + "step": 22417 + }, + { + "epoch": 2.8518000254420555, + "ewc_loss": 0.05145781859755516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5728908440214582e-05, + "grad_norm": 31.960289001464844, + "learning_rate": 1e-06, + "loss": 0.4838, + "mean_token_accuracy": 0.858758270740509, + "num_tokens": 855431997.0, + "step": 22418 + }, + { + "epoch": 2.8519272357206464, + "ewc_loss": 0.05137507617473602, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5687537345220335e-05, + "grad_norm": 31.898164749145508, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8668044805526733, + "num_tokens": 855470242.0, + "step": 22419 + }, + { + "epoch": 2.8520544459992365, + "ewc_loss": 0.05142020061612129, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5710100089781918e-05, + "grad_norm": 31.876066207885742, + "learning_rate": 1e-06, + "loss": 0.4509, + "mean_token_accuracy": 0.8593873381614685, + "num_tokens": 855508901.0, + "step": 22420 + }, + { + "epoch": 2.8521816562778275, + "ewc_loss": 0.05145006999373436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5725035811774433e-05, + "grad_norm": 31.858112335205078, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.868635892868042, + "num_tokens": 855545290.0, + "step": 22421 + }, + { + "epoch": 2.8523088665564176, + "ewc_loss": 0.05137133598327637, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.568566742411349e-05, + "grad_norm": 31.754932403564453, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8696484565734863, + "num_tokens": 855579567.0, + "step": 22422 + }, + { + "epoch": 2.852436076835008, + "ewc_loss": 0.05143478512763977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5717392418300733e-05, + "grad_norm": 31.88983726501465, + "learning_rate": 1e-06, + "loss": 0.4401, + "mean_token_accuracy": 0.8652509450912476, + "num_tokens": 855621117.0, + "step": 22423 + }, + { + "epoch": 
2.8525632871135986, + "ewc_loss": 0.05150648579001427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5753242880455218e-05, + "grad_norm": 31.898155212402344, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8748860955238342, + "num_tokens": 855660494.0, + "step": 22424 + }, + { + "epoch": 2.852690497392189, + "ewc_loss": 0.051412880420684814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706440283101983e-05, + "grad_norm": 31.84441566467285, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8723719120025635, + "num_tokens": 855694043.0, + "step": 22425 + }, + { + "epoch": 2.8528177076707797, + "ewc_loss": 0.05141158029437065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5705790903884917e-05, + "grad_norm": 31.855716705322266, + "learning_rate": 1e-06, + "loss": 0.4571, + "mean_token_accuracy": 0.8582289814949036, + "num_tokens": 855734615.0, + "step": 22426 + }, + { + "epoch": 2.85294491794937, + "ewc_loss": 0.05150324106216431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5751620341907255e-05, + "grad_norm": 32.02023696899414, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8840366005897522, + "num_tokens": 855771141.0, + "step": 22427 + }, + { + "epoch": 2.8530721282279607, + "ewc_loss": 0.05133650079369545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5668250600574538e-05, + "grad_norm": 31.839183807373047, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8720139861106873, + "num_tokens": 855806877.0, + "step": 22428 + }, + { + "epoch": 2.8531993385065513, + "ewc_loss": 0.051374949514865875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5687475499580614e-05, + "grad_norm": 31.942100524902344, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8859437704086304, + "num_tokens": 855840076.0, + "step": 22429 + }, + { + "epoch": 2.853326548785142, + "ewc_loss": 0.05143950134515762, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.571974982856773e-05, + "grad_norm": 32.01115798950195, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8826136589050293, + "num_tokens": 855877972.0, + "step": 22430 + }, + { + "epoch": 2.8534537590637323, + "ewc_loss": 0.051384955644607544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5692477720440365e-05, + "grad_norm": 31.890392303466797, + "learning_rate": 1e-06, + "loss": 0.4409, + "mean_token_accuracy": 0.8658604621887207, + "num_tokens": 855917424.0, + "step": 22431 + }, + { + "epoch": 2.853580969342323, + "ewc_loss": 0.05143890157341957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5719451514305547e-05, + "grad_norm": 31.882369995117188, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8612918853759766, + "num_tokens": 855954853.0, + "step": 22432 + }, + { + "epoch": 2.8537081796209134, + "ewc_loss": 0.05141255259513855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706276574055664e-05, + "grad_norm": 31.983482360839844, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8745731115341187, + "num_tokens": 855993300.0, + "step": 22433 + }, + { + "epoch": 2.853835389899504, + "ewc_loss": 0.051436204463243484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5718101824168116e-05, + "grad_norm": 31.98649787902832, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8756043910980225, + "num_tokens": 856035307.0, + "step": 22434 + }, + { + "epoch": 2.8539626001780944, + "ewc_loss": 0.05140477046370506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570238575572148e-05, + "grad_norm": 32.038597106933594, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8702149987220764, + "num_tokens": 856071801.0, + "step": 22435 + }, + { + "epoch": 2.854089810456685, + "ewc_loss": 0.05139224976301193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5696124794194475e-05, + "grad_norm": 31.935039520263672, + "learning_rate": 1e-06, + "loss": 0.4423, + 
"mean_token_accuracy": 0.8598150014877319, + "num_tokens": 856112035.0, + "step": 22436 + }, + { + "epoch": 2.8542170207352755, + "ewc_loss": 0.05126430094242096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5632150936871767e-05, + "grad_norm": 31.94791030883789, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8771265745162964, + "num_tokens": 856148275.0, + "step": 22437 + }, + { + "epoch": 2.854344231013866, + "ewc_loss": 0.05140455439686775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5702276616357267e-05, + "grad_norm": 32.06488037109375, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.87406325340271, + "num_tokens": 856180330.0, + "step": 22438 + }, + { + "epoch": 2.8544714412924566, + "ewc_loss": 0.0513363853096962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5668192392913625e-05, + "grad_norm": 31.93845558166504, + "learning_rate": 1e-06, + "loss": 0.4638, + "mean_token_accuracy": 0.8585643768310547, + "num_tokens": 856219354.0, + "step": 22439 + }, + { + "epoch": 2.854598651571047, + "ewc_loss": 0.05129740759730339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5648703740444034e-05, + "grad_norm": 31.941892623901367, + "learning_rate": 1e-06, + "loss": 0.3873, + "mean_token_accuracy": 0.8833379745483398, + "num_tokens": 856257583.0, + "step": 22440 + }, + { + "epoch": 2.8547258618496376, + "ewc_loss": 0.051334526389837265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5667262889328413e-05, + "grad_norm": 31.969263076782227, + "learning_rate": 1e-06, + "loss": 0.4659, + "mean_token_accuracy": 0.855226457118988, + "num_tokens": 856298056.0, + "step": 22441 + }, + { + "epoch": 2.854853072128228, + "ewc_loss": 0.051296379417181015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.564818896644283e-05, + "grad_norm": 31.97138023376465, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8697419166564941, + "num_tokens": 856334109.0, + "step": 22442 + }, + { + "epoch": 
2.8549802824068182, + "ewc_loss": 0.0513424277305603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5671213734312914e-05, + "grad_norm": 31.984201431274414, + "learning_rate": 1e-06, + "loss": 0.4062, + "mean_token_accuracy": 0.8740878105163574, + "num_tokens": 856376655.0, + "step": 22443 + }, + { + "epoch": 2.855107492685409, + "ewc_loss": 0.0512327216565609, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5616360289859585e-05, + "grad_norm": 31.914535522460938, + "learning_rate": 1e-06, + "loss": 0.3621, + "mean_token_accuracy": 0.8888930082321167, + "num_tokens": 856409926.0, + "step": 22444 + }, + { + "epoch": 2.8552347029639993, + "ewc_loss": 0.05129944160580635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5649720555520616e-05, + "grad_norm": 31.909292221069336, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8629558086395264, + "num_tokens": 856451471.0, + "step": 22445 + }, + { + "epoch": 2.8553619132425903, + "ewc_loss": 0.0513819195330143, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5690958864288405e-05, + "grad_norm": 31.98798370361328, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8806948661804199, + "num_tokens": 856483237.0, + "step": 22446 + }, + { + "epoch": 2.8554891235211803, + "ewc_loss": 0.05118684843182564, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5593424652470276e-05, + "grad_norm": 31.799190521240234, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8629574775695801, + "num_tokens": 856521278.0, + "step": 22447 + }, + { + "epoch": 2.855616333799771, + "ewc_loss": 0.05129324644804001, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5646622816566378e-05, + "grad_norm": 31.957517623901367, + "learning_rate": 1e-06, + "loss": 0.4315, + "mean_token_accuracy": 0.8690992593765259, + "num_tokens": 856565048.0, + "step": 22448 + }, + { + "epoch": 2.8557435440783614, + "ewc_loss": 0.05138324201107025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5691620976431295e-05, + "grad_norm": 31.840097427368164, + "learning_rate": 1e-06, + "loss": 0.4087, + "mean_token_accuracy": 0.8753507137298584, + "num_tokens": 856603756.0, + "step": 22449 + }, + { + "epoch": 2.855870754356952, + "ewc_loss": 0.05141337588429451, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706687665660866e-05, + "grad_norm": 32.11891174316406, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8778130412101746, + "num_tokens": 856640677.0, + "step": 22450 + }, + { + "epoch": 2.8559979646355425, + "ewc_loss": 0.05125407129526138, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5627035938668996e-05, + "grad_norm": 31.723657608032227, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.873058557510376, + "num_tokens": 856679982.0, + "step": 22451 + }, + { + "epoch": 2.856125174914133, + "ewc_loss": 0.05126780644059181, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5633902623667382e-05, + "grad_norm": 32.057159423828125, + "learning_rate": 1e-06, + "loss": 0.4469, + "mean_token_accuracy": 0.8600921034812927, + "num_tokens": 856720621.0, + "step": 22452 + }, + { + "epoch": 2.8562523851927235, + "ewc_loss": 0.05141819268465042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570909600763116e-05, + "grad_norm": 31.906057357788086, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.8802399635314941, + "num_tokens": 856757172.0, + "step": 22453 + }, + { + "epoch": 2.856379595471314, + "ewc_loss": 0.05110858753323555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5554294552421197e-05, + "grad_norm": 31.835636138916016, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8691079616546631, + "num_tokens": 856796876.0, + "step": 22454 + }, + { + "epoch": 2.8565068057499046, + "ewc_loss": 0.05136546120047569, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5682729756226763e-05, + "grad_norm": 31.946956634521484, + "learning_rate": 1e-06, + "loss": 0.4199, + 
"mean_token_accuracy": 0.872147262096405, + "num_tokens": 856831148.0, + "step": 22455 + }, + { + "epoch": 2.856634016028495, + "ewc_loss": 0.05125904083251953, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.562952067819424e-05, + "grad_norm": 31.931854248046875, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.868196964263916, + "num_tokens": 856867091.0, + "step": 22456 + }, + { + "epoch": 2.8567612263070856, + "ewc_loss": 0.05133150517940521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.566575312812347e-05, + "grad_norm": 31.91179847717285, + "learning_rate": 1e-06, + "loss": 0.4552, + "mean_token_accuracy": 0.8615671396255493, + "num_tokens": 856906118.0, + "step": 22457 + }, + { + "epoch": 2.856888436585676, + "ewc_loss": 0.051277659833431244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5638830265961587e-05, + "grad_norm": 31.850528717041016, + "learning_rate": 1e-06, + "loss": 0.4966, + "mean_token_accuracy": 0.846044659614563, + "num_tokens": 856942694.0, + "step": 22458 + }, + { + "epoch": 2.8570156468642667, + "ewc_loss": 0.05129005014896393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5645025743870065e-05, + "grad_norm": 31.911537170410156, + "learning_rate": 1e-06, + "loss": 0.4396, + "mean_token_accuracy": 0.8640369772911072, + "num_tokens": 856977530.0, + "step": 22459 + }, + { + "epoch": 2.857142857142857, + "ewc_loss": 0.05124225094914436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5621126042096876e-05, + "grad_norm": 31.861799240112305, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8760716319084167, + "num_tokens": 857012417.0, + "step": 22460 + }, + { + "epoch": 2.8572700674214477, + "ewc_loss": 0.05131767690181732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5658839149400592e-05, + "grad_norm": 31.881587982177734, + "learning_rate": 1e-06, + "loss": 0.4587, + "mean_token_accuracy": 0.8655029535293579, + "num_tokens": 857053556.0, + "step": 22461 + }, + { + "epoch": 
2.8573972777000383, + "ewc_loss": 0.05133027210831642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5665136490715668e-05, + "grad_norm": 31.88677978515625, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.875420331954956, + "num_tokens": 857094514.0, + "step": 22462 + }, + { + "epoch": 2.857524487978629, + "ewc_loss": 0.051363151520490646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5681576516944915e-05, + "grad_norm": 31.78635025024414, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8732631802558899, + "num_tokens": 857133670.0, + "step": 22463 + }, + { + "epoch": 2.8576516982572193, + "ewc_loss": 0.05126586928963661, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5632934921304695e-05, + "grad_norm": 31.92986297607422, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8640163540840149, + "num_tokens": 857167934.0, + "step": 22464 + }, + { + "epoch": 2.85777890853581, + "ewc_loss": 0.051445137709379196, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5722569262143224e-05, + "grad_norm": 31.873580932617188, + "learning_rate": 1e-06, + "loss": 0.4703, + "mean_token_accuracy": 0.8546736240386963, + "num_tokens": 857208263.0, + "step": 22465 + }, + { + "epoch": 2.8579061188144, + "ewc_loss": 0.051377780735492706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5688890673336573e-05, + "grad_norm": 31.877235412597656, + "learning_rate": 1e-06, + "loss": 0.4736, + "mean_token_accuracy": 0.8544094562530518, + "num_tokens": 857255107.0, + "step": 22466 + }, + { + "epoch": 2.858033329092991, + "ewc_loss": 0.05146491155028343, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5732455469551496e-05, + "grad_norm": 31.873191833496094, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8691539168357849, + "num_tokens": 857298368.0, + "step": 22467 + }, + { + "epoch": 2.858160539371581, + "ewc_loss": 0.05145268887281418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5726343665155582e-05, + "grad_norm": 31.867712020874023, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8730437755584717, + "num_tokens": 857336899.0, + "step": 22468 + }, + { + "epoch": 2.858287749650172, + "ewc_loss": 0.051409199833869934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5704599465825595e-05, + "grad_norm": 31.87913703918457, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8764879107475281, + "num_tokens": 857378482.0, + "step": 22469 + }, + { + "epoch": 2.858414959928762, + "ewc_loss": 0.0514550544321537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.572752782725729e-05, + "grad_norm": 31.887557983398438, + "learning_rate": 1e-06, + "loss": 0.3897, + "mean_token_accuracy": 0.8821293115615845, + "num_tokens": 857413203.0, + "step": 22470 + }, + { + "epoch": 2.858542170207353, + "ewc_loss": 0.05143888294696808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5719440600369126e-05, + "grad_norm": 31.856735229492188, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.8699423670768738, + "num_tokens": 857446706.0, + "step": 22471 + }, + { + "epoch": 2.858669380485943, + "ewc_loss": 0.051444120705127716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.572205994511023e-05, + "grad_norm": 31.98251724243164, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8755009174346924, + "num_tokens": 857483415.0, + "step": 22472 + }, + { + "epoch": 2.8587965907645336, + "ewc_loss": 0.051433369517326355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5716684831422754e-05, + "grad_norm": 31.843284606933594, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8778914213180542, + "num_tokens": 857525323.0, + "step": 22473 + }, + { + "epoch": 2.858923801043124, + "ewc_loss": 0.0513826422393322, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569132084317971e-05, + "grad_norm": 31.89447784423828, + "learning_rate": 1e-06, + "loss": 0.3768, + 
"mean_token_accuracy": 0.888361930847168, + "num_tokens": 857564400.0, + "step": 22474 + }, + { + "epoch": 2.8590510113217147, + "ewc_loss": 0.051394447684288025, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5697223463794217e-05, + "grad_norm": 31.863140106201172, + "learning_rate": 1e-06, + "loss": 0.4184, + "mean_token_accuracy": 0.8708643913269043, + "num_tokens": 857602756.0, + "step": 22475 + }, + { + "epoch": 2.859178221600305, + "ewc_loss": 0.051445621997117996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5722811187733896e-05, + "grad_norm": 32.07291793823242, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8723521828651428, + "num_tokens": 857636875.0, + "step": 22476 + }, + { + "epoch": 2.8593054318788957, + "ewc_loss": 0.05138061195611954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569030584709253e-05, + "grad_norm": 31.79267120361328, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8887714743614197, + "num_tokens": 857671950.0, + "step": 22477 + }, + { + "epoch": 2.8594326421574863, + "ewc_loss": 0.05131249129772186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5656245270511135e-05, + "grad_norm": 31.968198776245117, + "learning_rate": 1e-06, + "loss": 0.3808, + "mean_token_accuracy": 0.8846696019172668, + "num_tokens": 857705274.0, + "step": 22478 + }, + { + "epoch": 2.859559852436077, + "ewc_loss": 0.05153447762131691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57672381849261e-05, + "grad_norm": 31.83256721496582, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8663030862808228, + "num_tokens": 857743730.0, + "step": 22479 + }, + { + "epoch": 2.8596870627146673, + "ewc_loss": 0.05132494494318962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5662471671239473e-05, + "grad_norm": 31.85849952697754, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8707044124603271, + "num_tokens": 857784055.0, + "step": 22480 + }, + { + "epoch": 
2.859814272993258, + "ewc_loss": 0.05147937312722206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.573968595243059e-05, + "grad_norm": 31.926095962524414, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8793663382530212, + "num_tokens": 857822431.0, + "step": 22481 + }, + { + "epoch": 2.8599414832718484, + "ewc_loss": 0.051494136452674866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574706741143018e-05, + "grad_norm": 31.932044982910156, + "learning_rate": 1e-06, + "loss": 0.3824, + "mean_token_accuracy": 0.8838986158370972, + "num_tokens": 857859258.0, + "step": 22482 + }, + { + "epoch": 2.860068693550439, + "ewc_loss": 0.051477063447237015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.573853089415934e-05, + "grad_norm": 31.788036346435547, + "learning_rate": 1e-06, + "loss": 0.4492, + "mean_token_accuracy": 0.863105058670044, + "num_tokens": 857901209.0, + "step": 22483 + }, + { + "epoch": 2.8601959038290294, + "ewc_loss": 0.051394883543252945, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5697441742522642e-05, + "grad_norm": 32.06580352783203, + "learning_rate": 1e-06, + "loss": 0.3763, + "mean_token_accuracy": 0.8855109214782715, + "num_tokens": 857935563.0, + "step": 22484 + }, + { + "epoch": 2.86032311410762, + "ewc_loss": 0.0514671690762043, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5733585061971098e-05, + "grad_norm": 31.993860244750977, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8627430200576782, + "num_tokens": 857966777.0, + "step": 22485 + }, + { + "epoch": 2.8604503243862105, + "ewc_loss": 0.05133546516299248, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5667732188594528e-05, + "grad_norm": 31.929731369018555, + "learning_rate": 1e-06, + "loss": 0.4039, + "mean_token_accuracy": 0.8775929808616638, + "num_tokens": 858005078.0, + "step": 22486 + }, + { + "epoch": 2.860577534664801, + "ewc_loss": 0.05140317603945732, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5701587219373323e-05, + "grad_norm": 32.12139129638672, + "learning_rate": 1e-06, + "loss": 0.4629, + "mean_token_accuracy": 0.8600186705589294, + "num_tokens": 858049288.0, + "step": 22487 + }, + { + "epoch": 2.8607047449433916, + "ewc_loss": 0.05141365900635719, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706829546834342e-05, + "grad_norm": 31.934967041015625, + "learning_rate": 1e-06, + "loss": 0.3582, + "mean_token_accuracy": 0.8919334411621094, + "num_tokens": 858089834.0, + "step": 22488 + }, + { + "epoch": 2.860831955221982, + "ewc_loss": 0.05129919946193695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5649600502219982e-05, + "grad_norm": 31.90130043029785, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8772926330566406, + "num_tokens": 858124152.0, + "step": 22489 + }, + { + "epoch": 2.8609591655005726, + "ewc_loss": 0.05138137564063072, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5690687834867276e-05, + "grad_norm": 32.00364303588867, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.876496434211731, + "num_tokens": 858163202.0, + "step": 22490 + }, + { + "epoch": 2.8610863757791627, + "ewc_loss": 0.05127187445759773, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.563593807280995e-05, + "grad_norm": 31.744422912597656, + "learning_rate": 1e-06, + "loss": 0.3556, + "mean_token_accuracy": 0.8914570212364197, + "num_tokens": 858199173.0, + "step": 22491 + }, + { + "epoch": 2.8612135860577537, + "ewc_loss": 0.051350854337215424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5675426513771527e-05, + "grad_norm": 31.99860382080078, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8710227608680725, + "num_tokens": 858239693.0, + "step": 22492 + }, + { + "epoch": 2.8613407963363438, + "ewc_loss": 0.05147092416882515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5735462259035558e-05, + "grad_norm": 31.78884506225586, + "learning_rate": 1e-06, + "loss": 0.3952, + 
"mean_token_accuracy": 0.8805023431777954, + "num_tokens": 858280720.0, + "step": 22493 + }, + { + "epoch": 2.8614680066149347, + "ewc_loss": 0.05139688029885292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569844036770519e-05, + "grad_norm": 31.954145431518555, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.884131669998169, + "num_tokens": 858314145.0, + "step": 22494 + }, + { + "epoch": 2.861595216893525, + "ewc_loss": 0.051433172076940536, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5716586605994962e-05, + "grad_norm": 31.75938606262207, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8745008707046509, + "num_tokens": 858355177.0, + "step": 22495 + }, + { + "epoch": 2.861722427172116, + "ewc_loss": 0.051370542496442795, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5685270884423517e-05, + "grad_norm": 31.9608154296875, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8763040900230408, + "num_tokens": 858393070.0, + "step": 22496 + }, + { + "epoch": 2.861849637450706, + "ewc_loss": 0.05150076746940613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5750383429112844e-05, + "grad_norm": 31.805593490600586, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8719788789749146, + "num_tokens": 858431827.0, + "step": 22497 + }, + { + "epoch": 2.8619768477292964, + "ewc_loss": 0.051383696496486664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569184835010674e-05, + "grad_norm": 31.96503257751465, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8706236481666565, + "num_tokens": 858474990.0, + "step": 22498 + }, + { + "epoch": 2.862104058007887, + "ewc_loss": 0.05153195187449455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576597580628004e-05, + "grad_norm": 31.842056274414062, + "learning_rate": 1e-06, + "loss": 0.3805, + "mean_token_accuracy": 0.8860893249511719, + "num_tokens": 858514698.0, + "step": 22499 + }, + { + "epoch": 
2.8622312682864774, + "ewc_loss": 0.05132657289505005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.566328657849226e-05, + "grad_norm": 31.86871337890625, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8733184337615967, + "num_tokens": 858557601.0, + "step": 22500 + }, + { + "epoch": 2.862358478565068, + "ewc_loss": 0.051595937460660934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5797968191909604e-05, + "grad_norm": 32.04496765136719, + "learning_rate": 1e-06, + "loss": 0.388, + "mean_token_accuracy": 0.881203830242157, + "num_tokens": 858596830.0, + "step": 22501 + }, + { + "epoch": 2.8624856888436585, + "ewc_loss": 0.05137144774198532, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5685723812785e-05, + "grad_norm": 31.74087905883789, + "learning_rate": 1e-06, + "loss": 0.4593, + "mean_token_accuracy": 0.8629623055458069, + "num_tokens": 858637555.0, + "step": 22502 + }, + { + "epoch": 2.862612899122249, + "ewc_loss": 0.051405876874923706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5702938728500158e-05, + "grad_norm": 31.823511123657227, + "learning_rate": 1e-06, + "loss": 0.3542, + "mean_token_accuracy": 0.8916789293289185, + "num_tokens": 858672813.0, + "step": 22503 + }, + { + "epoch": 2.8627401094008396, + "ewc_loss": 0.05155445262789726, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.577722625574097e-05, + "grad_norm": 32.03731155395508, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8787912130355835, + "num_tokens": 858710743.0, + "step": 22504 + }, + { + "epoch": 2.86286731967943, + "ewc_loss": 0.05149417743086815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5747089239303023e-05, + "grad_norm": 31.9735107421875, + "learning_rate": 1e-06, + "loss": 0.3708, + "mean_token_accuracy": 0.8862729072570801, + "num_tokens": 858739486.0, + "step": 22505 + }, + { + "epoch": 2.8629945299580206, + "ewc_loss": 0.05138783901929855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5693920179037377e-05, 
+ "grad_norm": 31.908477783203125, + "learning_rate": 1e-06, + "loss": 0.4653, + "mean_token_accuracy": 0.8523291945457458, + "num_tokens": 858780678.0, + "step": 22506 + }, + { + "epoch": 2.863121740236611, + "ewc_loss": 0.05134466290473938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.567233059380669e-05, + "grad_norm": 31.90666961669922, + "learning_rate": 1e-06, + "loss": 0.4331, + "mean_token_accuracy": 0.8682172298431396, + "num_tokens": 858826112.0, + "step": 22507 + }, + { + "epoch": 2.8632489505152017, + "ewc_loss": 0.05140121281147003, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5700606784084812e-05, + "grad_norm": 32.02315139770508, + "learning_rate": 1e-06, + "loss": 0.3809, + "mean_token_accuracy": 0.88355553150177, + "num_tokens": 858862158.0, + "step": 22508 + }, + { + "epoch": 2.863376160793792, + "ewc_loss": 0.051379140466451645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56895709753735e-05, + "grad_norm": 31.739582061767578, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8777321577072144, + "num_tokens": 858905270.0, + "step": 22509 + }, + { + "epoch": 2.8635033710723827, + "ewc_loss": 0.05128241330385208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.564120586612262e-05, + "grad_norm": 31.955358505249023, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8657875061035156, + "num_tokens": 858941802.0, + "step": 22510 + }, + { + "epoch": 2.8636305813509733, + "ewc_loss": 0.05141065642237663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5705328880576417e-05, + "grad_norm": 31.798938751220703, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8762803077697754, + "num_tokens": 858974672.0, + "step": 22511 + }, + { + "epoch": 2.863757791629564, + "ewc_loss": 0.05135951191186905, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5679755708551966e-05, + "grad_norm": 31.98052406311035, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 
0.8715732097625732, + "num_tokens": 859015305.0, + "step": 22512 + }, + { + "epoch": 2.8638850019081543, + "ewc_loss": 0.05146238952875137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.573119490989484e-05, + "grad_norm": 31.965429306030273, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8640882968902588, + "num_tokens": 859050646.0, + "step": 22513 + }, + { + "epoch": 2.864012212186745, + "ewc_loss": 0.05130242928862572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5651213945820928e-05, + "grad_norm": 31.823837280273438, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8819242715835571, + "num_tokens": 859090139.0, + "step": 22514 + }, + { + "epoch": 2.8641394224653354, + "ewc_loss": 0.05131283774971962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5656418074504472e-05, + "grad_norm": 31.976198196411133, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8771550059318542, + "num_tokens": 859130661.0, + "step": 22515 + }, + { + "epoch": 2.8642666327439255, + "ewc_loss": 0.05133698880672455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5668494345154613e-05, + "grad_norm": 31.891895294189453, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.8720308542251587, + "num_tokens": 859166399.0, + "step": 22516 + }, + { + "epoch": 2.8643938430225164, + "ewc_loss": 0.05141333490610123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706667656777427e-05, + "grad_norm": 31.953266143798828, + "learning_rate": 1e-06, + "loss": 0.418, + "mean_token_accuracy": 0.8735342025756836, + "num_tokens": 859208349.0, + "step": 22517 + }, + { + "epoch": 2.8645210533011065, + "ewc_loss": 0.05140609294176102, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5703046048874967e-05, + "grad_norm": 32.015174865722656, + "learning_rate": 1e-06, + "loss": 0.4049, + "mean_token_accuracy": 0.8761001825332642, + "num_tokens": 859243784.0, + "step": 22518 + }, + { + "epoch": 2.8646482635796975, + 
"ewc_loss": 0.0515078604221344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575393045844976e-05, + "grad_norm": 31.929439544677734, + "learning_rate": 1e-06, + "loss": 0.4561, + "mean_token_accuracy": 0.8585926294326782, + "num_tokens": 859278345.0, + "step": 22519 + }, + { + "epoch": 2.8647754738582876, + "ewc_loss": 0.05137130990624428, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5685654691187665e-05, + "grad_norm": 31.93490219116211, + "learning_rate": 1e-06, + "loss": 0.456, + "mean_token_accuracy": 0.8642245531082153, + "num_tokens": 859317454.0, + "step": 22520 + }, + { + "epoch": 2.864902684136878, + "ewc_loss": 0.051523592323064804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576179576863069e-05, + "grad_norm": 32.03104782104492, + "learning_rate": 1e-06, + "loss": 0.4333, + "mean_token_accuracy": 0.8682911396026611, + "num_tokens": 859352076.0, + "step": 22521 + }, + { + "epoch": 2.8650298944154686, + "ewc_loss": 0.05140194296836853, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570097058196552e-05, + "grad_norm": 31.918651580810547, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8805568814277649, + "num_tokens": 859389957.0, + "step": 22522 + }, + { + "epoch": 2.865157104694059, + "ewc_loss": 0.05142837390303612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5714187358971685e-05, + "grad_norm": 31.934101104736328, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8721904754638672, + "num_tokens": 859425167.0, + "step": 22523 + }, + { + "epoch": 2.8652843149726497, + "ewc_loss": 0.0514519140124321, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5725956220412627e-05, + "grad_norm": 31.910400390625, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8704224824905396, + "num_tokens": 859470999.0, + "step": 22524 + }, + { + "epoch": 2.86541152525124, + "ewc_loss": 0.05140659958124161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570329888840206e-05, + "grad_norm": 
32.01397705078125, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8798483610153198, + "num_tokens": 859503763.0, + "step": 22525 + }, + { + "epoch": 2.8655387355298307, + "ewc_loss": 0.051357246935367584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.567862429714296e-05, + "grad_norm": 31.83014488220215, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.8825439810752869, + "num_tokens": 859537739.0, + "step": 22526 + }, + { + "epoch": 2.8656659458084213, + "ewc_loss": 0.05132804065942764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.566401963122189e-05, + "grad_norm": 31.886369705200195, + "learning_rate": 1e-06, + "loss": 0.3796, + "mean_token_accuracy": 0.8821108937263489, + "num_tokens": 859573497.0, + "step": 22527 + }, + { + "epoch": 2.865793156087012, + "ewc_loss": 0.051473576575517654, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5736788302310742e-05, + "grad_norm": 31.98443603515625, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8693345785140991, + "num_tokens": 859606997.0, + "step": 22528 + }, + { + "epoch": 2.8659203663656023, + "ewc_loss": 0.05136403813958168, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5682018531369977e-05, + "grad_norm": 31.863515853881836, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8600555658340454, + "num_tokens": 859642278.0, + "step": 22529 + }, + { + "epoch": 2.866047576644193, + "ewc_loss": 0.051400039345026016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5700019250507466e-05, + "grad_norm": 31.977853775024414, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8840274810791016, + "num_tokens": 859684728.0, + "step": 22530 + }, + { + "epoch": 2.8661747869227834, + "ewc_loss": 0.05142248049378395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.571124059613794e-05, + "grad_norm": 31.942541122436523, + "learning_rate": 1e-06, + "loss": 0.4452, + "mean_token_accuracy": 0.8661960363388062, + 
"num_tokens": 859720946.0, + "step": 22531 + }, + { + "epoch": 2.866301997201374, + "ewc_loss": 0.05148245766758919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5741228455444798e-05, + "grad_norm": 31.942941665649414, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8700957298278809, + "num_tokens": 859752605.0, + "step": 22532 + }, + { + "epoch": 2.8664292074799644, + "ewc_loss": 0.051490459591150284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57452302321326e-05, + "grad_norm": 31.96634864807129, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8850693106651306, + "num_tokens": 859784206.0, + "step": 22533 + }, + { + "epoch": 2.866556417758555, + "ewc_loss": 0.05153940990567207, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5769704734557308e-05, + "grad_norm": 31.901126861572266, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8688527345657349, + "num_tokens": 859828142.0, + "step": 22534 + }, + { + "epoch": 2.8666836280371455, + "ewc_loss": 0.05149872973561287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574936479504686e-05, + "grad_norm": 31.986242294311523, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8679918050765991, + "num_tokens": 859870032.0, + "step": 22535 + }, + { + "epoch": 2.866810838315736, + "ewc_loss": 0.051480405032634735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5740202545421198e-05, + "grad_norm": 31.91245460510254, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8634496331214905, + "num_tokens": 859911417.0, + "step": 22536 + }, + { + "epoch": 2.8669380485943265, + "ewc_loss": 0.05143320932984352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5716604795888998e-05, + "grad_norm": 31.86048126220703, + "learning_rate": 1e-06, + "loss": 0.3698, + "mean_token_accuracy": 0.8841623067855835, + "num_tokens": 859953929.0, + "step": 22537 + }, + { + "epoch": 2.867065258872917, + "ewc_loss": 0.05156727135181427, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5783636374399066e-05, + "grad_norm": 32.09674072265625, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8769716024398804, + "num_tokens": 859985402.0, + "step": 22538 + }, + { + "epoch": 2.8671924691515076, + "ewc_loss": 0.05149660259485245, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5748300686245784e-05, + "grad_norm": 31.82500457763672, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8698381781578064, + "num_tokens": 860017374.0, + "step": 22539 + }, + { + "epoch": 2.867319679430098, + "ewc_loss": 0.051415301859378815, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5707651730044745e-05, + "grad_norm": 31.926340103149414, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.881641685962677, + "num_tokens": 860058217.0, + "step": 22540 + }, + { + "epoch": 2.867446889708688, + "ewc_loss": 0.051494162529706955, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574708196334541e-05, + "grad_norm": 31.891450881958008, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.856473445892334, + "num_tokens": 860092317.0, + "step": 22541 + }, + { + "epoch": 2.867574099987279, + "ewc_loss": 0.051494050770998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57470255746739e-05, + "grad_norm": 31.95024871826172, + "learning_rate": 1e-06, + "loss": 0.4142, + "mean_token_accuracy": 0.869054913520813, + "num_tokens": 860125067.0, + "step": 22542 + }, + { + "epoch": 2.8677013102658693, + "ewc_loss": 0.051563628017902374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5781813747016713e-05, + "grad_norm": 31.918733596801758, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8859042525291443, + "num_tokens": 860163805.0, + "step": 22543 + }, + { + "epoch": 2.8678285205444602, + "ewc_loss": 0.05147076025605202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5735380404512398e-05, + "grad_norm": 31.94378662109375, + 
"learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8595972657203674, + "num_tokens": 860200178.0, + "step": 22544 + }, + { + "epoch": 2.8679557308230503, + "ewc_loss": 0.05147116258740425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5735580493346788e-05, + "grad_norm": 31.931032180786133, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8725805282592773, + "num_tokens": 860235283.0, + "step": 22545 + }, + { + "epoch": 2.868082941101641, + "ewc_loss": 0.05148038640618324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574019345047418e-05, + "grad_norm": 31.839614868164062, + "learning_rate": 1e-06, + "loss": 0.3626, + "mean_token_accuracy": 0.8895962238311768, + "num_tokens": 860276776.0, + "step": 22546 + }, + { + "epoch": 2.8682101513802314, + "ewc_loss": 0.05151544138789177, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5757721232366748e-05, + "grad_norm": 32.01659393310547, + "learning_rate": 1e-06, + "loss": 0.3975, + "mean_token_accuracy": 0.8779701590538025, + "num_tokens": 860318492.0, + "step": 22547 + }, + { + "epoch": 2.868337361658822, + "ewc_loss": 0.05156947672367096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5784738681977615e-05, + "grad_norm": 31.9357967376709, + "learning_rate": 1e-06, + "loss": 0.452, + "mean_token_accuracy": 0.8589152097702026, + "num_tokens": 860358556.0, + "step": 22548 + }, + { + "epoch": 2.8684645719374124, + "ewc_loss": 0.05141860619187355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5709303372423165e-05, + "grad_norm": 31.87395477294922, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8633711338043213, + "num_tokens": 860391114.0, + "step": 22549 + }, + { + "epoch": 2.868591782216003, + "ewc_loss": 0.051546208560466766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5773104425752535e-05, + "grad_norm": 32.063045501708984, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.868248462677002, + "num_tokens": 860430696.0, + 
"step": 22550 + }, + { + "epoch": 2.8687189924945935, + "ewc_loss": 0.051500339061021805, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5750168788363226e-05, + "grad_norm": 31.903532028198242, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8799875378608704, + "num_tokens": 860471055.0, + "step": 22551 + }, + { + "epoch": 2.868846202773184, + "ewc_loss": 0.051364123821258545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5682062187115662e-05, + "grad_norm": 32.01088333129883, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.866557240486145, + "num_tokens": 860512472.0, + "step": 22552 + }, + { + "epoch": 2.8689734130517746, + "ewc_loss": 0.05159035697579384, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5795177862164564e-05, + "grad_norm": 31.938682556152344, + "learning_rate": 1e-06, + "loss": 0.4278, + "mean_token_accuracy": 0.868370771408081, + "num_tokens": 860541014.0, + "step": 22553 + }, + { + "epoch": 2.869100623330365, + "ewc_loss": 0.051407843828201294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5703920982778072e-05, + "grad_norm": 31.93360710144043, + "learning_rate": 1e-06, + "loss": 0.3716, + "mean_token_accuracy": 0.8873872756958008, + "num_tokens": 860585025.0, + "step": 22554 + }, + { + "epoch": 2.8692278336089556, + "ewc_loss": 0.051547758281230927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5773879315238446e-05, + "grad_norm": 31.918397903442383, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8808517456054688, + "num_tokens": 860616361.0, + "step": 22555 + }, + { + "epoch": 2.869355043887546, + "ewc_loss": 0.051512930542230606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57564661296783e-05, + "grad_norm": 32.0970458984375, + "learning_rate": 1e-06, + "loss": 0.4525, + "mean_token_accuracy": 0.8643516302108765, + "num_tokens": 860655976.0, + "step": 22556 + }, + { + "epoch": 2.8694822541661367, + "ewc_loss": 0.051524728536605835, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.5762365112314e-05, + "grad_norm": 31.81146812438965, + "learning_rate": 1e-06, + "loss": 0.4545, + "mean_token_accuracy": 0.8609521389007568, + "num_tokens": 860703949.0, + "step": 22557 + }, + { + "epoch": 2.869609464444727, + "ewc_loss": 0.051415976136922836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57079882430844e-05, + "grad_norm": 32.02897644042969, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.872194766998291, + "num_tokens": 860739935.0, + "step": 22558 + }, + { + "epoch": 2.8697366747233177, + "ewc_loss": 0.051558203995227814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5779101633816026e-05, + "grad_norm": 31.943401336669922, + "learning_rate": 1e-06, + "loss": 0.4024, + "mean_token_accuracy": 0.8766297101974487, + "num_tokens": 860782483.0, + "step": 22559 + }, + { + "epoch": 2.8698638850019083, + "ewc_loss": 0.05149557441473007, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5747787731233984e-05, + "grad_norm": 32.001285552978516, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.8615782856941223, + "num_tokens": 860816821.0, + "step": 22560 + }, + { + "epoch": 2.869991095280499, + "ewc_loss": 0.0515546053647995, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.577730265329592e-05, + "grad_norm": 32.05715560913086, + "learning_rate": 1e-06, + "loss": 0.4478, + "mean_token_accuracy": 0.8611327409744263, + "num_tokens": 860851313.0, + "step": 22561 + }, + { + "epoch": 2.8701183055590893, + "ewc_loss": 0.05136032775044441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.568016316217836e-05, + "grad_norm": 31.74802017211914, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8816705942153931, + "num_tokens": 860890540.0, + "step": 22562 + }, + { + "epoch": 2.87024551583768, + "ewc_loss": 0.05154469981789589, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5772349545150064e-05, + "grad_norm": 32.02008056640625, + "learning_rate": 1e-06, + "loss": 0.4342, + 
"mean_token_accuracy": 0.8673279285430908, + "num_tokens": 860929686.0, + "step": 22563 + }, + { + "epoch": 2.87037272611627, + "ewc_loss": 0.051552075892686844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5776038455660455e-05, + "grad_norm": 31.910518646240234, + "learning_rate": 1e-06, + "loss": 0.4107, + "mean_token_accuracy": 0.8756104707717896, + "num_tokens": 860968186.0, + "step": 22564 + }, + { + "epoch": 2.870499936394861, + "ewc_loss": 0.05149392411112785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5746961910044774e-05, + "grad_norm": 31.97701072692871, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8685152530670166, + "num_tokens": 861007252.0, + "step": 22565 + }, + { + "epoch": 2.870627146673451, + "ewc_loss": 0.05154482275247574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5772411390789784e-05, + "grad_norm": 31.98264503479004, + "learning_rate": 1e-06, + "loss": 0.408, + "mean_token_accuracy": 0.8730438947677612, + "num_tokens": 861049898.0, + "step": 22566 + }, + { + "epoch": 2.870754356952042, + "ewc_loss": 0.05140247195959091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570123615441844e-05, + "grad_norm": 31.976776123046875, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8808688521385193, + "num_tokens": 861087711.0, + "step": 22567 + }, + { + "epoch": 2.870881567230632, + "ewc_loss": 0.051480092108249664, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5740046112332493e-05, + "grad_norm": 31.980253219604492, + "learning_rate": 1e-06, + "loss": 0.4766, + "mean_token_accuracy": 0.8526636362075806, + "num_tokens": 861128566.0, + "step": 22568 + }, + { + "epoch": 2.871008777509223, + "ewc_loss": 0.05139222741127014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5696113880258054e-05, + "grad_norm": 31.903974533081055, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8777143359184265, + "num_tokens": 861156299.0, + "step": 22569 + }, + { + "epoch": 
2.871135987787813, + "ewc_loss": 0.051455527544021606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.572776429587975e-05, + "grad_norm": 31.929759979248047, + "learning_rate": 1e-06, + "loss": 0.3767, + "mean_token_accuracy": 0.8837819695472717, + "num_tokens": 861189376.0, + "step": 22570 + }, + { + "epoch": 2.8712631980664036, + "ewc_loss": 0.05148547515273094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574273821664974e-05, + "grad_norm": 31.932432174682617, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8694109320640564, + "num_tokens": 861227156.0, + "step": 22571 + }, + { + "epoch": 2.871390408344994, + "ewc_loss": 0.05156361684203148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5781808290048502e-05, + "grad_norm": 32.12827682495117, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8726651072502136, + "num_tokens": 861268088.0, + "step": 22572 + }, + { + "epoch": 2.8715176186235847, + "ewc_loss": 0.05148731544613838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5743658625287935e-05, + "grad_norm": 31.80593490600586, + "learning_rate": 1e-06, + "loss": 0.4047, + "mean_token_accuracy": 0.8761651515960693, + "num_tokens": 861308892.0, + "step": 22573 + }, + { + "epoch": 2.871644828902175, + "ewc_loss": 0.05133286863565445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56664352491498e-05, + "grad_norm": 31.99757194519043, + "learning_rate": 1e-06, + "loss": 0.4069, + "mean_token_accuracy": 0.8757888078689575, + "num_tokens": 861350108.0, + "step": 22574 + }, + { + "epoch": 2.8717720391807657, + "ewc_loss": 0.05163205415010452, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5816027118708007e-05, + "grad_norm": 31.943498611450195, + "learning_rate": 1e-06, + "loss": 0.3881, + "mean_token_accuracy": 0.8830193281173706, + "num_tokens": 861390553.0, + "step": 22575 + }, + { + "epoch": 2.8718992494593563, + "ewc_loss": 0.0513981394469738, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5699069738038816e-05, + "grad_norm": 31.959688186645508, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8741985559463501, + "num_tokens": 861425878.0, + "step": 22576 + }, + { + "epoch": 2.872026459737947, + "ewc_loss": 0.051622774451971054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581138687673956e-05, + "grad_norm": 32.003135681152344, + "learning_rate": 1e-06, + "loss": 0.371, + "mean_token_accuracy": 0.8888834714889526, + "num_tokens": 861460935.0, + "step": 22577 + }, + { + "epoch": 2.8721536700165373, + "ewc_loss": 0.05141589418053627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570794640632812e-05, + "grad_norm": 31.951107025146484, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8562317490577698, + "num_tokens": 861500446.0, + "step": 22578 + }, + { + "epoch": 2.872280880295128, + "ewc_loss": 0.051535870879888535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5767934857867658e-05, + "grad_norm": 32.100345611572266, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8612263202667236, + "num_tokens": 861538000.0, + "step": 22579 + }, + { + "epoch": 2.8724080905737184, + "ewc_loss": 0.05144140496850014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5720702979015186e-05, + "grad_norm": 31.856891632080078, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.874832272529602, + "num_tokens": 861578052.0, + "step": 22580 + }, + { + "epoch": 2.872535300852309, + "ewc_loss": 0.05137012153863907, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5685059881652705e-05, + "grad_norm": 32.02318572998047, + "learning_rate": 1e-06, + "loss": 0.4295, + "mean_token_accuracy": 0.8691936135292053, + "num_tokens": 861619065.0, + "step": 22581 + }, + { + "epoch": 2.8726625111308994, + "ewc_loss": 0.05153709650039673, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5768547857296653e-05, + "grad_norm": 31.991615295410156, + "learning_rate": 1e-06, + "loss": 0.4136, + 
"mean_token_accuracy": 0.8763532638549805, + "num_tokens": 861660112.0, + "step": 22582 + }, + { + "epoch": 2.87278972140949, + "ewc_loss": 0.051395442336797714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5697721866890788e-05, + "grad_norm": 31.925073623657227, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8745684623718262, + "num_tokens": 861699312.0, + "step": 22583 + }, + { + "epoch": 2.8729169316880805, + "ewc_loss": 0.05145927891135216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5729639673954807e-05, + "grad_norm": 31.971839904785156, + "learning_rate": 1e-06, + "loss": 0.3622, + "mean_token_accuracy": 0.8895753622055054, + "num_tokens": 861735167.0, + "step": 22584 + }, + { + "epoch": 2.873044141966671, + "ewc_loss": 0.051467638462781906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5733819711604156e-05, + "grad_norm": 32.00185012817383, + "learning_rate": 1e-06, + "loss": 0.4385, + "mean_token_accuracy": 0.8665666580200195, + "num_tokens": 861778594.0, + "step": 22585 + }, + { + "epoch": 2.8731713522452615, + "ewc_loss": 0.05142786353826523, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5713930881465785e-05, + "grad_norm": 32.06181335449219, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8724997639656067, + "num_tokens": 861814754.0, + "step": 22586 + }, + { + "epoch": 2.873298562523852, + "ewc_loss": 0.05145569518208504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5727847969392315e-05, + "grad_norm": 31.879804611206055, + "learning_rate": 1e-06, + "loss": 0.4537, + "mean_token_accuracy": 0.8579705953598022, + "num_tokens": 861849019.0, + "step": 22587 + }, + { + "epoch": 2.8734257728024426, + "ewc_loss": 0.05142396688461304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5711982743814588e-05, + "grad_norm": 32.077938079833984, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8700591921806335, + "num_tokens": 861889471.0, + "step": 22588 + }, + { + "epoch": 
2.8735529830810327, + "ewc_loss": 0.05149354785680771, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574677455413621e-05, + "grad_norm": 32.06060028076172, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8639982342720032, + "num_tokens": 861926811.0, + "step": 22589 + }, + { + "epoch": 2.8736801933596237, + "ewc_loss": 0.05142372474074364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5711862690513954e-05, + "grad_norm": 31.968568801879883, + "learning_rate": 1e-06, + "loss": 0.446, + "mean_token_accuracy": 0.8709126710891724, + "num_tokens": 861963628.0, + "step": 22590 + }, + { + "epoch": 2.8738074036382137, + "ewc_loss": 0.051343269646167755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5671635739854537e-05, + "grad_norm": 32.00083923339844, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8598846197128296, + "num_tokens": 862002915.0, + "step": 22591 + }, + { + "epoch": 2.8739346139168047, + "ewc_loss": 0.05139733850955963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5698669560370035e-05, + "grad_norm": 32.00079345703125, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8794038891792297, + "num_tokens": 862047276.0, + "step": 22592 + }, + { + "epoch": 2.874061824195395, + "ewc_loss": 0.05131292715668678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.565646354923956e-05, + "grad_norm": 31.952863693237305, + "learning_rate": 1e-06, + "loss": 0.3924, + "mean_token_accuracy": 0.8812910318374634, + "num_tokens": 862091241.0, + "step": 22593 + }, + { + "epoch": 2.8741890344739858, + "ewc_loss": 0.051489003002643585, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5744500817381777e-05, + "grad_norm": 32.11383056640625, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8760875463485718, + "num_tokens": 862126025.0, + "step": 22594 + }, + { + "epoch": 2.874316244752576, + "ewc_loss": 0.05141120404005051, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.570560172898695e-05, + "grad_norm": 32.04650115966797, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8637701869010925, + "num_tokens": 862160822.0, + "step": 22595 + }, + { + "epoch": 2.8744434550311664, + "ewc_loss": 0.051317453384399414, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5658726372057572e-05, + "grad_norm": 31.831100463867188, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8777362704277039, + "num_tokens": 862196808.0, + "step": 22596 + }, + { + "epoch": 2.874570665309757, + "ewc_loss": 0.05140395835042, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5701978302095085e-05, + "grad_norm": 32.11703109741211, + "learning_rate": 1e-06, + "loss": 0.4675, + "mean_token_accuracy": 0.8586657643318176, + "num_tokens": 862233722.0, + "step": 22597 + }, + { + "epoch": 2.8746978755883474, + "ewc_loss": 0.05145155265927315, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5725776140461676e-05, + "grad_norm": 31.81594467163086, + "learning_rate": 1e-06, + "loss": 0.3677, + "mean_token_accuracy": 0.8880176544189453, + "num_tokens": 862271914.0, + "step": 22598 + }, + { + "epoch": 2.874825085866938, + "ewc_loss": 0.05131639167666435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5658195227151737e-05, + "grad_norm": 31.891738891601562, + "learning_rate": 1e-06, + "loss": 0.4298, + "mean_token_accuracy": 0.8694627285003662, + "num_tokens": 862313696.0, + "step": 22599 + }, + { + "epoch": 2.8749522961455285, + "ewc_loss": 0.05152999982237816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576500082795974e-05, + "grad_norm": 32.038944244384766, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8662983179092407, + "num_tokens": 862355366.0, + "step": 22600 + }, + { + "epoch": 2.875079506424119, + "ewc_loss": 0.05146366357803345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.573183155618608e-05, + "grad_norm": 31.948192596435547, + "learning_rate": 1e-06, + "loss": 0.4441, + 
"mean_token_accuracy": 0.8639439344406128, + "num_tokens": 862395630.0, + "step": 22601 + }, + { + "epoch": 2.8752067167027096, + "ewc_loss": 0.051475271582603455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5737635951372795e-05, + "grad_norm": 31.99247169494629, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8742511868476868, + "num_tokens": 862429650.0, + "step": 22602 + }, + { + "epoch": 2.8753339269813, + "ewc_loss": 0.051412682980298996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5706342057674192e-05, + "grad_norm": 32.07484817504883, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.86980140209198, + "num_tokens": 862470921.0, + "step": 22603 + }, + { + "epoch": 2.8754611372598906, + "ewc_loss": 0.05149751901626587, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5748759071575478e-05, + "grad_norm": 32.15171813964844, + "learning_rate": 1e-06, + "loss": 0.3901, + "mean_token_accuracy": 0.8781711459159851, + "num_tokens": 862505204.0, + "step": 22604 + }, + { + "epoch": 2.875588347538481, + "ewc_loss": 0.05140579119324684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5702895072754472e-05, + "grad_norm": 31.836849212646484, + "learning_rate": 1e-06, + "loss": 0.4325, + "mean_token_accuracy": 0.8722262978553772, + "num_tokens": 862548686.0, + "step": 22605 + }, + { + "epoch": 2.8757155578170717, + "ewc_loss": 0.05134783312678337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5673916752566583e-05, + "grad_norm": 31.934480667114258, + "learning_rate": 1e-06, + "loss": 0.3891, + "mean_token_accuracy": 0.885198712348938, + "num_tokens": 862585920.0, + "step": 22606 + }, + { + "epoch": 2.875842768095662, + "ewc_loss": 0.0515344962477684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5767247279873118e-05, + "grad_norm": 32.14516067504883, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8665096759796143, + "num_tokens": 862619364.0, + "step": 22607 + }, + { + "epoch": 
2.8759699783742527, + "ewc_loss": 0.0513470284640789, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.56735147559084e-05, + "grad_norm": 31.884740829467773, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.868759036064148, + "num_tokens": 862661317.0, + "step": 22608 + }, + { + "epoch": 2.8760971886528433, + "ewc_loss": 0.051389776170253754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5694887881400064e-05, + "grad_norm": 32.020111083984375, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8765530586242676, + "num_tokens": 862698236.0, + "step": 22609 + }, + { + "epoch": 2.876224398931434, + "ewc_loss": 0.05147740617394447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5738703698152676e-05, + "grad_norm": 31.915863037109375, + "learning_rate": 1e-06, + "loss": 0.379, + "mean_token_accuracy": 0.8834505677223206, + "num_tokens": 862739224.0, + "step": 22610 + }, + { + "epoch": 2.8763516092100243, + "ewc_loss": 0.05136445537209511, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5682227715151384e-05, + "grad_norm": 31.99015998840332, + "learning_rate": 1e-06, + "loss": 0.3807, + "mean_token_accuracy": 0.8871352672576904, + "num_tokens": 862776534.0, + "step": 22611 + }, + { + "epoch": 2.876478819488615, + "ewc_loss": 0.05143754929304123, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5718774850247428e-05, + "grad_norm": 31.996435165405273, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8654782772064209, + "num_tokens": 862815690.0, + "step": 22612 + }, + { + "epoch": 2.8766060297672054, + "ewc_loss": 0.05139917880296707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.569958996900823e-05, + "grad_norm": 31.936561584472656, + "learning_rate": 1e-06, + "loss": 0.414, + "mean_token_accuracy": 0.8722884654998779, + "num_tokens": 862851021.0, + "step": 22613 + }, + { + "epoch": 2.8767332400457954, + "ewc_loss": 0.05143038555979729, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5715193260111846e-05, + "grad_norm": 32.05021286010742, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8807418346405029, + "num_tokens": 862885122.0, + "step": 22614 + }, + { + "epoch": 2.8768604503243864, + "ewc_loss": 0.051421187818050385, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.571059303591028e-05, + "grad_norm": 31.852148056030273, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8784355521202087, + "num_tokens": 862922233.0, + "step": 22615 + }, + { + "epoch": 2.8769876606029765, + "ewc_loss": 0.05133247375488281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5666236979304813e-05, + "grad_norm": 31.841901779174805, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8832811713218689, + "num_tokens": 862961355.0, + "step": 22616 + }, + { + "epoch": 2.8771148708815675, + "ewc_loss": 0.05149398744106293, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5746994651854038e-05, + "grad_norm": 31.855487823486328, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8811023831367493, + "num_tokens": 862998523.0, + "step": 22617 + }, + { + "epoch": 2.8772420811601576, + "ewc_loss": 0.05160999298095703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5804996766964905e-05, + "grad_norm": 32.07347869873047, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.873706579208374, + "num_tokens": 863039684.0, + "step": 22618 + }, + { + "epoch": 2.877369291438748, + "ewc_loss": 0.05156908184289932, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5784540412132628e-05, + "grad_norm": 31.959264755249023, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8772295713424683, + "num_tokens": 863077649.0, + "step": 22619 + }, + { + "epoch": 2.8774965017173386, + "ewc_loss": 0.051385942846536636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5692970666568726e-05, + "grad_norm": 31.867542266845703, + "learning_rate": 1e-06, + "loss": 0.4291, + 
"mean_token_accuracy": 0.867850661277771, + "num_tokens": 863116816.0, + "step": 22620 + }, + { + "epoch": 2.877623711995929, + "ewc_loss": 0.051445621997117996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5722811187733896e-05, + "grad_norm": 31.912145614624023, + "learning_rate": 1e-06, + "loss": 0.3848, + "mean_token_accuracy": 0.8833786845207214, + "num_tokens": 863155306.0, + "step": 22621 + }, + { + "epoch": 2.8777509222745197, + "ewc_loss": 0.05145183950662613, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5725919840624556e-05, + "grad_norm": 31.783933639526367, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8792740106582642, + "num_tokens": 863194381.0, + "step": 22622 + }, + { + "epoch": 2.87787813255311, + "ewc_loss": 0.05153116211295128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576558108557947e-05, + "grad_norm": 31.99021339416504, + "learning_rate": 1e-06, + "loss": 0.426, + "mean_token_accuracy": 0.8724402785301208, + "num_tokens": 863231470.0, + "step": 22623 + }, + { + "epoch": 2.8780053428317007, + "ewc_loss": 0.05162341147661209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5811705199885182e-05, + "grad_norm": 31.89887809753418, + "learning_rate": 1e-06, + "loss": 0.4424, + "mean_token_accuracy": 0.8664782047271729, + "num_tokens": 863265617.0, + "step": 22624 + }, + { + "epoch": 2.8781325531102913, + "ewc_loss": 0.05145828798413277, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.572914308984764e-05, + "grad_norm": 31.980833053588867, + "learning_rate": 1e-06, + "loss": 0.4473, + "mean_token_accuracy": 0.862921953201294, + "num_tokens": 863300111.0, + "step": 22625 + }, + { + "epoch": 2.878259763388882, + "ewc_loss": 0.05164092406630516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5820461814873852e-05, + "grad_norm": 31.890432357788086, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.8806626796722412, + "num_tokens": 863338626.0, + "step": 22626 + }, + { + "epoch": 
2.8783869736674723, + "ewc_loss": 0.05148019269108772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5740097044035792e-05, + "grad_norm": 32.03306579589844, + "learning_rate": 1e-06, + "loss": 0.4521, + "mean_token_accuracy": 0.8649694919586182, + "num_tokens": 863374672.0, + "step": 22627 + }, + { + "epoch": 2.878514183946063, + "ewc_loss": 0.05173983797430992, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586991831776686e-05, + "grad_norm": 31.991191864013672, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8687658309936523, + "num_tokens": 863412717.0, + "step": 22628 + }, + { + "epoch": 2.8786413942246534, + "ewc_loss": 0.05147856846451759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5739283955772407e-05, + "grad_norm": 31.967971801757812, + "learning_rate": 1e-06, + "loss": 0.3922, + "mean_token_accuracy": 0.8812509179115295, + "num_tokens": 863447776.0, + "step": 22629 + }, + { + "epoch": 2.878768604503244, + "ewc_loss": 0.051603324711322784, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5801662559388205e-05, + "grad_norm": 31.917766571044922, + "learning_rate": 1e-06, + "loss": 0.3658, + "mean_token_accuracy": 0.8903413414955139, + "num_tokens": 863485697.0, + "step": 22630 + }, + { + "epoch": 2.8788958147818344, + "ewc_loss": 0.05160176753997803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5800884031923488e-05, + "grad_norm": 32.10439682006836, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8669531345367432, + "num_tokens": 863528706.0, + "step": 22631 + }, + { + "epoch": 2.879023025060425, + "ewc_loss": 0.051679495722055435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.583974855951965e-05, + "grad_norm": 31.945369720458984, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8547074198722839, + "num_tokens": 863569181.0, + "step": 22632 + }, + { + "epoch": 2.8791502353390155, + "ewc_loss": 0.05150365084409714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5751825887709856e-05, + "grad_norm": 31.92050552368164, + "learning_rate": 1e-06, + "loss": 0.4579, + "mean_token_accuracy": 0.8578196167945862, + "num_tokens": 863602985.0, + "step": 22633 + }, + { + "epoch": 2.879277445617606, + "ewc_loss": 0.051532648503780365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576632505224552e-05, + "grad_norm": 31.971773147583008, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8781147003173828, + "num_tokens": 863634590.0, + "step": 22634 + }, + { + "epoch": 2.8794046558961965, + "ewc_loss": 0.05151977390050888, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575988764874637e-05, + "grad_norm": 31.76441192626953, + "learning_rate": 1e-06, + "loss": 0.4329, + "mean_token_accuracy": 0.8712749481201172, + "num_tokens": 863677671.0, + "step": 22635 + }, + { + "epoch": 2.879531866174787, + "ewc_loss": 0.05155593901872635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.577797022240702e-05, + "grad_norm": 31.951946258544922, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8754434585571289, + "num_tokens": 863716102.0, + "step": 22636 + }, + { + "epoch": 2.8796590764533776, + "ewc_loss": 0.0517098642885685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5854931664071046e-05, + "grad_norm": 32.051063537597656, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8646719455718994, + "num_tokens": 863751560.0, + "step": 22637 + }, + { + "epoch": 2.879786286731968, + "ewc_loss": 0.05154326558113098, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5771632863325067e-05, + "grad_norm": 31.904800415039062, + "learning_rate": 1e-06, + "loss": 0.361, + "mean_token_accuracy": 0.8885247111320496, + "num_tokens": 863792373.0, + "step": 22638 + }, + { + "epoch": 2.879913497010558, + "ewc_loss": 0.05153216794133186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576608312665485e-05, + "grad_norm": 31.842370986938477, + "learning_rate": 1e-06, + "loss": 0.45, + 
"mean_token_accuracy": 0.8691638112068176, + "num_tokens": 863829969.0, + "step": 22639 + }, + { + "epoch": 2.880040707289149, + "ewc_loss": 0.05170590057969093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5852950784610584e-05, + "grad_norm": 32.051116943359375, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8792365789413452, + "num_tokens": 863867080.0, + "step": 22640 + }, + { + "epoch": 2.8801679175677393, + "ewc_loss": 0.05162681266665459, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5813405954977497e-05, + "grad_norm": 31.971057891845703, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8773096799850464, + "num_tokens": 863904230.0, + "step": 22641 + }, + { + "epoch": 2.8802951278463302, + "ewc_loss": 0.05160098150372505, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5800491130212322e-05, + "grad_norm": 31.985254287719727, + "learning_rate": 1e-06, + "loss": 0.4065, + "mean_token_accuracy": 0.8777378797531128, + "num_tokens": 863947309.0, + "step": 22642 + }, + { + "epoch": 2.8804223381249203, + "ewc_loss": 0.05154496431350708, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5772482331376523e-05, + "grad_norm": 31.97003746032715, + "learning_rate": 1e-06, + "loss": 0.3992, + "mean_token_accuracy": 0.8791281580924988, + "num_tokens": 863985557.0, + "step": 22643 + }, + { + "epoch": 2.880549548403511, + "ewc_loss": 0.05152931436896324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576465703896247e-05, + "grad_norm": 31.95046615600586, + "learning_rate": 1e-06, + "loss": 0.4505, + "mean_token_accuracy": 0.8610671758651733, + "num_tokens": 864021842.0, + "step": 22644 + }, + { + "epoch": 2.8806767586821014, + "ewc_loss": 0.05155801400542259, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5779007046367042e-05, + "grad_norm": 32.03923416137695, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.8645696043968201, + "num_tokens": 864060765.0, + "step": 22645 + }, + { + "epoch": 
2.880803968960692, + "ewc_loss": 0.051603879779577255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5801939045777544e-05, + "grad_norm": 32.081111907958984, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.8759180307388306, + "num_tokens": 864100624.0, + "step": 22646 + }, + { + "epoch": 2.8809311792392824, + "ewc_loss": 0.051461659371852875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5730829293024726e-05, + "grad_norm": 31.86452865600586, + "learning_rate": 1e-06, + "loss": 0.3818, + "mean_token_accuracy": 0.8835641145706177, + "num_tokens": 864136599.0, + "step": 22647 + }, + { + "epoch": 2.881058389517873, + "ewc_loss": 0.05141640082001686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5708201064844616e-05, + "grad_norm": 32.004154205322266, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8771960735321045, + "num_tokens": 864169876.0, + "step": 22648 + }, + { + "epoch": 2.8811855997964635, + "ewc_loss": 0.051650553941726685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5825276679825038e-05, + "grad_norm": 32.15137481689453, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8720926642417908, + "num_tokens": 864213709.0, + "step": 22649 + }, + { + "epoch": 2.881312810075054, + "ewc_loss": 0.051429640501737595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.571482036728412e-05, + "grad_norm": 32.2119255065918, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8784038424491882, + "num_tokens": 864245581.0, + "step": 22650 + }, + { + "epoch": 2.8814400203536445, + "ewc_loss": 0.05153675377368927, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576837687229272e-05, + "grad_norm": 31.965633392333984, + "learning_rate": 1e-06, + "loss": 0.4678, + "mean_token_accuracy": 0.8587449193000793, + "num_tokens": 864287081.0, + "step": 22651 + }, + { + "epoch": 2.881567230632235, + "ewc_loss": 0.05133785307407379, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5668927264632657e-05, + "grad_norm": 32.106117248535156, + "learning_rate": 1e-06, + "loss": 0.4614, + "mean_token_accuracy": 0.8572102785110474, + "num_tokens": 864321480.0, + "step": 22652 + }, + { + "epoch": 2.8816944409108256, + "ewc_loss": 0.05154158174991608, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5770790671231225e-05, + "grad_norm": 32.33041000366211, + "learning_rate": 1e-06, + "loss": 0.3996, + "mean_token_accuracy": 0.880090594291687, + "num_tokens": 864358580.0, + "step": 22653 + }, + { + "epoch": 2.881821651189416, + "ewc_loss": 0.05139821022748947, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5699104298837483e-05, + "grad_norm": 31.8560848236084, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.8639581799507141, + "num_tokens": 864396277.0, + "step": 22654 + }, + { + "epoch": 2.8819488614680067, + "ewc_loss": 0.05135972425341606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.567986120993737e-05, + "grad_norm": 32.14614486694336, + "learning_rate": 1e-06, + "loss": 0.4337, + "mean_token_accuracy": 0.8688740134239197, + "num_tokens": 864433282.0, + "step": 22655 + }, + { + "epoch": 2.882076071746597, + "ewc_loss": 0.05153403431177139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5767016268218867e-05, + "grad_norm": 32.23745346069336, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8653882741928101, + "num_tokens": 864469305.0, + "step": 22656 + }, + { + "epoch": 2.8822032820251877, + "ewc_loss": 0.051363181322813034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5681591068860143e-05, + "grad_norm": 32.004432678222656, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.874805212020874, + "num_tokens": 864507246.0, + "step": 22657 + }, + { + "epoch": 2.8823304923037782, + "ewc_loss": 0.05136789008975029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5683944841148332e-05, + "grad_norm": 32.06681442260742, + "learning_rate": 1e-06, + "loss": 0.4315, + 
"mean_token_accuracy": 0.8671768307685852, + "num_tokens": 864547315.0, + "step": 22658 + }, + { + "epoch": 2.8824577025823688, + "ewc_loss": 0.05148310214281082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5741550416569225e-05, + "grad_norm": 32.122222900390625, + "learning_rate": 1e-06, + "loss": 0.4467, + "mean_token_accuracy": 0.8663917183876038, + "num_tokens": 864590813.0, + "step": 22659 + }, + { + "epoch": 2.8825849128609593, + "ewc_loss": 0.05149596929550171, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5747984182089567e-05, + "grad_norm": 32.10833740234375, + "learning_rate": 1e-06, + "loss": 0.4451, + "mean_token_accuracy": 0.864007294178009, + "num_tokens": 864625078.0, + "step": 22660 + }, + { + "epoch": 2.88271212313955, + "ewc_loss": 0.05139783024787903, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5698915123939514e-05, + "grad_norm": 31.91006851196289, + "learning_rate": 1e-06, + "loss": 0.3917, + "mean_token_accuracy": 0.8797268867492676, + "num_tokens": 864663714.0, + "step": 22661 + }, + { + "epoch": 2.88283933341814, + "ewc_loss": 0.051406554877758026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5703277060529217e-05, + "grad_norm": 32.157352447509766, + "learning_rate": 1e-06, + "loss": 0.4275, + "mean_token_accuracy": 0.8718882203102112, + "num_tokens": 864697026.0, + "step": 22662 + }, + { + "epoch": 2.882966543696731, + "ewc_loss": 0.05159320682287216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5796603949856944e-05, + "grad_norm": 32.1412467956543, + "learning_rate": 1e-06, + "loss": 0.3929, + "mean_token_accuracy": 0.8829464912414551, + "num_tokens": 864732337.0, + "step": 22663 + }, + { + "epoch": 2.883093753975321, + "ewc_loss": 0.05145398527383804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5726993044372648e-05, + "grad_norm": 32.088260650634766, + "learning_rate": 1e-06, + "loss": 0.4376, + "mean_token_accuracy": 0.8660319447517395, + "num_tokens": 864776598.0, + "step": 22664 + }, + { + "epoch": 
2.883220964253912, + "ewc_loss": 0.05140489339828491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57024476013612e-05, + "grad_norm": 32.19489669799805, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8595207333564758, + "num_tokens": 864817279.0, + "step": 22665 + }, + { + "epoch": 2.883348174532502, + "ewc_loss": 0.05144564062356949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5722820282680914e-05, + "grad_norm": 32.02708435058594, + "learning_rate": 1e-06, + "loss": 0.396, + "mean_token_accuracy": 0.8797191381454468, + "num_tokens": 864854914.0, + "step": 22666 + }, + { + "epoch": 2.883475384811093, + "ewc_loss": 0.05134729668498039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.567364754213486e-05, + "grad_norm": 31.943927764892578, + "learning_rate": 1e-06, + "loss": 0.413, + "mean_token_accuracy": 0.8756166696548462, + "num_tokens": 864892914.0, + "step": 22667 + }, + { + "epoch": 2.883602595089683, + "ewc_loss": 0.05141347274184227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570673677837476e-05, + "grad_norm": 32.10857391357422, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8696867227554321, + "num_tokens": 864934189.0, + "step": 22668 + }, + { + "epoch": 2.8837298053682736, + "ewc_loss": 0.05146963521838188, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5734818336786702e-05, + "grad_norm": 31.931543350219727, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8818198442459106, + "num_tokens": 864972085.0, + "step": 22669 + }, + { + "epoch": 2.883857015646864, + "ewc_loss": 0.05143677443265915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5718387405504473e-05, + "grad_norm": 31.8353328704834, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8573192358016968, + "num_tokens": 865008674.0, + "step": 22670 + }, + { + "epoch": 2.8839842259254547, + "ewc_loss": 0.051499541848897934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574977042968385e-05, 
+ "grad_norm": 31.978891372680664, + "learning_rate": 1e-06, + "loss": 0.4394, + "mean_token_accuracy": 0.8633415699005127, + "num_tokens": 865046004.0, + "step": 22671 + }, + { + "epoch": 2.884111436204045, + "ewc_loss": 0.051575906574726105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578795283625368e-05, + "grad_norm": 31.84136390686035, + "learning_rate": 1e-06, + "loss": 0.3977, + "mean_token_accuracy": 0.8770118355751038, + "num_tokens": 865082409.0, + "step": 22672 + }, + { + "epoch": 2.8842386464826357, + "ewc_loss": 0.05148526281118393, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5742630896274932e-05, + "grad_norm": 31.941438674926758, + "learning_rate": 1e-06, + "loss": 0.4177, + "mean_token_accuracy": 0.8724203109741211, + "num_tokens": 865124457.0, + "step": 22673 + }, + { + "epoch": 2.8843658567612263, + "ewc_loss": 0.05165239796042442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5826198907452635e-05, + "grad_norm": 31.968015670776367, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8851110935211182, + "num_tokens": 865162290.0, + "step": 22674 + }, + { + "epoch": 2.884493067039817, + "ewc_loss": 0.05151892080903053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5759460186236538e-05, + "grad_norm": 31.932016372680664, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8705285787582397, + "num_tokens": 865201686.0, + "step": 22675 + }, + { + "epoch": 2.8846202773184073, + "ewc_loss": 0.051516205072402954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5758103220141493e-05, + "grad_norm": 31.92572021484375, + "learning_rate": 1e-06, + "loss": 0.4458, + "mean_token_accuracy": 0.8636826276779175, + "num_tokens": 865236876.0, + "step": 22676 + }, + { + "epoch": 2.884747487596998, + "ewc_loss": 0.0515778623521328, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5788931452552788e-05, + "grad_norm": 31.999544143676758, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 
0.8765822649002075, + "num_tokens": 865276994.0, + "step": 22677 + }, + { + "epoch": 2.8848746978755884, + "ewc_loss": 0.05162232369184494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5811161322053522e-05, + "grad_norm": 31.999860763549805, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8725957870483398, + "num_tokens": 865311990.0, + "step": 22678 + }, + { + "epoch": 2.885001908154179, + "ewc_loss": 0.05154862254858017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5774310415727086e-05, + "grad_norm": 31.941192626953125, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.878203809261322, + "num_tokens": 865352324.0, + "step": 22679 + }, + { + "epoch": 2.8851291184327694, + "ewc_loss": 0.05150483548641205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575241705926601e-05, + "grad_norm": 31.970613479614258, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8723844289779663, + "num_tokens": 865391428.0, + "step": 22680 + }, + { + "epoch": 2.88525632871136, + "ewc_loss": 0.051514625549316406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575731195975095e-05, + "grad_norm": 32.004249572753906, + "learning_rate": 1e-06, + "loss": 0.4195, + "mean_token_accuracy": 0.8708755373954773, + "num_tokens": 865428478.0, + "step": 22681 + }, + { + "epoch": 2.8853835389899505, + "ewc_loss": 0.05160743370652199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580371619842481e-05, + "grad_norm": 31.925640106201172, + "learning_rate": 1e-06, + "loss": 0.4212, + "mean_token_accuracy": 0.8727587461471558, + "num_tokens": 865466531.0, + "step": 22682 + }, + { + "epoch": 2.885510749268541, + "ewc_loss": 0.05148642137646675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5743211153894663e-05, + "grad_norm": 32.06223678588867, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8792041540145874, + "num_tokens": 865503177.0, + "step": 22683 + }, + { + "epoch": 2.8856379595471315, + 
"ewc_loss": 0.05155507102608681, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5777535483939573e-05, + "grad_norm": 31.90904426574707, + "learning_rate": 1e-06, + "loss": 0.4412, + "mean_token_accuracy": 0.865736722946167, + "num_tokens": 865539454.0, + "step": 22684 + }, + { + "epoch": 2.885765169825722, + "ewc_loss": 0.05147762596607208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.573881283751689e-05, + "grad_norm": 32.104286193847656, + "learning_rate": 1e-06, + "loss": 0.3628, + "mean_token_accuracy": 0.8907346725463867, + "num_tokens": 865571937.0, + "step": 22685 + }, + { + "epoch": 2.8858923801043126, + "ewc_loss": 0.05163145810365677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5815728804445826e-05, + "grad_norm": 31.94464111328125, + "learning_rate": 1e-06, + "loss": 0.4058, + "mean_token_accuracy": 0.8746712803840637, + "num_tokens": 865609457.0, + "step": 22686 + }, + { + "epoch": 2.8860195903829027, + "ewc_loss": 0.0514647476375103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5732373615028337e-05, + "grad_norm": 32.21290588378906, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8594005107879639, + "num_tokens": 865647391.0, + "step": 22687 + }, + { + "epoch": 2.8861468006614936, + "ewc_loss": 0.05160999670624733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580499858595431e-05, + "grad_norm": 31.905517578125, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8714925646781921, + "num_tokens": 865680578.0, + "step": 22688 + }, + { + "epoch": 2.8862740109400837, + "ewc_loss": 0.051356200128793716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.567810042819474e-05, + "grad_norm": 32.0527229309082, + "learning_rate": 1e-06, + "loss": 0.4304, + "mean_token_accuracy": 0.8693546056747437, + "num_tokens": 865714610.0, + "step": 22689 + }, + { + "epoch": 2.8864012212186747, + "ewc_loss": 0.051683202385902405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5841602109721862e-05, + "grad_norm": 
32.151031494140625, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.868802011013031, + "num_tokens": 865755672.0, + "step": 22690 + }, + { + "epoch": 2.886528431497265, + "ewc_loss": 0.05144772678613663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5723862563609146e-05, + "grad_norm": 31.91209602355957, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8811379671096802, + "num_tokens": 865790111.0, + "step": 22691 + }, + { + "epoch": 2.8866556417758558, + "ewc_loss": 0.05148201063275337, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5741004719748162e-05, + "grad_norm": 32.00138854980469, + "learning_rate": 1e-06, + "loss": 0.4194, + "mean_token_accuracy": 0.8725582957267761, + "num_tokens": 865825580.0, + "step": 22692 + }, + { + "epoch": 2.886782852054446, + "ewc_loss": 0.05154537409543991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5772687877179123e-05, + "grad_norm": 32.03494644165039, + "learning_rate": 1e-06, + "loss": 0.4005, + "mean_token_accuracy": 0.8756974935531616, + "num_tokens": 865869099.0, + "step": 22693 + }, + { + "epoch": 2.8869100623330364, + "ewc_loss": 0.05158557742834091, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.579278952907771e-05, + "grad_norm": 31.90531349182129, + "learning_rate": 1e-06, + "loss": 0.4611, + "mean_token_accuracy": 0.8632832765579224, + "num_tokens": 865906957.0, + "step": 22694 + }, + { + "epoch": 2.887037272611627, + "ewc_loss": 0.05158000811934471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.579000465630088e-05, + "grad_norm": 31.952157974243164, + "learning_rate": 1e-06, + "loss": 0.3849, + "mean_token_accuracy": 0.8822312355041504, + "num_tokens": 865945712.0, + "step": 22695 + }, + { + "epoch": 2.8871644828902174, + "ewc_loss": 0.05156302824616432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5781513613765128e-05, + "grad_norm": 31.962736129760742, + "learning_rate": 1e-06, + "loss": 0.4647, + "mean_token_accuracy": 0.8583055734634399, + 
"num_tokens": 865984469.0, + "step": 22696 + }, + { + "epoch": 2.887291693168808, + "ewc_loss": 0.05162395164370537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581197622930631e-05, + "grad_norm": 32.08340072631836, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8728880882263184, + "num_tokens": 866024624.0, + "step": 22697 + }, + { + "epoch": 2.8874189034473985, + "ewc_loss": 0.051621001213788986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5810501028900035e-05, + "grad_norm": 31.98736000061035, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8689876794815063, + "num_tokens": 866062446.0, + "step": 22698 + }, + { + "epoch": 2.887546113725989, + "ewc_loss": 0.05156753584742546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578376734163612e-05, + "grad_norm": 32.08859634399414, + "learning_rate": 1e-06, + "loss": 0.3842, + "mean_token_accuracy": 0.8825451135635376, + "num_tokens": 866100896.0, + "step": 22699 + }, + { + "epoch": 2.8876733240045795, + "ewc_loss": 0.051567308604717255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57836545642931e-05, + "grad_norm": 31.875062942504883, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8585659265518188, + "num_tokens": 866134668.0, + "step": 22700 + }, + { + "epoch": 2.88780053428317, + "ewc_loss": 0.05151915177702904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5759576601558365e-05, + "grad_norm": 32.16190719604492, + "learning_rate": 1e-06, + "loss": 0.4494, + "mean_token_accuracy": 0.8655455708503723, + "num_tokens": 866173834.0, + "step": 22701 + }, + { + "epoch": 2.8879277445617606, + "ewc_loss": 0.0516398586332798, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5819928850978613e-05, + "grad_norm": 32.03046417236328, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8564414978027344, + "num_tokens": 866211691.0, + "step": 22702 + }, + { + "epoch": 2.888054954840351, + "ewc_loss": 0.05139744281768799, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5698720492073335e-05, + "grad_norm": 31.92384910583496, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8728665113449097, + "num_tokens": 866240878.0, + "step": 22703 + }, + { + "epoch": 2.8881821651189417, + "ewc_loss": 0.05158713087439537, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5793566237553023e-05, + "grad_norm": 31.96315574645996, + "learning_rate": 1e-06, + "loss": 0.4535, + "mean_token_accuracy": 0.8588927388191223, + "num_tokens": 866289552.0, + "step": 22704 + }, + { + "epoch": 2.888309375397532, + "ewc_loss": 0.05159657076001167, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.579828469606582e-05, + "grad_norm": 32.04279327392578, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8630516529083252, + "num_tokens": 866330361.0, + "step": 22705 + }, + { + "epoch": 2.8884365856761227, + "ewc_loss": 0.0515756756067276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5787838239921257e-05, + "grad_norm": 32.01055908203125, + "learning_rate": 1e-06, + "loss": 0.4133, + "mean_token_accuracy": 0.8758369088172913, + "num_tokens": 866371557.0, + "step": 22706 + }, + { + "epoch": 2.8885637959547132, + "ewc_loss": 0.05150064453482628, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5750321583473124e-05, + "grad_norm": 31.990564346313477, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8729615211486816, + "num_tokens": 866410627.0, + "step": 22707 + }, + { + "epoch": 2.8886910062333038, + "ewc_loss": 0.051560673862695694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5780336727621034e-05, + "grad_norm": 32.15610885620117, + "learning_rate": 1e-06, + "loss": 0.4178, + "mean_token_accuracy": 0.8699135184288025, + "num_tokens": 866446036.0, + "step": 22708 + }, + { + "epoch": 2.8888182165118943, + "ewc_loss": 0.051494255661964417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5747127438080497e-05, + "grad_norm": 31.901935577392578, + 
"learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8619846105575562, + "num_tokens": 866489655.0, + "step": 22709 + }, + { + "epoch": 2.888945426790485, + "ewc_loss": 0.05145091935992241, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.572545963630546e-05, + "grad_norm": 32.125953674316406, + "learning_rate": 1e-06, + "loss": 0.4438, + "mean_token_accuracy": 0.8647312521934509, + "num_tokens": 866532750.0, + "step": 22710 + }, + { + "epoch": 2.8890726370690754, + "ewc_loss": 0.05156043916940689, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5780220312299207e-05, + "grad_norm": 32.06241989135742, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8786084055900574, + "num_tokens": 866570748.0, + "step": 22711 + }, + { + "epoch": 2.8891998473476654, + "ewc_loss": 0.05140369012951851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5701845515868627e-05, + "grad_norm": 31.979036331176758, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8670463562011719, + "num_tokens": 866609422.0, + "step": 22712 + }, + { + "epoch": 2.8893270576262564, + "ewc_loss": 0.05146709457039833, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5733546863193624e-05, + "grad_norm": 32.12598419189453, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8745306730270386, + "num_tokens": 866650865.0, + "step": 22713 + }, + { + "epoch": 2.8894542679048465, + "ewc_loss": 0.05139116197824478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5695580916362815e-05, + "grad_norm": 31.977062225341797, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8700767755508423, + "num_tokens": 866687605.0, + "step": 22714 + }, + { + "epoch": 2.8895814781834375, + "ewc_loss": 0.05136515572667122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5682576961116865e-05, + "grad_norm": 32.04648971557617, + "learning_rate": 1e-06, + "loss": 0.4416, + "mean_token_accuracy": 0.8623791337013245, + "num_tokens": 
866722974.0, + "step": 22715 + }, + { + "epoch": 2.8897086884620276, + "ewc_loss": 0.05143634229898453, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.571817094576545e-05, + "grad_norm": 31.970273971557617, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8692547678947449, + "num_tokens": 866760694.0, + "step": 22716 + }, + { + "epoch": 2.889835898740618, + "ewc_loss": 0.05143841356039047, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5719205950736068e-05, + "grad_norm": 31.980825424194336, + "learning_rate": 1e-06, + "loss": 0.4649, + "mean_token_accuracy": 0.8543887138366699, + "num_tokens": 866795102.0, + "step": 22717 + }, + { + "epoch": 2.8899631090192086, + "ewc_loss": 0.051572173833847046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5786086553125642e-05, + "grad_norm": 32.087066650390625, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8860154151916504, + "num_tokens": 866831282.0, + "step": 22718 + }, + { + "epoch": 2.890090319297799, + "ewc_loss": 0.05149182677268982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5745914172148332e-05, + "grad_norm": 32.0987663269043, + "learning_rate": 1e-06, + "loss": 0.4426, + "mean_token_accuracy": 0.8641153573989868, + "num_tokens": 866872661.0, + "step": 22719 + }, + { + "epoch": 2.8902175295763897, + "ewc_loss": 0.05145568773150444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5727844331413507e-05, + "grad_norm": 31.897470474243164, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8622402548789978, + "num_tokens": 866917576.0, + "step": 22720 + }, + { + "epoch": 2.89034473985498, + "ewc_loss": 0.051449261605739594, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5724630177137442e-05, + "grad_norm": 32.12225341796875, + "learning_rate": 1e-06, + "loss": 0.4112, + "mean_token_accuracy": 0.8761370182037354, + "num_tokens": 866957245.0, + "step": 22721 + }, + { + "epoch": 2.8904719501335707, + "ewc_loss": 0.0515034981071949, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5751749490154907e-05, + "grad_norm": 31.881792068481445, + "learning_rate": 1e-06, + "loss": 0.4972, + "mean_token_accuracy": 0.8435617089271545, + "num_tokens": 866998695.0, + "step": 22722 + }, + { + "epoch": 2.8905991604121613, + "ewc_loss": 0.05140719190239906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5703595383674838e-05, + "grad_norm": 32.15174102783203, + "learning_rate": 1e-06, + "loss": 0.4608, + "mean_token_accuracy": 0.8586422801017761, + "num_tokens": 867033502.0, + "step": 22723 + }, + { + "epoch": 2.890726370690752, + "ewc_loss": 0.05157863721251488, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5789318897295743e-05, + "grad_norm": 31.95868492126465, + "learning_rate": 1e-06, + "loss": 0.4689, + "mean_token_accuracy": 0.8552486896514893, + "num_tokens": 867076032.0, + "step": 22724 + }, + { + "epoch": 2.8908535809693423, + "ewc_loss": 0.051454056054353714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5727027605171315e-05, + "grad_norm": 32.07503128051758, + "learning_rate": 1e-06, + "loss": 0.3913, + "mean_token_accuracy": 0.8725460767745972, + "num_tokens": 867107429.0, + "step": 22725 + }, + { + "epoch": 2.890980791247933, + "ewc_loss": 0.051546476781368256, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5773239030968398e-05, + "grad_norm": 31.92300033569336, + "learning_rate": 1e-06, + "loss": 0.389, + "mean_token_accuracy": 0.8737388849258423, + "num_tokens": 867139432.0, + "step": 22726 + }, + { + "epoch": 2.8911080015265234, + "ewc_loss": 0.051505472511053085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575273538241163e-05, + "grad_norm": 32.24791717529297, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8685752153396606, + "num_tokens": 867180588.0, + "step": 22727 + }, + { + "epoch": 2.891235211805114, + "ewc_loss": 0.051605261862277985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5802630261750892e-05, + "grad_norm": 31.929458618164062, + 
"learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8693327903747559, + "num_tokens": 867215170.0, + "step": 22728 + }, + { + "epoch": 2.8913624220837044, + "ewc_loss": 0.05144321918487549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5721608835738152e-05, + "grad_norm": 32.14313888549805, + "learning_rate": 1e-06, + "loss": 0.3756, + "mean_token_accuracy": 0.8869245052337646, + "num_tokens": 867254958.0, + "step": 22729 + }, + { + "epoch": 2.891489632362295, + "ewc_loss": 0.05161294713616371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5806473786360584e-05, + "grad_norm": 32.10744857788086, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8704046010971069, + "num_tokens": 867291135.0, + "step": 22730 + }, + { + "epoch": 2.8916168426408855, + "ewc_loss": 0.05146292224526405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5731460482347757e-05, + "grad_norm": 32.00432205200195, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8792685270309448, + "num_tokens": 867326926.0, + "step": 22731 + }, + { + "epoch": 2.891744052919476, + "ewc_loss": 0.051501210778951645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5750605345820077e-05, + "grad_norm": 32.111141204833984, + "learning_rate": 1e-06, + "loss": 0.4415, + "mean_token_accuracy": 0.8632380962371826, + "num_tokens": 867361142.0, + "step": 22732 + }, + { + "epoch": 2.8918712631980665, + "ewc_loss": 0.051436733454465866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5718367396621034e-05, + "grad_norm": 31.985658645629883, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8897345066070557, + "num_tokens": 867398344.0, + "step": 22733 + }, + { + "epoch": 2.891998473476657, + "ewc_loss": 0.05155915021896362, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5779574571060948e-05, + "grad_norm": 32.0501708984375, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.880939245223999, + "num_tokens": 867437642.0, + 
"step": 22734 + }, + { + "epoch": 2.8921256837552476, + "ewc_loss": 0.05152083933353424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5760418793652207e-05, + "grad_norm": 32.033912658691406, + "learning_rate": 1e-06, + "loss": 0.4544, + "mean_token_accuracy": 0.8617956638336182, + "num_tokens": 867471646.0, + "step": 22735 + }, + { + "epoch": 2.892252894033838, + "ewc_loss": 0.051512040197849274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5756020477274433e-05, + "grad_norm": 31.986495971679688, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8726712465286255, + "num_tokens": 867509682.0, + "step": 22736 + }, + { + "epoch": 2.892380104312428, + "ewc_loss": 0.05152760446071625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5763802113942802e-05, + "grad_norm": 31.949796676635742, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8680609464645386, + "num_tokens": 867544425.0, + "step": 22737 + }, + { + "epoch": 2.892507314591019, + "ewc_loss": 0.05154179781675339, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5770897991606034e-05, + "grad_norm": 32.004276275634766, + "learning_rate": 1e-06, + "loss": 0.4384, + "mean_token_accuracy": 0.8652298450469971, + "num_tokens": 867584978.0, + "step": 22738 + }, + { + "epoch": 2.8926345248696093, + "ewc_loss": 0.051494479179382324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5747240215423517e-05, + "grad_norm": 31.85272789001465, + "learning_rate": 1e-06, + "loss": 0.4176, + "mean_token_accuracy": 0.872139573097229, + "num_tokens": 867620360.0, + "step": 22739 + }, + { + "epoch": 2.8927617351482002, + "ewc_loss": 0.05156658962368965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57832944043912e-05, + "grad_norm": 32.05323791503906, + "learning_rate": 1e-06, + "loss": 0.3865, + "mean_token_accuracy": 0.8796879649162292, + "num_tokens": 867655246.0, + "step": 22740 + }, + { + "epoch": 2.8928889454267903, + "ewc_loss": 0.051538169384002686, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.57690844591707e-05, + "grad_norm": 31.98134994506836, + "learning_rate": 1e-06, + "loss": 0.4413, + "mean_token_accuracy": 0.8632382750511169, + "num_tokens": 867692792.0, + "step": 22741 + }, + { + "epoch": 2.893016155705381, + "ewc_loss": 0.05156321823596954, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5781608201214112e-05, + "grad_norm": 32.13372039794922, + "learning_rate": 1e-06, + "loss": 0.4116, + "mean_token_accuracy": 0.873263418674469, + "num_tokens": 867732365.0, + "step": 22742 + }, + { + "epoch": 2.8931433659839714, + "ewc_loss": 0.05163407698273659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581703847681638e-05, + "grad_norm": 32.00001525878906, + "learning_rate": 1e-06, + "loss": 0.4642, + "mean_token_accuracy": 0.8578286170959473, + "num_tokens": 867764800.0, + "step": 22743 + }, + { + "epoch": 2.893270576262562, + "ewc_loss": 0.05155722796916962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5778614144655876e-05, + "grad_norm": 32.07061767578125, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8654977679252625, + "num_tokens": 867807107.0, + "step": 22744 + }, + { + "epoch": 2.8933977865411524, + "ewc_loss": 0.05170140415430069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5850702513707802e-05, + "grad_norm": 31.933034896850586, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8632844686508179, + "num_tokens": 867850225.0, + "step": 22745 + }, + { + "epoch": 2.893524996819743, + "ewc_loss": 0.05160083249211311, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5800416551646776e-05, + "grad_norm": 32.08320236206055, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8684718012809753, + "num_tokens": 867890457.0, + "step": 22746 + }, + { + "epoch": 2.8936522070983335, + "ewc_loss": 0.051715947687625885, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5857973014353774e-05, + "grad_norm": 32.08437728881836, + "learning_rate": 1e-06, + "loss": 
0.3948, + "mean_token_accuracy": 0.877918004989624, + "num_tokens": 867925880.0, + "step": 22747 + }, + { + "epoch": 2.893779417376924, + "ewc_loss": 0.05151408538222313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5757042749319226e-05, + "grad_norm": 31.855026245117188, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8846728801727295, + "num_tokens": 867963923.0, + "step": 22748 + }, + { + "epoch": 2.8939066276555145, + "ewc_loss": 0.05164596438407898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5822982934187166e-05, + "grad_norm": 32.12581253051758, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8751097917556763, + "num_tokens": 868000489.0, + "step": 22749 + }, + { + "epoch": 2.894033837934105, + "ewc_loss": 0.051595937460660934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5797968191909604e-05, + "grad_norm": 31.898841857910156, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8752536773681641, + "num_tokens": 868034001.0, + "step": 22750 + }, + { + "epoch": 2.8941610482126956, + "ewc_loss": 0.05156989395618439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5784947865759023e-05, + "grad_norm": 31.907546997070312, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8767333626747131, + "num_tokens": 868074956.0, + "step": 22751 + }, + { + "epoch": 2.894288258491286, + "ewc_loss": 0.05165631324052811, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582815614005085e-05, + "grad_norm": 32.06439208984375, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8791087865829468, + "num_tokens": 868110727.0, + "step": 22752 + }, + { + "epoch": 2.8944154687698767, + "ewc_loss": 0.051605865359306335, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580293221399188e-05, + "grad_norm": 31.97220802307129, + "learning_rate": 1e-06, + "loss": 0.4782, + "mean_token_accuracy": 0.8536864519119263, + "num_tokens": 868151480.0, + "step": 22753 + }, + { + 
"epoch": 2.894542679048467, + "ewc_loss": 0.051633499562740326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5816749257501215e-05, + "grad_norm": 31.990943908691406, + "learning_rate": 1e-06, + "loss": 0.4432, + "mean_token_accuracy": 0.8631312847137451, + "num_tokens": 868190620.0, + "step": 22754 + }, + { + "epoch": 2.8946698893270577, + "ewc_loss": 0.05161844938993454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5809224098338746e-05, + "grad_norm": 31.93242073059082, + "learning_rate": 1e-06, + "loss": 0.4233, + "mean_token_accuracy": 0.8702969551086426, + "num_tokens": 868230598.0, + "step": 22755 + }, + { + "epoch": 2.8947970996056482, + "ewc_loss": 0.05168928951025009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5844645278993994e-05, + "grad_norm": 32.06829833984375, + "learning_rate": 1e-06, + "loss": 0.4513, + "mean_token_accuracy": 0.8656734228134155, + "num_tokens": 868265933.0, + "step": 22756 + }, + { + "epoch": 2.8949243098842388, + "ewc_loss": 0.05157548189163208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5787740014493465e-05, + "grad_norm": 31.981407165527344, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8748812675476074, + "num_tokens": 868300282.0, + "step": 22757 + }, + { + "epoch": 2.8950515201628293, + "ewc_loss": 0.05161675810813904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5808378268266097e-05, + "grad_norm": 32.050262451171875, + "learning_rate": 1e-06, + "loss": 0.3746, + "mean_token_accuracy": 0.8865475058555603, + "num_tokens": 868331031.0, + "step": 22758 + }, + { + "epoch": 2.89517873044142, + "ewc_loss": 0.0515814833343029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5790741347009316e-05, + "grad_norm": 32.03850555419922, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8749452829360962, + "num_tokens": 868368268.0, + "step": 22759 + }, + { + "epoch": 2.89530594072001, + "ewc_loss": 0.05160541459918022, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.580270665930584e-05, + "grad_norm": 31.95416831970215, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8565739989280701, + "num_tokens": 868407551.0, + "step": 22760 + }, + { + "epoch": 2.895433150998601, + "ewc_loss": 0.05161208659410477, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5806042685871944e-05, + "grad_norm": 32.121952056884766, + "learning_rate": 1e-06, + "loss": 0.4245, + "mean_token_accuracy": 0.869619607925415, + "num_tokens": 868449301.0, + "step": 22761 + }, + { + "epoch": 2.895560361277191, + "ewc_loss": 0.05173493176698685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586746632005088e-05, + "grad_norm": 32.047847747802734, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8691444396972656, + "num_tokens": 868482853.0, + "step": 22762 + }, + { + "epoch": 2.895687571555782, + "ewc_loss": 0.05152761563658714, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5763807570911013e-05, + "grad_norm": 32.04093933105469, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8669321537017822, + "num_tokens": 868523960.0, + "step": 22763 + }, + { + "epoch": 2.895814781834372, + "ewc_loss": 0.05170326307415962, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5851632017293014e-05, + "grad_norm": 31.991283416748047, + "learning_rate": 1e-06, + "loss": 0.4443, + "mean_token_accuracy": 0.8644130825996399, + "num_tokens": 868566447.0, + "step": 22764 + }, + { + "epoch": 2.895941992112963, + "ewc_loss": 0.051611460745334625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5805729819694534e-05, + "grad_norm": 32.06175994873047, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8813312649726868, + "num_tokens": 868608342.0, + "step": 22765 + }, + { + "epoch": 2.896069202391553, + "ewc_loss": 0.05160015821456909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5800078219617717e-05, + "grad_norm": 31.9821720123291, + "learning_rate": 1e-06, + "loss": 0.3909, + 
"mean_token_accuracy": 0.8830607533454895, + "num_tokens": 868647354.0, + "step": 22766 + }, + { + "epoch": 2.8961964126701436, + "ewc_loss": 0.05151248723268509, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575624421297107e-05, + "grad_norm": 31.951208114624023, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8536440134048462, + "num_tokens": 868690716.0, + "step": 22767 + }, + { + "epoch": 2.896323622948734, + "ewc_loss": 0.05162474885582924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5812374587985687e-05, + "grad_norm": 32.02688980102539, + "learning_rate": 1e-06, + "loss": 0.3968, + "mean_token_accuracy": 0.8789331912994385, + "num_tokens": 868728318.0, + "step": 22768 + }, + { + "epoch": 2.8964508332273247, + "ewc_loss": 0.05159829556941986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5799148716032505e-05, + "grad_norm": 31.924955368041992, + "learning_rate": 1e-06, + "loss": 0.3692, + "mean_token_accuracy": 0.8875511884689331, + "num_tokens": 868764323.0, + "step": 22769 + }, + { + "epoch": 2.896578043505915, + "ewc_loss": 0.05168280750513077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5841403839876875e-05, + "grad_norm": 32.11236572265625, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8547667264938354, + "num_tokens": 868801371.0, + "step": 22770 + }, + { + "epoch": 2.8967052537845057, + "ewc_loss": 0.051592957228422165, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57964784395881e-05, + "grad_norm": 31.969566345214844, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8766975998878479, + "num_tokens": 868846679.0, + "step": 22771 + }, + { + "epoch": 2.8968324640630962, + "ewc_loss": 0.05156922712922096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578461317170877e-05, + "grad_norm": 32.06314468383789, + "learning_rate": 1e-06, + "loss": 0.4645, + "mean_token_accuracy": 0.8589073419570923, + "num_tokens": 868880076.0, + "step": 22772 + }, + { + "epoch": 
2.8969596743416868, + "ewc_loss": 0.05154292657971382, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5771463697310537e-05, + "grad_norm": 32.00861740112305, + "learning_rate": 1e-06, + "loss": 0.424, + "mean_token_accuracy": 0.8734551668167114, + "num_tokens": 868917086.0, + "step": 22773 + }, + { + "epoch": 2.8970868846202773, + "ewc_loss": 0.051509883254766464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575494181655813e-05, + "grad_norm": 31.956180572509766, + "learning_rate": 1e-06, + "loss": 0.3959, + "mean_token_accuracy": 0.8785592317581177, + "num_tokens": 868954952.0, + "step": 22774 + }, + { + "epoch": 2.897214094898868, + "ewc_loss": 0.05156122148036957, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578061139502097e-05, + "grad_norm": 31.941970825195312, + "learning_rate": 1e-06, + "loss": 0.4345, + "mean_token_accuracy": 0.8674918413162231, + "num_tokens": 868990911.0, + "step": 22775 + }, + { + "epoch": 2.8973413051774584, + "ewc_loss": 0.05161512643098831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5807563361013308e-05, + "grad_norm": 32.106075286865234, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8825817704200745, + "num_tokens": 869033398.0, + "step": 22776 + }, + { + "epoch": 2.897468515456049, + "ewc_loss": 0.05158933624625206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.579466854513157e-05, + "grad_norm": 31.960529327392578, + "learning_rate": 1e-06, + "loss": 0.4559, + "mean_token_accuracy": 0.85849928855896, + "num_tokens": 869070513.0, + "step": 22777 + }, + { + "epoch": 2.8975957257346394, + "ewc_loss": 0.051561642438173294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5780820578802377e-05, + "grad_norm": 32.015445709228516, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8792535066604614, + "num_tokens": 869113494.0, + "step": 22778 + }, + { + "epoch": 2.89772293601323, + "ewc_loss": 0.051654230803251266, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5827115678112023e-05, + "grad_norm": 32.022613525390625, + "learning_rate": 1e-06, + "loss": 0.3936, + "mean_token_accuracy": 0.8807523250579834, + "num_tokens": 869147433.0, + "step": 22779 + }, + { + "epoch": 2.8978501462918205, + "ewc_loss": 0.05161956325173378, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5809782528085634e-05, + "grad_norm": 32.06794357299805, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.8685822486877441, + "num_tokens": 869186763.0, + "step": 22780 + }, + { + "epoch": 2.897977356570411, + "ewc_loss": 0.051603902131319046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580195177870337e-05, + "grad_norm": 32.12948989868164, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8853530883789062, + "num_tokens": 869222624.0, + "step": 22781 + }, + { + "epoch": 2.8981045668490015, + "ewc_loss": 0.05156660079956055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578329986135941e-05, + "grad_norm": 32.117977142333984, + "learning_rate": 1e-06, + "loss": 0.3585, + "mean_token_accuracy": 0.8933870792388916, + "num_tokens": 869263027.0, + "step": 22782 + }, + { + "epoch": 2.898231777127592, + "ewc_loss": 0.05156290531158447, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578145358711481e-05, + "grad_norm": 32.03754806518555, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8672068119049072, + "num_tokens": 869304232.0, + "step": 22783 + }, + { + "epoch": 2.8983589874061826, + "ewc_loss": 0.05149637535214424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5748187908902764e-05, + "grad_norm": 32.024169921875, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8779336214065552, + "num_tokens": 869340141.0, + "step": 22784 + }, + { + "epoch": 2.8984861976847727, + "ewc_loss": 0.05153776705265045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576888437033631e-05, + "grad_norm": 32.04951095581055, + "learning_rate": 1e-06, + "loss": 0.4354, + 
"mean_token_accuracy": 0.8634333610534668, + "num_tokens": 869382428.0, + "step": 22785 + }, + { + "epoch": 2.8986134079633636, + "ewc_loss": 0.05152936652302742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5764682504814118e-05, + "grad_norm": 32.11783981323242, + "learning_rate": 1e-06, + "loss": 0.3943, + "mean_token_accuracy": 0.8798486590385437, + "num_tokens": 869418890.0, + "step": 22786 + }, + { + "epoch": 2.8987406182419537, + "ewc_loss": 0.051474668085575104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5737333999131806e-05, + "grad_norm": 32.01420211791992, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8737251162528992, + "num_tokens": 869453704.0, + "step": 22787 + }, + { + "epoch": 2.8988678285205447, + "ewc_loss": 0.05161449313163757, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580724685685709e-05, + "grad_norm": 31.97722053527832, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8694679737091064, + "num_tokens": 869491845.0, + "step": 22788 + }, + { + "epoch": 2.898995038799135, + "ewc_loss": 0.05150243639945984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5751218345249072e-05, + "grad_norm": 32.010013580322266, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8834742903709412, + "num_tokens": 869523393.0, + "step": 22789 + }, + { + "epoch": 2.8991222490777258, + "ewc_loss": 0.05159243568778038, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5796218324103393e-05, + "grad_norm": 31.94761848449707, + "learning_rate": 1e-06, + "loss": 0.4236, + "mean_token_accuracy": 0.8700629472732544, + "num_tokens": 869561927.0, + "step": 22790 + }, + { + "epoch": 2.899249459356316, + "ewc_loss": 0.05150436609983444, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575218240963295e-05, + "grad_norm": 32.011993408203125, + "learning_rate": 1e-06, + "loss": 0.4138, + "mean_token_accuracy": 0.8711774945259094, + "num_tokens": 869599568.0, + "step": 22791 + }, + { + "epoch": 
2.8993766696349064, + "ewc_loss": 0.05159429460763931, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5797147827688605e-05, + "grad_norm": 31.88084602355957, + "learning_rate": 1e-06, + "loss": 0.4044, + "mean_token_accuracy": 0.8767259120941162, + "num_tokens": 869636329.0, + "step": 22792 + }, + { + "epoch": 2.899503879913497, + "ewc_loss": 0.05160472169518471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5802361051319167e-05, + "grad_norm": 32.0060920715332, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8674613833427429, + "num_tokens": 869679720.0, + "step": 22793 + }, + { + "epoch": 2.8996310901920874, + "ewc_loss": 0.051623303443193436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581165244919248e-05, + "grad_norm": 32.03800964355469, + "learning_rate": 1e-06, + "loss": 0.4264, + "mean_token_accuracy": 0.8754599094390869, + "num_tokens": 869719527.0, + "step": 22794 + }, + { + "epoch": 2.899758300470678, + "ewc_loss": 0.05159571394324303, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5797857233555987e-05, + "grad_norm": 31.86127281188965, + "learning_rate": 1e-06, + "loss": 0.4032, + "mean_token_accuracy": 0.8765101432800293, + "num_tokens": 869755139.0, + "step": 22795 + }, + { + "epoch": 2.8998855107492685, + "ewc_loss": 0.05161868780851364, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580934415163938e-05, + "grad_norm": 32.02730941772461, + "learning_rate": 1e-06, + "loss": 0.4284, + "mean_token_accuracy": 0.8676885366439819, + "num_tokens": 869791860.0, + "step": 22796 + }, + { + "epoch": 2.900012721027859, + "ewc_loss": 0.05170389637351036, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585194852144923e-05, + "grad_norm": 32.051429748535156, + "learning_rate": 1e-06, + "loss": 0.3554, + "mean_token_accuracy": 0.894190788269043, + "num_tokens": 869827294.0, + "step": 22797 + }, + { + "epoch": 2.9001399313064495, + "ewc_loss": 0.051658570766448975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5829285732470453e-05, + "grad_norm": 31.98319435119629, + "learning_rate": 1e-06, + "loss": 0.4309, + "mean_token_accuracy": 0.8670129776000977, + "num_tokens": 869859398.0, + "step": 22798 + }, + { + "epoch": 2.90026714158504, + "ewc_loss": 0.051686618477106094, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.584331014077179e-05, + "grad_norm": 31.978864669799805, + "learning_rate": 1e-06, + "loss": 0.4154, + "mean_token_accuracy": 0.8714818954467773, + "num_tokens": 869898215.0, + "step": 22799 + }, + { + "epoch": 2.9003943518636306, + "ewc_loss": 0.05170386657118797, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5851933969534002e-05, + "grad_norm": 31.991809844970703, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8622589111328125, + "num_tokens": 869939787.0, + "step": 22800 + }, + { + "epoch": 2.900521562142221, + "ewc_loss": 0.05164387449622154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5821937015280128e-05, + "grad_norm": 32.01909255981445, + "learning_rate": 1e-06, + "loss": 0.454, + "mean_token_accuracy": 0.8635978698730469, + "num_tokens": 869976790.0, + "step": 22801 + }, + { + "epoch": 2.9006487724208116, + "ewc_loss": 0.0516221709549427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5811084924498573e-05, + "grad_norm": 32.00090026855469, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.877479076385498, + "num_tokens": 870013547.0, + "step": 22802 + }, + { + "epoch": 2.900775982699402, + "ewc_loss": 0.051598500460386276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5799250579439104e-05, + "grad_norm": 31.845762252807617, + "learning_rate": 1e-06, + "loss": 0.3774, + "mean_token_accuracy": 0.8836554288864136, + "num_tokens": 870049540.0, + "step": 22803 + }, + { + "epoch": 2.9009031929779927, + "ewc_loss": 0.051611773669719696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580588625278324e-05, + "grad_norm": 32.07571029663086, + "learning_rate": 1e-06, + "loss": 0.3981, + 
"mean_token_accuracy": 0.8793636560440063, + "num_tokens": 870088247.0, + "step": 22804 + }, + { + "epoch": 2.9010304032565832, + "ewc_loss": 0.05177093297243118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.588546703918837e-05, + "grad_norm": 31.95795440673828, + "learning_rate": 1e-06, + "loss": 0.4227, + "mean_token_accuracy": 0.8706430196762085, + "num_tokens": 870129112.0, + "step": 22805 + }, + { + "epoch": 2.9011576135351738, + "ewc_loss": 0.051602791994810104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5801395167945884e-05, + "grad_norm": 31.910154342651367, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8724384903907776, + "num_tokens": 870162082.0, + "step": 22806 + }, + { + "epoch": 2.9012848238137643, + "ewc_loss": 0.05167344957590103, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5836725399130955e-05, + "grad_norm": 31.99495506286621, + "learning_rate": 1e-06, + "loss": 0.3876, + "mean_token_accuracy": 0.8829116821289062, + "num_tokens": 870199868.0, + "step": 22807 + }, + { + "epoch": 2.901412034092355, + "ewc_loss": 0.05167223885655403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5836119675659575e-05, + "grad_norm": 32.02465057373047, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.8769111037254333, + "num_tokens": 870238121.0, + "step": 22808 + }, + { + "epoch": 2.9015392443709453, + "ewc_loss": 0.05165494978427887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582747401902452e-05, + "grad_norm": 31.986774444580078, + "learning_rate": 1e-06, + "loss": 0.4455, + "mean_token_accuracy": 0.8642569780349731, + "num_tokens": 870278008.0, + "step": 22809 + }, + { + "epoch": 2.9016664546495354, + "ewc_loss": 0.05161529779434204, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5807648853515275e-05, + "grad_norm": 32.08163070678711, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8692158460617065, + "num_tokens": 870317671.0, + "step": 22810 + }, + { + "epoch": 
2.9017936649281264, + "ewc_loss": 0.05156584084033966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578291969257407e-05, + "grad_norm": 31.93589973449707, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8800488710403442, + "num_tokens": 870352893.0, + "step": 22811 + }, + { + "epoch": 2.9019208752067165, + "ewc_loss": 0.051514603197574615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575730104581453e-05, + "grad_norm": 32.030235290527344, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8799707889556885, + "num_tokens": 870387513.0, + "step": 22812 + }, + { + "epoch": 2.9020480854853075, + "ewc_loss": 0.051657404750585556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5828701836871915e-05, + "grad_norm": 32.0531005859375, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.8788743019104004, + "num_tokens": 870428649.0, + "step": 22813 + }, + { + "epoch": 2.9021752957638975, + "ewc_loss": 0.05153607577085495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576803854026366e-05, + "grad_norm": 32.00667953491211, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.8870553970336914, + "num_tokens": 870468329.0, + "step": 22814 + }, + { + "epoch": 2.902302506042488, + "ewc_loss": 0.05164043605327606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5820218070293777e-05, + "grad_norm": 32.17729949951172, + "learning_rate": 1e-06, + "loss": 0.4259, + "mean_token_accuracy": 0.8724120259284973, + "num_tokens": 870509273.0, + "step": 22815 + }, + { + "epoch": 2.9024297163210786, + "ewc_loss": 0.05158458277583122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5792291125981137e-05, + "grad_norm": 32.039039611816406, + "learning_rate": 1e-06, + "loss": 0.428, + "mean_token_accuracy": 0.867408037185669, + "num_tokens": 870544206.0, + "step": 22816 + }, + { + "epoch": 2.902556926599669, + "ewc_loss": 0.05157122761011124, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.578561361588072e-05, + "grad_norm": 32.01917266845703, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8891544342041016, + "num_tokens": 870578708.0, + "step": 22817 + }, + { + "epoch": 2.9026841368782597, + "ewc_loss": 0.051476918160915375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57384599535726e-05, + "grad_norm": 31.956628799438477, + "learning_rate": 1e-06, + "loss": 0.3933, + "mean_token_accuracy": 0.8791768550872803, + "num_tokens": 870620599.0, + "step": 22818 + }, + { + "epoch": 2.90281134715685, + "ewc_loss": 0.05160116031765938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5800580260693096e-05, + "grad_norm": 32.13971710205078, + "learning_rate": 1e-06, + "loss": 0.3688, + "mean_token_accuracy": 0.8862874507904053, + "num_tokens": 870657828.0, + "step": 22819 + }, + { + "epoch": 2.9029385574354407, + "ewc_loss": 0.05156147480010986, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5780736905289814e-05, + "grad_norm": 31.980281829833984, + "learning_rate": 1e-06, + "loss": 0.4365, + "mean_token_accuracy": 0.8687357306480408, + "num_tokens": 870694786.0, + "step": 22820 + }, + { + "epoch": 2.9030657677140312, + "ewc_loss": 0.051418058574199677, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.570902870502323e-05, + "grad_norm": 31.97524642944336, + "learning_rate": 1e-06, + "loss": 0.3704, + "mean_token_accuracy": 0.8898345232009888, + "num_tokens": 870733850.0, + "step": 22821 + }, + { + "epoch": 2.9031929779926218, + "ewc_loss": 0.05153711885213852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5768558771233074e-05, + "grad_norm": 32.041526794433594, + "learning_rate": 1e-06, + "loss": 0.4179, + "mean_token_accuracy": 0.8728901147842407, + "num_tokens": 870771556.0, + "step": 22822 + }, + { + "epoch": 2.9033201882712123, + "ewc_loss": 0.05153992399573326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5769961212063208e-05, + "grad_norm": 32.05427551269531, + "learning_rate": 1e-06, + "loss": 0.3808, + 
"mean_token_accuracy": 0.884169340133667, + "num_tokens": 870809335.0, + "step": 22823 + }, + { + "epoch": 2.903447398549803, + "ewc_loss": 0.051433365792036057, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.571668301243335e-05, + "grad_norm": 31.954465866088867, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8683129549026489, + "num_tokens": 870844239.0, + "step": 22824 + }, + { + "epoch": 2.9035746088283934, + "ewc_loss": 0.05156203359365463, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578101702965796e-05, + "grad_norm": 32.07176971435547, + "learning_rate": 1e-06, + "loss": 0.42, + "mean_token_accuracy": 0.8713206052780151, + "num_tokens": 870879540.0, + "step": 22825 + }, + { + "epoch": 2.903701819106984, + "ewc_loss": 0.051534973084926605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5767487386474386e-05, + "grad_norm": 31.991378784179688, + "learning_rate": 1e-06, + "loss": 0.4211, + "mean_token_accuracy": 0.8727540969848633, + "num_tokens": 870917219.0, + "step": 22826 + }, + { + "epoch": 2.9038290293855744, + "ewc_loss": 0.05154256895184517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5771283617359586e-05, + "grad_norm": 32.08979415893555, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8740154504776001, + "num_tokens": 870952332.0, + "step": 22827 + }, + { + "epoch": 2.903956239664165, + "ewc_loss": 0.051589131355285645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.579456486273557e-05, + "grad_norm": 32.021427154541016, + "learning_rate": 1e-06, + "loss": 0.4596, + "mean_token_accuracy": 0.8614325523376465, + "num_tokens": 870991609.0, + "step": 22828 + }, + { + "epoch": 2.9040834499427555, + "ewc_loss": 0.051553383469581604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5776691472856328e-05, + "grad_norm": 32.02995681762695, + "learning_rate": 1e-06, + "loss": 0.4028, + "mean_token_accuracy": 0.8763459920883179, + "num_tokens": 871027456.0, + "step": 22829 + }, + { + "epoch": 
2.904210660221346, + "ewc_loss": 0.051504991948604584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.575249527581036e-05, + "grad_norm": 31.848299026489258, + "learning_rate": 1e-06, + "loss": 0.4388, + "mean_token_accuracy": 0.863093376159668, + "num_tokens": 871071833.0, + "step": 22830 + }, + { + "epoch": 2.9043378704999365, + "ewc_loss": 0.051587749272584915, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5793875465751626e-05, + "grad_norm": 32.01246643066406, + "learning_rate": 1e-06, + "loss": 0.4202, + "mean_token_accuracy": 0.8712968230247498, + "num_tokens": 871105446.0, + "step": 22831 + }, + { + "epoch": 2.904465080778527, + "ewc_loss": 0.05160677060484886, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5803385142353363e-05, + "grad_norm": 31.980531692504883, + "learning_rate": 1e-06, + "loss": 0.417, + "mean_token_accuracy": 0.8708148002624512, + "num_tokens": 871141408.0, + "step": 22832 + }, + { + "epoch": 2.9045922910571176, + "ewc_loss": 0.05155037343502045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5775187168619595e-05, + "grad_norm": 31.89503288269043, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8753877878189087, + "num_tokens": 871173296.0, + "step": 22833 + }, + { + "epoch": 2.904719501335708, + "ewc_loss": 0.05163723602890968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5818617359618656e-05, + "grad_norm": 32.08831787109375, + "learning_rate": 1e-06, + "loss": 0.4343, + "mean_token_accuracy": 0.8696523904800415, + "num_tokens": 871210711.0, + "step": 22834 + }, + { + "epoch": 2.904846711614298, + "ewc_loss": 0.051647838205099106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5823919713729993e-05, + "grad_norm": 31.9301700592041, + "learning_rate": 1e-06, + "loss": 0.4221, + "mean_token_accuracy": 0.8696821928024292, + "num_tokens": 871251179.0, + "step": 22835 + }, + { + "epoch": 2.904973921892889, + "ewc_loss": 0.05159202590584755, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5796012778300792e-05, + "grad_norm": 31.964845657348633, + "learning_rate": 1e-06, + "loss": 0.3576, + "mean_token_accuracy": 0.8914434909820557, + "num_tokens": 871295758.0, + "step": 22836 + }, + { + "epoch": 2.9051011321714793, + "ewc_loss": 0.05174548923969269, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587274502729997e-05, + "grad_norm": 32.030189514160156, + "learning_rate": 1e-06, + "loss": 0.4634, + "mean_token_accuracy": 0.8572902679443359, + "num_tokens": 871331145.0, + "step": 22837 + }, + { + "epoch": 2.9052283424500702, + "ewc_loss": 0.051750004291534424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587500239314977e-05, + "grad_norm": 32.01715850830078, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8486050367355347, + "num_tokens": 871372819.0, + "step": 22838 + }, + { + "epoch": 2.9053555527286603, + "ewc_loss": 0.051716625690460205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5858313165372238e-05, + "grad_norm": 31.990633010864258, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8683247566223145, + "num_tokens": 871414223.0, + "step": 22839 + }, + { + "epoch": 2.905482763007251, + "ewc_loss": 0.051738183945417404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586909249657765e-05, + "grad_norm": 31.957059860229492, + "learning_rate": 1e-06, + "loss": 0.4014, + "mean_token_accuracy": 0.8781356811523438, + "num_tokens": 871455013.0, + "step": 22840 + }, + { + "epoch": 2.9056099732858414, + "ewc_loss": 0.05170136317610741, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5850682504824363e-05, + "grad_norm": 32.012908935546875, + "learning_rate": 1e-06, + "loss": 0.4279, + "mean_token_accuracy": 0.8715484142303467, + "num_tokens": 871495462.0, + "step": 22841 + }, + { + "epoch": 2.905737183564432, + "ewc_loss": 0.051805783063173294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5902891138684936e-05, + "grad_norm": 32.034114837646484, + "learning_rate": 1e-06, + "loss": 0.4133, + 
"mean_token_accuracy": 0.8754280209541321, + "num_tokens": 871534999.0, + "step": 22842 + }, + { + "epoch": 2.9058643938430224, + "ewc_loss": 0.051656756550073624, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5828378056758083e-05, + "grad_norm": 32.00043487548828, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8733638525009155, + "num_tokens": 871569816.0, + "step": 22843 + }, + { + "epoch": 2.905991604121613, + "ewc_loss": 0.05167616158723831, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5838080546236597e-05, + "grad_norm": 31.950986862182617, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8738000392913818, + "num_tokens": 871610441.0, + "step": 22844 + }, + { + "epoch": 2.9061188144002035, + "ewc_loss": 0.05179131403565407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5895657017827034e-05, + "grad_norm": 32.05759048461914, + "learning_rate": 1e-06, + "loss": 0.4548, + "mean_token_accuracy": 0.8588187098503113, + "num_tokens": 871649005.0, + "step": 22845 + }, + { + "epoch": 2.906246024678794, + "ewc_loss": 0.05168537050485611, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.584268440841697e-05, + "grad_norm": 32.08455276489258, + "learning_rate": 1e-06, + "loss": 0.4294, + "mean_token_accuracy": 0.8686675429344177, + "num_tokens": 871689263.0, + "step": 22846 + }, + { + "epoch": 2.9063732349573845, + "ewc_loss": 0.05163554474711418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5817771529546008e-05, + "grad_norm": 31.998088836669922, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8822686672210693, + "num_tokens": 871725440.0, + "step": 22847 + }, + { + "epoch": 2.906500445235975, + "ewc_loss": 0.051710791885852814, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585539550636895e-05, + "grad_norm": 31.987245559692383, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8692026138305664, + "num_tokens": 871767839.0, + "step": 22848 + }, + { + "epoch": 
2.9066276555145656, + "ewc_loss": 0.051590196788311005, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.579509782663081e-05, + "grad_norm": 31.999853134155273, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8766459226608276, + "num_tokens": 871801516.0, + "step": 22849 + }, + { + "epoch": 2.906754865793156, + "ewc_loss": 0.05164643004536629, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582321576483082e-05, + "grad_norm": 31.951725006103516, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8680348992347717, + "num_tokens": 871840359.0, + "step": 22850 + }, + { + "epoch": 2.9068820760717466, + "ewc_loss": 0.051674775779247284, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5837387511273846e-05, + "grad_norm": 32.016639709472656, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.877509355545044, + "num_tokens": 871879396.0, + "step": 22851 + }, + { + "epoch": 2.907009286350337, + "ewc_loss": 0.05164980888366699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582490378699731e-05, + "grad_norm": 31.91181755065918, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8658429384231567, + "num_tokens": 871915043.0, + "step": 22852 + }, + { + "epoch": 2.9071364966289277, + "ewc_loss": 0.05163925141096115, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581962507974822e-05, + "grad_norm": 31.990318298339844, + "learning_rate": 1e-06, + "loss": 0.4322, + "mean_token_accuracy": 0.8676843643188477, + "num_tokens": 871953391.0, + "step": 22853 + }, + { + "epoch": 2.9072637069075182, + "ewc_loss": 0.05181775614619255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5908877432812005e-05, + "grad_norm": 32.094566345214844, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8766151070594788, + "num_tokens": 871989879.0, + "step": 22854 + }, + { + "epoch": 2.9073909171861088, + "ewc_loss": 0.05164447799324989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5822238967521116e-05, + "grad_norm": 31.86858558654785, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8760501742362976, + "num_tokens": 872022428.0, + "step": 22855 + }, + { + "epoch": 2.9075181274646993, + "ewc_loss": 0.05164293572306633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5821467716014013e-05, + "grad_norm": 31.971027374267578, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.877310037612915, + "num_tokens": 872060644.0, + "step": 22856 + }, + { + "epoch": 2.90764533774329, + "ewc_loss": 0.051712144166231155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585607217042707e-05, + "grad_norm": 32.026737213134766, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8696058392524719, + "num_tokens": 872098230.0, + "step": 22857 + }, + { + "epoch": 2.90777254802188, + "ewc_loss": 0.05172127112746239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5860636014840566e-05, + "grad_norm": 31.947301864624023, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8625425100326538, + "num_tokens": 872131217.0, + "step": 22858 + }, + { + "epoch": 2.907899758300471, + "ewc_loss": 0.051733244210481644, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5866622308967635e-05, + "grad_norm": 32.117034912109375, + "learning_rate": 1e-06, + "loss": 0.3906, + "mean_token_accuracy": 0.8817116022109985, + "num_tokens": 872163912.0, + "step": 22859 + }, + { + "epoch": 2.908026968579061, + "ewc_loss": 0.05178270488977432, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5891353288898245e-05, + "grad_norm": 31.87422752380371, + "learning_rate": 1e-06, + "loss": 0.4958, + "mean_token_accuracy": 0.8490238189697266, + "num_tokens": 872200694.0, + "step": 22860 + }, + { + "epoch": 2.908154178857652, + "ewc_loss": 0.05168791487812996, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5843957700999454e-05, + "grad_norm": 32.0328369140625, + "learning_rate": 1e-06, + "loss": 0.4099, + 
"mean_token_accuracy": 0.8717761039733887, + "num_tokens": 872241156.0, + "step": 22861 + }, + { + "epoch": 2.908281389136242, + "ewc_loss": 0.05183089151978493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591544580354821e-05, + "grad_norm": 32.058326721191406, + "learning_rate": 1e-06, + "loss": 0.4083, + "mean_token_accuracy": 0.8755394816398621, + "num_tokens": 872273048.0, + "step": 22862 + }, + { + "epoch": 2.908408599414833, + "ewc_loss": 0.051781654357910156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.589082760096062e-05, + "grad_norm": 32.051509857177734, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8763706088066101, + "num_tokens": 872312165.0, + "step": 22863 + }, + { + "epoch": 2.908535809693423, + "ewc_loss": 0.051687631756067276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5843815819825977e-05, + "grad_norm": 31.93733787536621, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8746840953826904, + "num_tokens": 872352740.0, + "step": 22864 + }, + { + "epoch": 2.9086630199720136, + "ewc_loss": 0.05170495808124542, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5852479666355066e-05, + "grad_norm": 31.968637466430664, + "learning_rate": 1e-06, + "loss": 0.4491, + "mean_token_accuracy": 0.8591278791427612, + "num_tokens": 872397800.0, + "step": 22865 + }, + { + "epoch": 2.908790230250604, + "ewc_loss": 0.051795028150081635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5897514206008054e-05, + "grad_norm": 32.07670211791992, + "learning_rate": 1e-06, + "loss": 0.3874, + "mean_token_accuracy": 0.884649395942688, + "num_tokens": 872429307.0, + "step": 22866 + }, + { + "epoch": 2.9089174405291947, + "ewc_loss": 0.051801666617393494, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5900833861669526e-05, + "grad_norm": 32.09317398071289, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8697822093963623, + "num_tokens": 872465897.0, + "step": 22867 + }, + { + "epoch": 
2.909044650807785, + "ewc_loss": 0.051699768751859665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5849883968476206e-05, + "grad_norm": 31.897092819213867, + "learning_rate": 1e-06, + "loss": 0.4422, + "mean_token_accuracy": 0.8648295402526855, + "num_tokens": 872503252.0, + "step": 22868 + }, + { + "epoch": 2.9091718610863757, + "ewc_loss": 0.05172209441661835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5861047106445767e-05, + "grad_norm": 32.118778228759766, + "learning_rate": 1e-06, + "loss": 0.4054, + "mean_token_accuracy": 0.8763554692268372, + "num_tokens": 872546276.0, + "step": 22869 + }, + { + "epoch": 2.9092990713649662, + "ewc_loss": 0.051790349185466766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5895174985635094e-05, + "grad_norm": 32.04922103881836, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8706651926040649, + "num_tokens": 872585690.0, + "step": 22870 + }, + { + "epoch": 2.9094262816435568, + "ewc_loss": 0.05168326944112778, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5841634851531126e-05, + "grad_norm": 32.1026611328125, + "learning_rate": 1e-06, + "loss": 0.4526, + "mean_token_accuracy": 0.8628377914428711, + "num_tokens": 872621718.0, + "step": 22871 + }, + { + "epoch": 2.9095534919221473, + "ewc_loss": 0.051781028509140015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.589051473478321e-05, + "grad_norm": 32.007728576660156, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8767282962799072, + "num_tokens": 872659349.0, + "step": 22872 + }, + { + "epoch": 2.909680702200738, + "ewc_loss": 0.05158928781747818, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5794643079279922e-05, + "grad_norm": 32.00874328613281, + "learning_rate": 1e-06, + "loss": 0.4303, + "mean_token_accuracy": 0.8686709403991699, + "num_tokens": 872704468.0, + "step": 22873 + }, + { + "epoch": 2.9098079124793284, + "ewc_loss": 0.05183244124054909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.591622069303412e-05, + "grad_norm": 32.081111907958984, + "learning_rate": 1e-06, + "loss": 0.4231, + "mean_token_accuracy": 0.8735981583595276, + "num_tokens": 872740003.0, + "step": 22874 + }, + { + "epoch": 2.909935122757919, + "ewc_loss": 0.051650743931531906, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5825371267274022e-05, + "grad_norm": 32.1306037902832, + "learning_rate": 1e-06, + "loss": 0.4894, + "mean_token_accuracy": 0.8513258099555969, + "num_tokens": 872780362.0, + "step": 22875 + }, + { + "epoch": 2.9100623330365094, + "ewc_loss": 0.05174194276332855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5870971512631513e-05, + "grad_norm": 31.965978622436523, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8801465034484863, + "num_tokens": 872823719.0, + "step": 22876 + }, + { + "epoch": 2.9101895433151, + "ewc_loss": 0.05156346410512924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5781731892493553e-05, + "grad_norm": 32.00346374511719, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8761098384857178, + "num_tokens": 872856276.0, + "step": 22877 + }, + { + "epoch": 2.9103167535936905, + "ewc_loss": 0.051781944930553436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5890973120112903e-05, + "grad_norm": 32.095149993896484, + "learning_rate": 1e-06, + "loss": 0.436, + "mean_token_accuracy": 0.8654487133026123, + "num_tokens": 872893688.0, + "step": 22878 + }, + { + "epoch": 2.910443963872281, + "ewc_loss": 0.05171685293316841, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5858425942715257e-05, + "grad_norm": 32.05817413330078, + "learning_rate": 1e-06, + "loss": 0.3595, + "mean_token_accuracy": 0.8911081552505493, + "num_tokens": 872931169.0, + "step": 22879 + }, + { + "epoch": 2.9105711741508715, + "ewc_loss": 0.05161384120583534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5806921257753856e-05, + "grad_norm": 31.983844757080078, + "learning_rate": 1e-06, + "loss": 0.426, + 
"mean_token_accuracy": 0.8682385683059692, + "num_tokens": 872970836.0, + "step": 22880 + }, + { + "epoch": 2.910698384429462, + "ewc_loss": 0.05171796306967735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585898073448334e-05, + "grad_norm": 32.0793571472168, + "learning_rate": 1e-06, + "loss": 0.3742, + "mean_token_accuracy": 0.8870735168457031, + "num_tokens": 873010879.0, + "step": 22881 + }, + { + "epoch": 2.9108255947080526, + "ewc_loss": 0.05164984241127968, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5824921976891346e-05, + "grad_norm": 31.91379737854004, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8748652935028076, + "num_tokens": 873058018.0, + "step": 22882 + }, + { + "epoch": 2.9109528049866427, + "ewc_loss": 0.05164013057947159, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582006527518388e-05, + "grad_norm": 32.13026428222656, + "learning_rate": 1e-06, + "loss": 0.4529, + "mean_token_accuracy": 0.8619896769523621, + "num_tokens": 873096259.0, + "step": 22883 + }, + { + "epoch": 2.9110800152652336, + "ewc_loss": 0.05170651897788048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5853260012809187e-05, + "grad_norm": 32.08263397216797, + "learning_rate": 1e-06, + "loss": 0.4528, + "mean_token_accuracy": 0.8623449802398682, + "num_tokens": 873131955.0, + "step": 22884 + }, + { + "epoch": 2.9112072255438237, + "ewc_loss": 0.0516754649579525, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.583773311926052e-05, + "grad_norm": 32.10750961303711, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.8660070300102234, + "num_tokens": 873171024.0, + "step": 22885 + }, + { + "epoch": 2.9113344358224147, + "ewc_loss": 0.0515543557703495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5777177143027075e-05, + "grad_norm": 31.924291610717773, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8760716915130615, + "num_tokens": 873210568.0, + "step": 22886 + }, + { + "epoch": 
2.9114616461010048, + "ewc_loss": 0.051639363169670105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581968146841973e-05, + "grad_norm": 31.9959716796875, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8753931522369385, + "num_tokens": 873250025.0, + "step": 22887 + }, + { + "epoch": 2.9115888563795957, + "ewc_loss": 0.05172843858599663, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586421942396555e-05, + "grad_norm": 32.10276412963867, + "learning_rate": 1e-06, + "loss": 0.4474, + "mean_token_accuracy": 0.8656225204467773, + "num_tokens": 873285284.0, + "step": 22888 + }, + { + "epoch": 2.911716066658186, + "ewc_loss": 0.051602911204099655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.58014551945962e-05, + "grad_norm": 31.91291618347168, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.875670850276947, + "num_tokens": 873320102.0, + "step": 22889 + }, + { + "epoch": 2.9118432769367764, + "ewc_loss": 0.051735829561948776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5867915610433556e-05, + "grad_norm": 32.136329650878906, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8778531551361084, + "num_tokens": 873354668.0, + "step": 22890 + }, + { + "epoch": 2.911970487215367, + "ewc_loss": 0.05173672363162041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5868361262837425e-05, + "grad_norm": 32.06119918823242, + "learning_rate": 1e-06, + "loss": 0.4292, + "mean_token_accuracy": 0.8700636625289917, + "num_tokens": 873387415.0, + "step": 22891 + }, + { + "epoch": 2.9120976974939574, + "ewc_loss": 0.05163568630814552, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5817842470132746e-05, + "grad_norm": 32.04719543457031, + "learning_rate": 1e-06, + "loss": 0.367, + "mean_token_accuracy": 0.8900570869445801, + "num_tokens": 873428679.0, + "step": 22892 + }, + { + "epoch": 2.912224907772548, + "ewc_loss": 0.05173996090888977, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.586998016340658e-05, + "grad_norm": 31.99973487854004, + "learning_rate": 1e-06, + "loss": 0.4043, + "mean_token_accuracy": 0.8733934164047241, + "num_tokens": 873466973.0, + "step": 22893 + }, + { + "epoch": 2.9123521180511385, + "ewc_loss": 0.05172574892640114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586287519079633e-05, + "grad_norm": 32.09880447387695, + "learning_rate": 1e-06, + "loss": 0.3668, + "mean_token_accuracy": 0.8874785900115967, + "num_tokens": 873510957.0, + "step": 22894 + }, + { + "epoch": 2.912479328329729, + "ewc_loss": 0.05174391716718674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587195922387764e-05, + "grad_norm": 32.02090835571289, + "learning_rate": 1e-06, + "loss": 0.4791, + "mean_token_accuracy": 0.8533486127853394, + "num_tokens": 873546456.0, + "step": 22895 + }, + { + "epoch": 2.9126065386083195, + "ewc_loss": 0.051775168627500534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5887584342854097e-05, + "grad_norm": 32.11333084106445, + "learning_rate": 1e-06, + "loss": 0.4601, + "mean_token_accuracy": 0.8592895865440369, + "num_tokens": 873591972.0, + "step": 22896 + }, + { + "epoch": 2.91273374888691, + "ewc_loss": 0.05183045193552971, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591522570583038e-05, + "grad_norm": 32.047157287597656, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8712660670280457, + "num_tokens": 873629790.0, + "step": 22897 + }, + { + "epoch": 2.9128609591655006, + "ewc_loss": 0.05161229521036148, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580614818725735e-05, + "grad_norm": 32.013389587402344, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.87287437915802, + "num_tokens": 873667419.0, + "step": 22898 + }, + { + "epoch": 2.912988169444091, + "ewc_loss": 0.051756635308265686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5878318410832435e-05, + "grad_norm": 32.216068267822266, + "learning_rate": 1e-06, + "loss": 0.3902, + 
"mean_token_accuracy": 0.8825327754020691, + "num_tokens": 873705834.0, + "step": 22899 + }, + { + "epoch": 2.9131153797226816, + "ewc_loss": 0.05162694677710533, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581347325758543e-05, + "grad_norm": 32.03346633911133, + "learning_rate": 1e-06, + "loss": 0.4441, + "mean_token_accuracy": 0.8636282086372375, + "num_tokens": 873748859.0, + "step": 22900 + }, + { + "epoch": 2.913242590001272, + "ewc_loss": 0.051606517285108566, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5803257813095115e-05, + "grad_norm": 32.19015121459961, + "learning_rate": 1e-06, + "loss": 0.4434, + "mean_token_accuracy": 0.8637163639068604, + "num_tokens": 873788068.0, + "step": 22901 + }, + { + "epoch": 2.9133698002798627, + "ewc_loss": 0.051714349538087845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5857174478005618e-05, + "grad_norm": 32.129417419433594, + "learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8807709217071533, + "num_tokens": 873827820.0, + "step": 22902 + }, + { + "epoch": 2.9134970105584532, + "ewc_loss": 0.05156789347529411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5783947421587072e-05, + "grad_norm": 32.07535934448242, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8691297173500061, + "num_tokens": 873866271.0, + "step": 22903 + }, + { + "epoch": 2.9136242208370438, + "ewc_loss": 0.05163586884737015, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5817935238592327e-05, + "grad_norm": 32.10586929321289, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8693432807922363, + "num_tokens": 873907409.0, + "step": 22904 + }, + { + "epoch": 2.9137514311156343, + "ewc_loss": 0.051605574786663055, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5802786694839597e-05, + "grad_norm": 32.06127166748047, + "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8789443969726562, + "num_tokens": 873947940.0, + "step": 22905 + }, + { + "epoch": 
2.913878641394225, + "ewc_loss": 0.051584478467702866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5792238375288434e-05, + "grad_norm": 32.10201644897461, + "learning_rate": 1e-06, + "loss": 0.4573, + "mean_token_accuracy": 0.8571851253509521, + "num_tokens": 873990062.0, + "step": 22906 + }, + { + "epoch": 2.9140058516728153, + "ewc_loss": 0.0516418032348156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582090201030951e-05, + "grad_norm": 32.06667709350586, + "learning_rate": 1e-06, + "loss": 0.3783, + "mean_token_accuracy": 0.8845880627632141, + "num_tokens": 874026660.0, + "step": 22907 + }, + { + "epoch": 2.9141330619514054, + "ewc_loss": 0.051565125584602356, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5782563170650974e-05, + "grad_norm": 32.06557846069336, + "learning_rate": 1e-06, + "loss": 0.4132, + "mean_token_accuracy": 0.8789350986480713, + "num_tokens": 874064935.0, + "step": 22908 + }, + { + "epoch": 2.9142602722299964, + "ewc_loss": 0.051637545228004456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5818771973717958e-05, + "grad_norm": 31.97740364074707, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8722591400146484, + "num_tokens": 874102807.0, + "step": 22909 + }, + { + "epoch": 2.9143874825085865, + "ewc_loss": 0.051552046090364456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5776023903745227e-05, + "grad_norm": 32.10457992553711, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8751386404037476, + "num_tokens": 874139688.0, + "step": 22910 + }, + { + "epoch": 2.9145146927871775, + "ewc_loss": 0.051629167050123215, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581458284112159e-05, + "grad_norm": 32.09721755981445, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.8770953416824341, + "num_tokens": 874184505.0, + "step": 22911 + }, + { + "epoch": 2.9146419030657675, + "ewc_loss": 0.051585257053375244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5792629458010197e-05, + "grad_norm": 32.084632873535156, + "learning_rate": 1e-06, + "loss": 0.4146, + "mean_token_accuracy": 0.8718866109848022, + "num_tokens": 874223210.0, + "step": 22912 + }, + { + "epoch": 2.914769113344358, + "ewc_loss": 0.05153147876262665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5765739337657578e-05, + "grad_norm": 32.09862518310547, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8598459959030151, + "num_tokens": 874263939.0, + "step": 22913 + }, + { + "epoch": 2.9148963236229486, + "ewc_loss": 0.051609668880701065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580483487690799e-05, + "grad_norm": 32.10126495361328, + "learning_rate": 1e-06, + "loss": 0.401, + "mean_token_accuracy": 0.880340576171875, + "num_tokens": 874303946.0, + "step": 22914 + }, + { + "epoch": 2.915023533901539, + "ewc_loss": 0.05156787484884262, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5783938326640055e-05, + "grad_norm": 32.053871154785156, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8749252557754517, + "num_tokens": 874343701.0, + "step": 22915 + }, + { + "epoch": 2.9151507441801296, + "ewc_loss": 0.051544785499572754, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.577239320089575e-05, + "grad_norm": 32.09049987792969, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8755607008934021, + "num_tokens": 874380425.0, + "step": 22916 + }, + { + "epoch": 2.91527795445872, + "ewc_loss": 0.05160856246948242, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580428190412931e-05, + "grad_norm": 32.14063262939453, + "learning_rate": 1e-06, + "loss": 0.4164, + "mean_token_accuracy": 0.8733588457107544, + "num_tokens": 874417643.0, + "step": 22917 + }, + { + "epoch": 2.9154051647373107, + "ewc_loss": 0.05158676952123642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5793384338612668e-05, + "grad_norm": 32.09614181518555, + "learning_rate": 1e-06, + "loss": 0.4465, + 
"mean_token_accuracy": 0.8656445741653442, + "num_tokens": 874463939.0, + "step": 22918 + }, + { + "epoch": 2.9155323750159012, + "ewc_loss": 0.05159134790301323, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5795674446271732e-05, + "grad_norm": 32.14199447631836, + "learning_rate": 1e-06, + "loss": 0.4312, + "mean_token_accuracy": 0.8688103556632996, + "num_tokens": 874503156.0, + "step": 22919 + }, + { + "epoch": 2.9156595852944918, + "ewc_loss": 0.05148114264011383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5740571800270118e-05, + "grad_norm": 31.958486557006836, + "learning_rate": 1e-06, + "loss": 0.4472, + "mean_token_accuracy": 0.8624621033668518, + "num_tokens": 874546754.0, + "step": 22920 + }, + { + "epoch": 2.9157867955730823, + "ewc_loss": 0.05153811722993851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576905899331905e-05, + "grad_norm": 32.058719635009766, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8766564130783081, + "num_tokens": 874590781.0, + "step": 22921 + }, + { + "epoch": 2.915914005851673, + "ewc_loss": 0.05158495157957077, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.57924766629003e-05, + "grad_norm": 32.02869415283203, + "learning_rate": 1e-06, + "loss": 0.3843, + "mean_token_accuracy": 0.8816729784011841, + "num_tokens": 874629592.0, + "step": 22922 + }, + { + "epoch": 2.9160412161302633, + "ewc_loss": 0.05157586932182312, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5787934646359645e-05, + "grad_norm": 32.05343246459961, + "learning_rate": 1e-06, + "loss": 0.3855, + "mean_token_accuracy": 0.8796716332435608, + "num_tokens": 874667372.0, + "step": 22923 + }, + { + "epoch": 2.916168426408854, + "ewc_loss": 0.05161832273006439, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580916043370962e-05, + "grad_norm": 32.05614471435547, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8693711161613464, + "num_tokens": 874704687.0, + "step": 22924 + }, + { + "epoch": 
2.9162956366874444, + "ewc_loss": 0.051602642983198166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580132058938034e-05, + "grad_norm": 32.18527603149414, + "learning_rate": 1e-06, + "loss": 0.4187, + "mean_token_accuracy": 0.8717350959777832, + "num_tokens": 874740471.0, + "step": 22925 + }, + { + "epoch": 2.916422846966035, + "ewc_loss": 0.05160132795572281, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580066393420566e-05, + "grad_norm": 32.03812026977539, + "learning_rate": 1e-06, + "loss": 0.3944, + "mean_token_accuracy": 0.877708375453949, + "num_tokens": 874780747.0, + "step": 22926 + }, + { + "epoch": 2.9165500572446255, + "ewc_loss": 0.051488246768713, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.574412246758584e-05, + "grad_norm": 32.18516540527344, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8767298460006714, + "num_tokens": 874814204.0, + "step": 22927 + }, + { + "epoch": 2.916677267523216, + "ewc_loss": 0.0515701100230217, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5785055186133832e-05, + "grad_norm": 32.00596618652344, + "learning_rate": 1e-06, + "loss": 0.4228, + "mean_token_accuracy": 0.8727042078971863, + "num_tokens": 874851872.0, + "step": 22928 + }, + { + "epoch": 2.9168044778018065, + "ewc_loss": 0.05152595043182373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576297447376419e-05, + "grad_norm": 32.17858123779297, + "learning_rate": 1e-06, + "loss": 0.4514, + "mean_token_accuracy": 0.8571569919586182, + "num_tokens": 874884215.0, + "step": 22929 + }, + { + "epoch": 2.916931688080397, + "ewc_loss": 0.05171307548880577, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5856537831714377e-05, + "grad_norm": 32.16667938232422, + "learning_rate": 1e-06, + "loss": 0.4332, + "mean_token_accuracy": 0.8703593611717224, + "num_tokens": 874919047.0, + "step": 22930 + }, + { + "epoch": 2.917058898358987, + "ewc_loss": 0.051537852734327316, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576892620709259e-05, 
+ "grad_norm": 32.15191650390625, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8658021092414856, + "num_tokens": 874956469.0, + "step": 22931 + }, + { + "epoch": 2.917186108637578, + "ewc_loss": 0.051570825278759, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578541352704633e-05, + "grad_norm": 32.014705657958984, + "learning_rate": 1e-06, + "loss": 0.3802, + "mean_token_accuracy": 0.884537935256958, + "num_tokens": 874994633.0, + "step": 22932 + }, + { + "epoch": 2.917313318916168, + "ewc_loss": 0.051609139889478683, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.580456930445507e-05, + "grad_norm": 32.10114669799805, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8856067657470703, + "num_tokens": 875028530.0, + "step": 22933 + }, + { + "epoch": 2.917440529194759, + "ewc_loss": 0.05171026661992073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585513357189484e-05, + "grad_norm": 32.262454986572266, + "learning_rate": 1e-06, + "loss": 0.4402, + "mean_token_accuracy": 0.8661648035049438, + "num_tokens": 875059785.0, + "step": 22934 + }, + { + "epoch": 2.9175677394733492, + "ewc_loss": 0.0515688918530941, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5784445824683644e-05, + "grad_norm": 32.09767532348633, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8699580430984497, + "num_tokens": 875096057.0, + "step": 22935 + }, + { + "epoch": 2.91769494975194, + "ewc_loss": 0.051533471792936325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.576673614385072e-05, + "grad_norm": 32.1284065246582, + "learning_rate": 1e-06, + "loss": 0.3951, + "mean_token_accuracy": 0.87933349609375, + "num_tokens": 875141988.0, + "step": 22936 + }, + { + "epoch": 2.9178221600305303, + "ewc_loss": 0.05162012204527855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5810060833464377e-05, + "grad_norm": 32.10809326171875, + "learning_rate": 1e-06, + "loss": 0.4248, + "mean_token_accuracy": 0.8776238560676575, + 
"num_tokens": 875176973.0, + "step": 22937 + }, + { + "epoch": 2.917949370309121, + "ewc_loss": 0.0515763983130455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.578819839982316e-05, + "grad_norm": 31.9896183013916, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8811089992523193, + "num_tokens": 875209271.0, + "step": 22938 + }, + { + "epoch": 2.9180765805877114, + "ewc_loss": 0.0516928993165493, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.584644971648231e-05, + "grad_norm": 32.1389045715332, + "learning_rate": 1e-06, + "loss": 0.4041, + "mean_token_accuracy": 0.8780052661895752, + "num_tokens": 875247460.0, + "step": 22939 + }, + { + "epoch": 2.918203790866302, + "ewc_loss": 0.051685087382793427, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.58425443462329e-05, + "grad_norm": 32.09446334838867, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8706469535827637, + "num_tokens": 875286013.0, + "step": 22940 + }, + { + "epoch": 2.9183310011448924, + "ewc_loss": 0.05164129659533501, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5820649170782417e-05, + "grad_norm": 32.18400573730469, + "learning_rate": 1e-06, + "loss": 0.4351, + "mean_token_accuracy": 0.869294285774231, + "num_tokens": 875324547.0, + "step": 22941 + }, + { + "epoch": 2.918458211423483, + "ewc_loss": 0.05166323110461235, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5831615857896395e-05, + "grad_norm": 32.077415466308594, + "learning_rate": 1e-06, + "loss": 0.4321, + "mean_token_accuracy": 0.8640183210372925, + "num_tokens": 875366655.0, + "step": 22942 + }, + { + "epoch": 2.9185854217020735, + "ewc_loss": 0.05158667266368866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5793337044888176e-05, + "grad_norm": 32.09020233154297, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8671678304672241, + "num_tokens": 875408571.0, + "step": 22943 + }, + { + "epoch": 2.918712631980664, + "ewc_loss": 0.051725465804338455, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5862733309622854e-05, + "grad_norm": 32.23047637939453, + "learning_rate": 1e-06, + "loss": 0.4387, + "mean_token_accuracy": 0.8666673898696899, + "num_tokens": 875450730.0, + "step": 22944 + }, + { + "epoch": 2.9188398422592545, + "ewc_loss": 0.05159782990813255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5798914066399448e-05, + "grad_norm": 32.06719207763672, + "learning_rate": 1e-06, + "loss": 0.4099, + "mean_token_accuracy": 0.8762643933296204, + "num_tokens": 875486850.0, + "step": 22945 + }, + { + "epoch": 2.918967052537845, + "ewc_loss": 0.051561277359724045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5780638679862022e-05, + "grad_norm": 32.10886001586914, + "learning_rate": 1e-06, + "loss": 0.4489, + "mean_token_accuracy": 0.8665152788162231, + "num_tokens": 875528653.0, + "step": 22946 + }, + { + "epoch": 2.9190942628164356, + "ewc_loss": 0.05158891901373863, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5794459361350164e-05, + "grad_norm": 32.02039337158203, + "learning_rate": 1e-06, + "loss": 0.3888, + "mean_token_accuracy": 0.8835548162460327, + "num_tokens": 875564155.0, + "step": 22947 + }, + { + "epoch": 2.919221473095026, + "ewc_loss": 0.0515856072306633, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5792804080992937e-05, + "grad_norm": 32.14030075073242, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8714313507080078, + "num_tokens": 875598548.0, + "step": 22948 + }, + { + "epoch": 2.9193486833736166, + "ewc_loss": 0.05169053003191948, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5845265554380603e-05, + "grad_norm": 32.16617202758789, + "learning_rate": 1e-06, + "loss": 0.372, + "mean_token_accuracy": 0.8880273103713989, + "num_tokens": 875634587.0, + "step": 22949 + }, + { + "epoch": 2.919475893652207, + "ewc_loss": 0.05158523842692375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5792618544073775e-05, + "grad_norm": 31.943256378173828, + 
"learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8726915717124939, + "num_tokens": 875671956.0, + "step": 22950 + }, + { + "epoch": 2.9196031039307977, + "ewc_loss": 0.051589976996183395, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5794988687266596e-05, + "grad_norm": 32.138648986816406, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8747237920761108, + "num_tokens": 875706327.0, + "step": 22951 + }, + { + "epoch": 2.9197303142093882, + "ewc_loss": 0.0516330860555172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5816543711698614e-05, + "grad_norm": 32.01011276245117, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8790273666381836, + "num_tokens": 875749320.0, + "step": 22952 + }, + { + "epoch": 2.9198575244879788, + "ewc_loss": 0.05158457159996033, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5792285669012927e-05, + "grad_norm": 32.05284881591797, + "learning_rate": 1e-06, + "loss": 0.3945, + "mean_token_accuracy": 0.8816434144973755, + "num_tokens": 875783871.0, + "step": 22953 + }, + { + "epoch": 2.9199847347665693, + "ewc_loss": 0.05160915106534958, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5804574761423282e-05, + "grad_norm": 32.127017974853516, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.872512698173523, + "num_tokens": 875821716.0, + "step": 22954 + }, + { + "epoch": 2.92011194504516, + "ewc_loss": 0.05158160999417305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.579080501163844e-05, + "grad_norm": 32.04481506347656, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8761835098266602, + "num_tokens": 875854339.0, + "step": 22955 + }, + { + "epoch": 2.92023915532375, + "ewc_loss": 0.05163710191845894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.581855187600013e-05, + "grad_norm": 32.04648208618164, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8745107650756836, + "num_tokens": 875889456.0, + 
"step": 22956 + }, + { + "epoch": 2.920366365602341, + "ewc_loss": 0.05160484462976456, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5802422896958888e-05, + "grad_norm": 31.978397369384766, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8646488785743713, + "num_tokens": 875926659.0, + "step": 22957 + }, + { + "epoch": 2.920493575880931, + "ewc_loss": 0.051756199449300766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587810013210401e-05, + "grad_norm": 32.05422592163086, + "learning_rate": 1e-06, + "loss": 0.3965, + "mean_token_accuracy": 0.8809036612510681, + "num_tokens": 875972766.0, + "step": 22958 + }, + { + "epoch": 2.920620786159522, + "ewc_loss": 0.05172530561685562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5862653274089098e-05, + "grad_norm": 32.043209075927734, + "learning_rate": 1e-06, + "loss": 0.4363, + "mean_token_accuracy": 0.868576169013977, + "num_tokens": 876011824.0, + "step": 22959 + }, + { + "epoch": 2.920747996438112, + "ewc_loss": 0.05168107524514198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5840538000920787e-05, + "grad_norm": 32.04906463623047, + "learning_rate": 1e-06, + "loss": 0.4808, + "mean_token_accuracy": 0.8560959100723267, + "num_tokens": 876050353.0, + "step": 22960 + }, + { + "epoch": 2.920875206716703, + "ewc_loss": 0.05167170241475105, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.583585046522785e-05, + "grad_norm": 32.067718505859375, + "learning_rate": 1e-06, + "loss": 0.4605, + "mean_token_accuracy": 0.858372688293457, + "num_tokens": 876089358.0, + "step": 22961 + }, + { + "epoch": 2.921002416995293, + "ewc_loss": 0.05165895074605942, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582947490736842e-05, + "grad_norm": 32.024940490722656, + "learning_rate": 1e-06, + "loss": 0.4584, + "mean_token_accuracy": 0.8587275743484497, + "num_tokens": 876126287.0, + "step": 22962 + }, + { + "epoch": 2.9211296272738836, + "ewc_loss": 0.051658112555742264, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.5829056539805606e-05, + "grad_norm": 32.027244567871094, + "learning_rate": 1e-06, + "loss": 0.4018, + "mean_token_accuracy": 0.8823777437210083, + "num_tokens": 876165157.0, + "step": 22963 + }, + { + "epoch": 2.921256837552474, + "ewc_loss": 0.051684167236089706, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.58420841419138e-05, + "grad_norm": 32.119014739990234, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8739458322525024, + "num_tokens": 876201391.0, + "step": 22964 + }, + { + "epoch": 2.9213840478310646, + "ewc_loss": 0.051671914756298065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.583595778560266e-05, + "grad_norm": 32.00270462036133, + "learning_rate": 1e-06, + "loss": 0.3981, + "mean_token_accuracy": 0.8797049522399902, + "num_tokens": 876243004.0, + "step": 22965 + }, + { + "epoch": 2.921511258109655, + "ewc_loss": 0.05161551758646965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5807757992879488e-05, + "grad_norm": 31.984798431396484, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.874708890914917, + "num_tokens": 876279245.0, + "step": 22966 + }, + { + "epoch": 2.9216384683882457, + "ewc_loss": 0.051783639937639236, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5891820769174956e-05, + "grad_norm": 32.10094451904297, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8715345859527588, + "num_tokens": 876312145.0, + "step": 22967 + }, + { + "epoch": 2.9217656786668362, + "ewc_loss": 0.0517432801425457, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5871640900732018e-05, + "grad_norm": 31.971912384033203, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8738963007926941, + "num_tokens": 876351354.0, + "step": 22968 + }, + { + "epoch": 2.9218928889454268, + "ewc_loss": 0.051685042679309845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5842520699370652e-05, + "grad_norm": 32.067115783691406, + "learning_rate": 1e-06, + 
"loss": 0.462, + "mean_token_accuracy": 0.8617828488349915, + "num_tokens": 876388621.0, + "step": 22969 + }, + { + "epoch": 2.9220200992240173, + "ewc_loss": 0.05182698369026184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59134922089288e-05, + "grad_norm": 32.02248001098633, + "learning_rate": 1e-06, + "loss": 0.4377, + "mean_token_accuracy": 0.8663617372512817, + "num_tokens": 876418377.0, + "step": 22970 + }, + { + "epoch": 2.922147309502608, + "ewc_loss": 0.05182429030537605, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5912144337780774e-05, + "grad_norm": 32.20485305786133, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8751755952835083, + "num_tokens": 876453351.0, + "step": 22971 + }, + { + "epoch": 2.9222745197811983, + "ewc_loss": 0.051886025816202164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5943012587958947e-05, + "grad_norm": 32.10525131225586, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8791663646697998, + "num_tokens": 876484830.0, + "step": 22972 + }, + { + "epoch": 2.922401730059789, + "ewc_loss": 0.05178244784474373, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5891224140650593e-05, + "grad_norm": 32.202903747558594, + "learning_rate": 1e-06, + "loss": 0.3847, + "mean_token_accuracy": 0.8828293085098267, + "num_tokens": 876524318.0, + "step": 22973 + }, + { + "epoch": 2.9225289403383794, + "ewc_loss": 0.05181397870182991, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5906989321811125e-05, + "grad_norm": 32.15102005004883, + "learning_rate": 1e-06, + "loss": 0.4349, + "mean_token_accuracy": 0.8697807788848877, + "num_tokens": 876562421.0, + "step": 22974 + }, + { + "epoch": 2.92265615061697, + "ewc_loss": 0.05178423225879669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.589211544545833e-05, + "grad_norm": 32.11774444580078, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8664422035217285, + "num_tokens": 876605681.0, + "step": 22975 + }, + { + 
"epoch": 2.9227833608955605, + "ewc_loss": 0.051827769726514816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5913885110639967e-05, + "grad_norm": 32.10134506225586, + "learning_rate": 1e-06, + "loss": 0.4475, + "mean_token_accuracy": 0.8631846308708191, + "num_tokens": 876647371.0, + "step": 22976 + }, + { + "epoch": 2.922910571174151, + "ewc_loss": 0.051765430718660355, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5882714908220805e-05, + "grad_norm": 32.10636901855469, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8777954578399658, + "num_tokens": 876685563.0, + "step": 22977 + }, + { + "epoch": 2.9230377814527415, + "ewc_loss": 0.05185120180249214, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5925601221388206e-05, + "grad_norm": 32.14738082885742, + "learning_rate": 1e-06, + "loss": 0.4328, + "mean_token_accuracy": 0.8684854507446289, + "num_tokens": 876718404.0, + "step": 22978 + }, + { + "epoch": 2.923164991731332, + "ewc_loss": 0.05179828405380249, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5899142201524228e-05, + "grad_norm": 32.05009460449219, + "learning_rate": 1e-06, + "loss": 0.4103, + "mean_token_accuracy": 0.8756265044212341, + "num_tokens": 876751888.0, + "step": 22979 + }, + { + "epoch": 2.9232922020099226, + "ewc_loss": 0.051823876798152924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5911938791978173e-05, + "grad_norm": 32.198211669921875, + "learning_rate": 1e-06, + "loss": 0.4025, + "mean_token_accuracy": 0.877820611000061, + "num_tokens": 876786813.0, + "step": 22980 + }, + { + "epoch": 2.9234194122885127, + "ewc_loss": 0.05175883695483208, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587941889942158e-05, + "grad_norm": 32.0067024230957, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.882278561592102, + "num_tokens": 876824511.0, + "step": 22981 + }, + { + "epoch": 2.9235466225671036, + "ewc_loss": 0.0517926849424839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.589634277683217e-05, + "grad_norm": 32.166587829589844, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.875618577003479, + "num_tokens": 876856409.0, + "step": 22982 + }, + { + "epoch": 2.9236738328456937, + "ewc_loss": 0.05193881317973137, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5969406124204397e-05, + "grad_norm": 32.22098159790039, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8588045835494995, + "num_tokens": 876889204.0, + "step": 22983 + }, + { + "epoch": 2.9238010431242847, + "ewc_loss": 0.051685966551303864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5842982722679153e-05, + "grad_norm": 31.92177963256836, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8735287189483643, + "num_tokens": 876927185.0, + "step": 22984 + }, + { + "epoch": 2.9239282534028748, + "ewc_loss": 0.05185673013329506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5928364266292192e-05, + "grad_norm": 32.35187911987305, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8697266578674316, + "num_tokens": 876965744.0, + "step": 22985 + }, + { + "epoch": 2.9240554636814657, + "ewc_loss": 0.05186324566602707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5931622076313943e-05, + "grad_norm": 31.966419219970703, + "learning_rate": 1e-06, + "loss": 0.3986, + "mean_token_accuracy": 0.8778261542320251, + "num_tokens": 876997107.0, + "step": 22986 + }, + { + "epoch": 2.924182673960056, + "ewc_loss": 0.05169958993792534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5849794837995432e-05, + "grad_norm": 32.157676696777344, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8628904819488525, + "num_tokens": 877037894.0, + "step": 22987 + }, + { + "epoch": 2.9243098842386464, + "ewc_loss": 0.05183158814907074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5915793230524287e-05, + "grad_norm": 32.1545524597168, + "learning_rate": 1e-06, + "loss": 0.4675, + 
"mean_token_accuracy": 0.8598174452781677, + "num_tokens": 877078724.0, + "step": 22988 + }, + { + "epoch": 2.924437094517237, + "ewc_loss": 0.051674481481313705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.583724017313216e-05, + "grad_norm": 32.10770034790039, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8733240365982056, + "num_tokens": 877119685.0, + "step": 22989 + }, + { + "epoch": 2.9245643047958274, + "ewc_loss": 0.051741041243076324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5870520403259434e-05, + "grad_norm": 32.03892135620117, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8755131363868713, + "num_tokens": 877157055.0, + "step": 22990 + }, + { + "epoch": 2.924691515074418, + "ewc_loss": 0.051742516458034515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5871258912957273e-05, + "grad_norm": 32.14908981323242, + "learning_rate": 1e-06, + "loss": 0.4038, + "mean_token_accuracy": 0.8774971961975098, + "num_tokens": 877188569.0, + "step": 22991 + }, + { + "epoch": 2.9248187253530085, + "ewc_loss": 0.05181329697370529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590664917079266e-05, + "grad_norm": 32.192535400390625, + "learning_rate": 1e-06, + "loss": 0.442, + "mean_token_accuracy": 0.8652151823043823, + "num_tokens": 877222745.0, + "step": 22992 + }, + { + "epoch": 2.924945935631599, + "ewc_loss": 0.051785703748464584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5892852136166766e-05, + "grad_norm": 32.10920715332031, + "learning_rate": 1e-06, + "loss": 0.4692, + "mean_token_accuracy": 0.8576546311378479, + "num_tokens": 877262510.0, + "step": 22993 + }, + { + "epoch": 2.9250731459101895, + "ewc_loss": 0.051757488399744034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5878744054352865e-05, + "grad_norm": 31.95893669128418, + "learning_rate": 1e-06, + "loss": 0.4382, + "mean_token_accuracy": 0.8656570911407471, + "num_tokens": 877300164.0, + "step": 22994 + }, + { + "epoch": 
2.92520035618878, + "ewc_loss": 0.05179675295948982, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5898376406985335e-05, + "grad_norm": 32.12295150756836, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8816583156585693, + "num_tokens": 877337187.0, + "step": 22995 + }, + { + "epoch": 2.9253275664673706, + "ewc_loss": 0.051770128309726715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5885063223540783e-05, + "grad_norm": 32.00720977783203, + "learning_rate": 1e-06, + "loss": 0.4742, + "mean_token_accuracy": 0.8541815280914307, + "num_tokens": 877375902.0, + "step": 22996 + }, + { + "epoch": 2.925454776745961, + "ewc_loss": 0.05183634161949158, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591817064967472e-05, + "grad_norm": 32.052616119384766, + "learning_rate": 1e-06, + "loss": 0.4208, + "mean_token_accuracy": 0.8711313009262085, + "num_tokens": 877414406.0, + "step": 22997 + }, + { + "epoch": 2.9255819870245516, + "ewc_loss": 0.05187911167740822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593955650809221e-05, + "grad_norm": 32.07200241088867, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8792327046394348, + "num_tokens": 877448977.0, + "step": 22998 + }, + { + "epoch": 2.925709197303142, + "ewc_loss": 0.051704391837120056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5852195904008113e-05, + "grad_norm": 32.038368225097656, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8797165155410767, + "num_tokens": 877486766.0, + "step": 22999 + }, + { + "epoch": 2.9258364075817327, + "ewc_loss": 0.05180107802152634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590053918538615e-05, + "grad_norm": 31.980411529541016, + "learning_rate": 1e-06, + "loss": 0.3771, + "mean_token_accuracy": 0.8866605758666992, + "num_tokens": 877519320.0, + "step": 23000 + }, + { + "epoch": 2.925963617860323, + "ewc_loss": 0.051825858652591705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5912930141203105e-05, + "grad_norm": 31.977901458740234, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8480437397956848, + "num_tokens": 877557913.0, + "step": 23001 + }, + { + "epoch": 2.9260908281389137, + "ewc_loss": 0.051944609731435776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.597230559331365e-05, + "grad_norm": 32.173370361328125, + "learning_rate": 1e-06, + "loss": 0.4148, + "mean_token_accuracy": 0.8727089762687683, + "num_tokens": 877591197.0, + "step": 23002 + }, + { + "epoch": 2.9262180384175043, + "ewc_loss": 0.051915910094976425, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5957955585909076e-05, + "grad_norm": 32.04176712036133, + "learning_rate": 1e-06, + "loss": 0.3657, + "mean_token_accuracy": 0.8893367052078247, + "num_tokens": 877625018.0, + "step": 23003 + }, + { + "epoch": 2.926345248696095, + "ewc_loss": 0.05181777477264404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5908886527759023e-05, + "grad_norm": 32.102237701416016, + "learning_rate": 1e-06, + "loss": 0.4115, + "mean_token_accuracy": 0.876865565776825, + "num_tokens": 877662433.0, + "step": 23004 + }, + { + "epoch": 2.9264724589746853, + "ewc_loss": 0.05189896747469902, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5949484552256763e-05, + "grad_norm": 32.03310775756836, + "learning_rate": 1e-06, + "loss": 0.3927, + "mean_token_accuracy": 0.8851698637008667, + "num_tokens": 877703576.0, + "step": 23005 + }, + { + "epoch": 2.9265996692532754, + "ewc_loss": 0.05180128291249275, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590064104879275e-05, + "grad_norm": 32.049461364746094, + "learning_rate": 1e-06, + "loss": 0.4269, + "mean_token_accuracy": 0.8688498139381409, + "num_tokens": 877734025.0, + "step": 23006 + }, + { + "epoch": 2.9267268795318664, + "ewc_loss": 0.05181778222322464, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5908891984727234e-05, + "grad_norm": 32.01295471191406, + "learning_rate": 1e-06, + "loss": 0.38, + 
"mean_token_accuracy": 0.8820314407348633, + "num_tokens": 877771942.0, + "step": 23007 + }, + { + "epoch": 2.9268540898104565, + "ewc_loss": 0.05192912369966507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5964562155422755e-05, + "grad_norm": 32.1704216003418, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8764032125473022, + "num_tokens": 877815662.0, + "step": 23008 + }, + { + "epoch": 2.9269813000890474, + "ewc_loss": 0.05196747928857803, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598373976070434e-05, + "grad_norm": 32.13166046142578, + "learning_rate": 1e-06, + "loss": 0.4668, + "mean_token_accuracy": 0.8581560254096985, + "num_tokens": 877855398.0, + "step": 23009 + }, + { + "epoch": 2.9271085103676375, + "ewc_loss": 0.0517985001206398, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5899249521899037e-05, + "grad_norm": 31.973236083984375, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8660687208175659, + "num_tokens": 877898544.0, + "step": 23010 + }, + { + "epoch": 2.927235720646228, + "ewc_loss": 0.051790155470371246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5895078579196706e-05, + "grad_norm": 31.965726852416992, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8717902898788452, + "num_tokens": 877937914.0, + "step": 23011 + }, + { + "epoch": 2.9273629309248186, + "ewc_loss": 0.051909640431404114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5954819648177363e-05, + "grad_norm": 32.092430114746094, + "learning_rate": 1e-06, + "loss": 0.4757, + "mean_token_accuracy": 0.8562827110290527, + "num_tokens": 877979266.0, + "step": 23012 + }, + { + "epoch": 2.927490141203409, + "ewc_loss": 0.05195676535367966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5978382836910896e-05, + "grad_norm": 31.9603271484375, + "learning_rate": 1e-06, + "loss": 0.4109, + "mean_token_accuracy": 0.877126157283783, + "num_tokens": 878019762.0, + "step": 23013 + }, + { + "epoch": 
2.9276173514819996, + "ewc_loss": 0.05183174088597298, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5915869628079236e-05, + "grad_norm": 32.18787384033203, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.867020308971405, + "num_tokens": 878062174.0, + "step": 23014 + }, + { + "epoch": 2.92774456176059, + "ewc_loss": 0.051815215498209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590760777820833e-05, + "grad_norm": 32.0478401184082, + "learning_rate": 1e-06, + "loss": 0.435, + "mean_token_accuracy": 0.8675644993782043, + "num_tokens": 878098197.0, + "step": 23015 + }, + { + "epoch": 2.9278717720391807, + "ewc_loss": 0.05173884704709053, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5869423552649096e-05, + "grad_norm": 32.14727020263672, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8781702518463135, + "num_tokens": 878130478.0, + "step": 23016 + }, + { + "epoch": 2.9279989823177712, + "ewc_loss": 0.051777470856904984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5888735763146542e-05, + "grad_norm": 32.03541564941406, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8559707403182983, + "num_tokens": 878175670.0, + "step": 23017 + }, + { + "epoch": 2.9281261925963618, + "ewc_loss": 0.05174791067838669, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587395465525333e-05, + "grad_norm": 32.12752151489258, + "learning_rate": 1e-06, + "loss": 0.4522, + "mean_token_accuracy": 0.8617558479309082, + "num_tokens": 878216415.0, + "step": 23018 + }, + { + "epoch": 2.9282534028749523, + "ewc_loss": 0.05172709748148918, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5863548216875643e-05, + "grad_norm": 31.972829818725586, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.87149977684021, + "num_tokens": 878249582.0, + "step": 23019 + }, + { + "epoch": 2.928380613153543, + "ewc_loss": 0.05174560099840164, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5872799596982077e-05, 
+ "grad_norm": 32.08414840698242, + "learning_rate": 1e-06, + "loss": 0.3806, + "mean_token_accuracy": 0.8829036355018616, + "num_tokens": 878290859.0, + "step": 23020 + }, + { + "epoch": 2.9285078234321333, + "ewc_loss": 0.05189737305045128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5948686015908606e-05, + "grad_norm": 32.283592224121094, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8888731002807617, + "num_tokens": 878322463.0, + "step": 23021 + }, + { + "epoch": 2.928635033710724, + "ewc_loss": 0.051820870488882065, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591043448774144e-05, + "grad_norm": 32.06410598754883, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8820817470550537, + "num_tokens": 878365754.0, + "step": 23022 + }, + { + "epoch": 2.9287622439893144, + "ewc_loss": 0.05174529179930687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587264680187218e-05, + "grad_norm": 32.19660186767578, + "learning_rate": 1e-06, + "loss": 0.3788, + "mean_token_accuracy": 0.8857918977737427, + "num_tokens": 878410325.0, + "step": 23023 + }, + { + "epoch": 2.928889454267905, + "ewc_loss": 0.05190795287489891, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5953975637094118e-05, + "grad_norm": 32.28147506713867, + "learning_rate": 1e-06, + "loss": 0.3971, + "mean_token_accuracy": 0.8778041005134583, + "num_tokens": 878451604.0, + "step": 23024 + }, + { + "epoch": 2.9290166645464955, + "ewc_loss": 0.05161568522453308, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5807843485381454e-05, + "grad_norm": 32.115203857421875, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8752357959747314, + "num_tokens": 878485174.0, + "step": 23025 + }, + { + "epoch": 2.929143874825086, + "ewc_loss": 0.051700741052627563, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5850371457636356e-05, + "grad_norm": 32.134498596191406, + "learning_rate": 1e-06, + "loss": 0.448, + "mean_token_accuracy": 
0.8643808364868164, + "num_tokens": 878525867.0, + "step": 23026 + }, + { + "epoch": 2.9292710851036765, + "ewc_loss": 0.05173153802752495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5865769202937372e-05, + "grad_norm": 32.13300704956055, + "learning_rate": 1e-06, + "loss": 0.4576, + "mean_token_accuracy": 0.8605563640594482, + "num_tokens": 878561327.0, + "step": 23027 + }, + { + "epoch": 2.929398295382267, + "ewc_loss": 0.05171215534210205, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585607762739528e-05, + "grad_norm": 32.04974365234375, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8784605264663696, + "num_tokens": 878603045.0, + "step": 23028 + }, + { + "epoch": 2.929525505660857, + "ewc_loss": 0.05165969580411911, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.582984780019615e-05, + "grad_norm": 32.029937744140625, + "learning_rate": 1e-06, + "loss": 0.3954, + "mean_token_accuracy": 0.8779230117797852, + "num_tokens": 878645051.0, + "step": 23029 + }, + { + "epoch": 2.929652715939448, + "ewc_loss": 0.05187768489122391, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5938841645256616e-05, + "grad_norm": 32.14112854003906, + "learning_rate": 1e-06, + "loss": 0.4789, + "mean_token_accuracy": 0.854941725730896, + "num_tokens": 878682003.0, + "step": 23030 + }, + { + "epoch": 2.929779926218038, + "ewc_loss": 0.051670514047145844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5835257474682294e-05, + "grad_norm": 32.058048248291016, + "learning_rate": 1e-06, + "loss": 0.4034, + "mean_token_accuracy": 0.8776364326477051, + "num_tokens": 878720509.0, + "step": 23031 + }, + { + "epoch": 2.929907136496629, + "ewc_loss": 0.05174804478883743, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587402195786126e-05, + "grad_norm": 32.09330749511719, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8780925273895264, + "num_tokens": 878757950.0, + "step": 23032 + }, + { + "epoch": 2.9300343467752192, + "ewc_loss": 
0.051703546196222305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585177389846649e-05, + "grad_norm": 32.07440948486328, + "learning_rate": 1e-06, + "loss": 0.4437, + "mean_token_accuracy": 0.8660991191864014, + "num_tokens": 878801477.0, + "step": 23033 + }, + { + "epoch": 2.93016155705381, + "ewc_loss": 0.05175209790468216, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587604831205681e-05, + "grad_norm": 32.016021728515625, + "learning_rate": 1e-06, + "loss": 0.445, + "mean_token_accuracy": 0.8655999898910522, + "num_tokens": 878838802.0, + "step": 23034 + }, + { + "epoch": 2.9302887673324003, + "ewc_loss": 0.05175085365772247, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5875426217680797e-05, + "grad_norm": 32.19886016845703, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8683180212974548, + "num_tokens": 878875566.0, + "step": 23035 + }, + { + "epoch": 2.930415977610991, + "ewc_loss": 0.05181146413087845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5905732400133274e-05, + "grad_norm": 32.118316650390625, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8641389608383179, + "num_tokens": 878913463.0, + "step": 23036 + }, + { + "epoch": 2.9305431878895813, + "ewc_loss": 0.051642753183841705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5821376766543835e-05, + "grad_norm": 32.09120178222656, + "learning_rate": 1e-06, + "loss": 0.4546, + "mean_token_accuracy": 0.8611737489700317, + "num_tokens": 878950769.0, + "step": 23037 + }, + { + "epoch": 2.930670398168172, + "ewc_loss": 0.051813311874866486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5906656446750276e-05, + "grad_norm": 32.223609924316406, + "learning_rate": 1e-06, + "loss": 0.3947, + "mean_token_accuracy": 0.8831353187561035, + "num_tokens": 878989041.0, + "step": 23038 + }, + { + "epoch": 2.9307976084467624, + "ewc_loss": 0.05167076736688614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.583538298495114e-05, + "grad_norm": 
32.049163818359375, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8795506954193115, + "num_tokens": 879020195.0, + "step": 23039 + }, + { + "epoch": 2.930924818725353, + "ewc_loss": 0.05176033079624176, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5880164685077034e-05, + "grad_norm": 32.09696578979492, + "learning_rate": 1e-06, + "loss": 0.4145, + "mean_token_accuracy": 0.8714084625244141, + "num_tokens": 879057818.0, + "step": 23040 + }, + { + "epoch": 2.9310520290039435, + "ewc_loss": 0.0517190620303154, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5859531888272613e-05, + "grad_norm": 32.17701721191406, + "learning_rate": 1e-06, + "loss": 0.4265, + "mean_token_accuracy": 0.868792712688446, + "num_tokens": 879094747.0, + "step": 23041 + }, + { + "epoch": 2.931179239282534, + "ewc_loss": 0.05174537003040314, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5872685000649653e-05, + "grad_norm": 32.07159423828125, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8636916875839233, + "num_tokens": 879131127.0, + "step": 23042 + }, + { + "epoch": 2.9313064495611245, + "ewc_loss": 0.05171743407845497, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5858716981019825e-05, + "grad_norm": 32.28587341308594, + "learning_rate": 1e-06, + "loss": 0.4073, + "mean_token_accuracy": 0.8726022839546204, + "num_tokens": 879168541.0, + "step": 23043 + }, + { + "epoch": 2.931433659839715, + "ewc_loss": 0.051875364035367966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5937681130017154e-05, + "grad_norm": 32.241966247558594, + "learning_rate": 1e-06, + "loss": 0.4341, + "mean_token_accuracy": 0.8687251806259155, + "num_tokens": 879199409.0, + "step": 23044 + }, + { + "epoch": 2.9315608701183056, + "ewc_loss": 0.05171910300850868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5859551897156052e-05, + "grad_norm": 32.1374626159668, + "learning_rate": 1e-06, + "loss": 0.3758, + "mean_token_accuracy": 0.8852635025978088, + 
"num_tokens": 879236447.0, + "step": 23045 + }, + { + "epoch": 2.931688080396896, + "ewc_loss": 0.051702775061130524, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585138827271294e-05, + "grad_norm": 32.071353912353516, + "learning_rate": 1e-06, + "loss": 0.4123, + "mean_token_accuracy": 0.8743414282798767, + "num_tokens": 879275596.0, + "step": 23046 + }, + { + "epoch": 2.9318152906754866, + "ewc_loss": 0.051845356822013855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5922678105416708e-05, + "grad_norm": 32.060081481933594, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.871065080165863, + "num_tokens": 879315620.0, + "step": 23047 + }, + { + "epoch": 2.931942500954077, + "ewc_loss": 0.051882341504096985, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5941169951693155e-05, + "grad_norm": 32.224266052246094, + "learning_rate": 1e-06, + "loss": 0.4082, + "mean_token_accuracy": 0.8748040199279785, + "num_tokens": 879356220.0, + "step": 23048 + }, + { + "epoch": 2.9320697112326677, + "ewc_loss": 0.05188130587339401, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5940653358702548e-05, + "grad_norm": 31.947139739990234, + "learning_rate": 1e-06, + "loss": 0.4704, + "mean_token_accuracy": 0.8584178686141968, + "num_tokens": 879397544.0, + "step": 23049 + }, + { + "epoch": 2.932196921511258, + "ewc_loss": 0.05190856382250786, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5954281227313913e-05, + "grad_norm": 32.07694625854492, + "learning_rate": 1e-06, + "loss": 0.3916, + "mean_token_accuracy": 0.8818874359130859, + "num_tokens": 879438165.0, + "step": 23050 + }, + { + "epoch": 2.9323241317898487, + "ewc_loss": 0.05195248872041702, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5976243705372326e-05, + "grad_norm": 32.1141242980957, + "learning_rate": 1e-06, + "loss": 0.4091, + "mean_token_accuracy": 0.8743137717247009, + "num_tokens": 879473601.0, + "step": 23051 + }, + { + "epoch": 2.9324513420684393, + "ewc_loss": 
0.05196039006114006, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5980194550356828e-05, + "grad_norm": 32.13814163208008, + "learning_rate": 1e-06, + "loss": 0.4517, + "mean_token_accuracy": 0.8649923801422119, + "num_tokens": 879510998.0, + "step": 23052 + }, + { + "epoch": 2.93257855234703, + "ewc_loss": 0.05194712430238724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59735625149915e-05, + "grad_norm": 32.10171127319336, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8782344460487366, + "num_tokens": 879551469.0, + "step": 23053 + }, + { + "epoch": 2.93270576262562, + "ewc_loss": 0.051933787763118744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59668940998381e-05, + "grad_norm": 32.051856994628906, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8647297620773315, + "num_tokens": 879585620.0, + "step": 23054 + }, + { + "epoch": 2.932832972904211, + "ewc_loss": 0.05193675309419632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596837657620199e-05, + "grad_norm": 32.115684509277344, + "learning_rate": 1e-06, + "loss": 0.3857, + "mean_token_accuracy": 0.8830053806304932, + "num_tokens": 879621749.0, + "step": 23055 + }, + { + "epoch": 2.932960183182801, + "ewc_loss": 0.051920898258686066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5960449420381337e-05, + "grad_norm": 32.07136154174805, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.876481294631958, + "num_tokens": 879664529.0, + "step": 23056 + }, + { + "epoch": 2.933087393461392, + "ewc_loss": 0.05191145837306976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5955729142879136e-05, + "grad_norm": 32.02719497680664, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8778151869773865, + "num_tokens": 879696293.0, + "step": 23057 + }, + { + "epoch": 2.933214603739982, + "ewc_loss": 0.051967207342386246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5983603336499073e-05, + "grad_norm": 32.1927604675293, 
+ "learning_rate": 1e-06, + "loss": 0.3958, + "mean_token_accuracy": 0.8776450157165527, + "num_tokens": 879733562.0, + "step": 23058 + }, + { + "epoch": 2.933341814018573, + "ewc_loss": 0.051960404962301254, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5980201826314442e-05, + "grad_norm": 32.0760498046875, + "learning_rate": 1e-06, + "loss": 0.4244, + "mean_token_accuracy": 0.8704351782798767, + "num_tokens": 879771740.0, + "step": 23059 + }, + { + "epoch": 2.933469024297163, + "ewc_loss": 0.05197754129767418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5988771085394546e-05, + "grad_norm": 32.155208587646484, + "learning_rate": 1e-06, + "loss": 0.4306, + "mean_token_accuracy": 0.8689733743667603, + "num_tokens": 879805578.0, + "step": 23060 + }, + { + "epoch": 2.9335962345757536, + "ewc_loss": 0.051980555057525635, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5990277208620682e-05, + "grad_norm": 32.109336853027344, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8697592616081238, + "num_tokens": 879845175.0, + "step": 23061 + }, + { + "epoch": 2.933723444854344, + "ewc_loss": 0.051976196467876434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5988098059315234e-05, + "grad_norm": 32.255733489990234, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8854184150695801, + "num_tokens": 879878975.0, + "step": 23062 + }, + { + "epoch": 2.9338506551329346, + "ewc_loss": 0.051873743534088135, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5936871679732576e-05, + "grad_norm": 32.13023376464844, + "learning_rate": 1e-06, + "loss": 0.4448, + "mean_token_accuracy": 0.8637615442276001, + "num_tokens": 879919373.0, + "step": 23063 + }, + { + "epoch": 2.933977865411525, + "ewc_loss": 0.0518684946000576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593424687802326e-05, + "grad_norm": 32.105770111083984, + "learning_rate": 1e-06, + "loss": 0.4684, + "mean_token_accuracy": 0.8515027165412903, + "num_tokens": 
879953821.0, + "step": 23064 + }, + { + "epoch": 2.9341050756901157, + "ewc_loss": 0.05197117477655411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598558785393834e-05, + "grad_norm": 32.15510940551758, + "learning_rate": 1e-06, + "loss": 0.4423, + "mean_token_accuracy": 0.8644660711288452, + "num_tokens": 879993505.0, + "step": 23065 + }, + { + "epoch": 2.934232285968706, + "ewc_loss": 0.05184340104460716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5921701308107004e-05, + "grad_norm": 32.101322174072266, + "learning_rate": 1e-06, + "loss": 0.3985, + "mean_token_accuracy": 0.8776489496231079, + "num_tokens": 880030124.0, + "step": 23066 + }, + { + "epoch": 2.9343594962472968, + "ewc_loss": 0.051957257091999054, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5978628400480375e-05, + "grad_norm": 32.28871154785156, + "learning_rate": 1e-06, + "loss": 0.4857, + "mean_token_accuracy": 0.8498567342758179, + "num_tokens": 880059480.0, + "step": 23067 + }, + { + "epoch": 2.9344867065258873, + "ewc_loss": 0.05190742015838623, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59537100646412e-05, + "grad_norm": 32.09067153930664, + "learning_rate": 1e-06, + "loss": 0.4088, + "mean_token_accuracy": 0.8732225894927979, + "num_tokens": 880093402.0, + "step": 23068 + }, + { + "epoch": 2.934613916804478, + "ewc_loss": 0.05183909088373184, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5919545805663802e-05, + "grad_norm": 32.13484573364258, + "learning_rate": 1e-06, + "loss": 0.4191, + "mean_token_accuracy": 0.8707377910614014, + "num_tokens": 880127874.0, + "step": 23069 + }, + { + "epoch": 2.9347411270830683, + "ewc_loss": 0.05196469649672508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5982348233810626e-05, + "grad_norm": 32.03739929199219, + "learning_rate": 1e-06, + "loss": 0.3993, + "mean_token_accuracy": 0.8814467191696167, + "num_tokens": 880165755.0, + "step": 23070 + }, + { + "epoch": 2.934868337361659, + "ewc_loss": 0.05182312801480293, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5911564080161043e-05, + "grad_norm": 32.11872482299805, + "learning_rate": 1e-06, + "loss": 0.4431, + "mean_token_accuracy": 0.8669202327728271, + "num_tokens": 880203931.0, + "step": 23071 + }, + { + "epoch": 2.9349955476402494, + "ewc_loss": 0.051984094083309174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5992047085310332e-05, + "grad_norm": 31.98870277404785, + "learning_rate": 1e-06, + "loss": 0.4097, + "mean_token_accuracy": 0.8753008842468262, + "num_tokens": 880242844.0, + "step": 23072 + }, + { + "epoch": 2.93512275791884, + "ewc_loss": 0.05186109617352486, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593054887256585e-05, + "grad_norm": 32.166404724121094, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.858243465423584, + "num_tokens": 880280068.0, + "step": 23073 + }, + { + "epoch": 2.9352499681974304, + "ewc_loss": 0.05200992524623871, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6004961910075508e-05, + "grad_norm": 32.077457427978516, + "learning_rate": 1e-06, + "loss": 0.3987, + "mean_token_accuracy": 0.8791594505310059, + "num_tokens": 880314044.0, + "step": 23074 + }, + { + "epoch": 2.935377178476021, + "ewc_loss": 0.05191778391599655, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5958892365451902e-05, + "grad_norm": 32.14933776855469, + "learning_rate": 1e-06, + "loss": 0.4308, + "mean_token_accuracy": 0.8690973520278931, + "num_tokens": 880355077.0, + "step": 23075 + }, + { + "epoch": 2.9355043887546115, + "ewc_loss": 0.051899224519729614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594961188151501e-05, + "grad_norm": 32.01755142211914, + "learning_rate": 1e-06, + "loss": 0.3693, + "mean_token_accuracy": 0.8870893716812134, + "num_tokens": 880388906.0, + "step": 23076 + }, + { + "epoch": 2.935631599033202, + "ewc_loss": 0.05195639282464981, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5978197299991734e-05, + "grad_norm": 32.2974967956543, + 
"learning_rate": 1e-06, + "loss": 0.3902, + "mean_token_accuracy": 0.8833082914352417, + "num_tokens": 880426834.0, + "step": 23077 + }, + { + "epoch": 2.9357588093117926, + "ewc_loss": 0.05195301026105881, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5976505639846437e-05, + "grad_norm": 32.08717346191406, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8707370758056641, + "num_tokens": 880463477.0, + "step": 23078 + }, + { + "epoch": 2.9358860195903826, + "ewc_loss": 0.05177723988890648, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5888619347824715e-05, + "grad_norm": 32.0876350402832, + "learning_rate": 1e-06, + "loss": 0.4662, + "mean_token_accuracy": 0.855689287185669, + "num_tokens": 880500884.0, + "step": 23079 + }, + { + "epoch": 2.9360132298689736, + "ewc_loss": 0.05189630389213562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5948151233023964e-05, + "grad_norm": 31.992149353027344, + "learning_rate": 1e-06, + "loss": 0.4293, + "mean_token_accuracy": 0.8670180439949036, + "num_tokens": 880539724.0, + "step": 23080 + }, + { + "epoch": 2.9361404401475637, + "ewc_loss": 0.05184014514088631, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592007331259083e-05, + "grad_norm": 31.95353889465332, + "learning_rate": 1e-06, + "loss": 0.4326, + "mean_token_accuracy": 0.8657219409942627, + "num_tokens": 880577820.0, + "step": 23081 + }, + { + "epoch": 2.9362676504261547, + "ewc_loss": 0.05200384929776192, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6001924197771586e-05, + "grad_norm": 32.122230529785156, + "learning_rate": 1e-06, + "loss": 0.407, + "mean_token_accuracy": 0.8762167692184448, + "num_tokens": 880612055.0, + "step": 23082 + }, + { + "epoch": 2.9363948607047448, + "ewc_loss": 0.051970258355140686, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5985129468608648e-05, + "grad_norm": 32.10688018798828, + "learning_rate": 1e-06, + "loss": 0.3932, + "mean_token_accuracy": 0.8795095086097717, + "num_tokens": 880654907.0, 
+ "step": 23083 + }, + { + "epoch": 2.9365220709833357, + "ewc_loss": 0.05196784436702728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5983921659644693e-05, + "grad_norm": 32.1327018737793, + "learning_rate": 1e-06, + "loss": 0.3625, + "mean_token_accuracy": 0.8887203931808472, + "num_tokens": 880692418.0, + "step": 23084 + }, + { + "epoch": 2.936649281261926, + "ewc_loss": 0.05190283805131912, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5951418137992732e-05, + "grad_norm": 32.1389274597168, + "learning_rate": 1e-06, + "loss": 0.4566, + "mean_token_accuracy": 0.8596841096878052, + "num_tokens": 880727742.0, + "step": 23085 + }, + { + "epoch": 2.9367764915405163, + "ewc_loss": 0.052034467458724976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6017234631581232e-05, + "grad_norm": 32.172035217285156, + "learning_rate": 1e-06, + "loss": 0.3844, + "mean_token_accuracy": 0.8844637870788574, + "num_tokens": 880766673.0, + "step": 23086 + }, + { + "epoch": 2.936903701819107, + "ewc_loss": 0.05188044533133507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5940222258213907e-05, + "grad_norm": 32.052894592285156, + "learning_rate": 1e-06, + "loss": 0.4442, + "mean_token_accuracy": 0.8653099536895752, + "num_tokens": 880810390.0, + "step": 23087 + }, + { + "epoch": 2.9370309120976974, + "ewc_loss": 0.051912080496549606, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5956040190067142e-05, + "grad_norm": 32.15140914916992, + "learning_rate": 1e-06, + "loss": 0.43, + "mean_token_accuracy": 0.8672131299972534, + "num_tokens": 880846920.0, + "step": 23088 + }, + { + "epoch": 2.937158122376288, + "ewc_loss": 0.05197388678789139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5986943001043983e-05, + "grad_norm": 32.10783767700195, + "learning_rate": 1e-06, + "loss": 0.3909, + "mean_token_accuracy": 0.8781921863555908, + "num_tokens": 880885867.0, + "step": 23089 + }, + { + "epoch": 2.9372853326548785, + "ewc_loss": 0.051752910017967224, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.5876455765683204e-05, + "grad_norm": 31.9217586517334, + "learning_rate": 1e-06, + "loss": 0.3941, + "mean_token_accuracy": 0.8833173513412476, + "num_tokens": 880928408.0, + "step": 23090 + }, + { + "epoch": 2.937412542933469, + "ewc_loss": 0.05191199108958244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5955994715332054e-05, + "grad_norm": 32.18757629394531, + "learning_rate": 1e-06, + "loss": 0.4166, + "mean_token_accuracy": 0.8720840811729431, + "num_tokens": 880960681.0, + "step": 23091 + }, + { + "epoch": 2.9375397532120595, + "ewc_loss": 0.05184886232018471, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5924431611201726e-05, + "grad_norm": 32.06869125366211, + "learning_rate": 1e-06, + "loss": 0.4463, + "mean_token_accuracy": 0.8616231679916382, + "num_tokens": 881000358.0, + "step": 23092 + }, + { + "epoch": 2.93766696349065, + "ewc_loss": 0.05187224969267845, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593612407508772e-05, + "grad_norm": 32.078548431396484, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.873924970626831, + "num_tokens": 881037765.0, + "step": 23093 + }, + { + "epoch": 2.9377941737692406, + "ewc_loss": 0.051897112280130386, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5948556867660955e-05, + "grad_norm": 32.13076400756836, + "learning_rate": 1e-06, + "loss": 0.3794, + "mean_token_accuracy": 0.8863763213157654, + "num_tokens": 881076091.0, + "step": 23094 + }, + { + "epoch": 2.937921384047831, + "ewc_loss": 0.05192965269088745, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596482590888627e-05, + "grad_norm": 32.118751525878906, + "learning_rate": 1e-06, + "loss": 0.4272, + "mean_token_accuracy": 0.8716516494750977, + "num_tokens": 881112784.0, + "step": 23095 + }, + { + "epoch": 2.9380485943264216, + "ewc_loss": 0.051813844591379166, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5906922019203193e-05, + "grad_norm": 32.06355285644531, + "learning_rate": 1e-06, + "loss": 
0.4017, + "mean_token_accuracy": 0.8789238929748535, + "num_tokens": 881149938.0, + "step": 23096 + }, + { + "epoch": 2.938175804605012, + "ewc_loss": 0.05179760977625847, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5898805688484572e-05, + "grad_norm": 31.976268768310547, + "learning_rate": 1e-06, + "loss": 0.4428, + "mean_token_accuracy": 0.8634432554244995, + "num_tokens": 881187541.0, + "step": 23097 + }, + { + "epoch": 2.9383030148836027, + "ewc_loss": 0.051841482520103455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592074088170193e-05, + "grad_norm": 32.00591278076172, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8732306957244873, + "num_tokens": 881221920.0, + "step": 23098 + }, + { + "epoch": 2.938430225162193, + "ewc_loss": 0.05189505219459534, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5947525500669144e-05, + "grad_norm": 32.03544998168945, + "learning_rate": 1e-06, + "loss": 0.4613, + "mean_token_accuracy": 0.8576282262802124, + "num_tokens": 881264804.0, + "step": 23099 + }, + { + "epoch": 2.9385574354407837, + "ewc_loss": 0.05188119411468506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5940596970031038e-05, + "grad_norm": 32.0457649230957, + "learning_rate": 1e-06, + "loss": 0.4374, + "mean_token_accuracy": 0.8642114996910095, + "num_tokens": 881304482.0, + "step": 23100 + }, + { + "epoch": 2.9386846457193743, + "ewc_loss": 0.05192263796925545, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5961318897316232e-05, + "grad_norm": 32.01142501831055, + "learning_rate": 1e-06, + "loss": 0.4534, + "mean_token_accuracy": 0.8623854517936707, + "num_tokens": 881340822.0, + "step": 23101 + }, + { + "epoch": 2.938811855997965, + "ewc_loss": 0.05193024501204491, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5965122404159047e-05, + "grad_norm": 32.21607208251953, + "learning_rate": 1e-06, + "loss": 0.4338, + "mean_token_accuracy": 0.8711462020874023, + "num_tokens": 881374352.0, + "step": 23102 + }, + { + "epoch": 
2.9389390662765553, + "ewc_loss": 0.0518864281475544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594321449578274e-05, + "grad_norm": 32.048919677734375, + "learning_rate": 1e-06, + "loss": 0.4311, + "mean_token_accuracy": 0.866145133972168, + "num_tokens": 881413176.0, + "step": 23103 + }, + { + "epoch": 2.9390662765551454, + "ewc_loss": 0.05187052860856056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593526369309984e-05, + "grad_norm": 32.134578704833984, + "learning_rate": 1e-06, + "loss": 0.4346, + "mean_token_accuracy": 0.8669334650039673, + "num_tokens": 881454309.0, + "step": 23104 + }, + { + "epoch": 2.9391934868337364, + "ewc_loss": 0.05194061994552612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.597031016193796e-05, + "grad_norm": 32.185245513916016, + "learning_rate": 1e-06, + "loss": 0.4134, + "mean_token_accuracy": 0.8737123608589172, + "num_tokens": 881490702.0, + "step": 23105 + }, + { + "epoch": 2.9393206971123265, + "ewc_loss": 0.05191295966506004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59564803855028e-05, + "grad_norm": 32.05080032348633, + "learning_rate": 1e-06, + "loss": 0.4222, + "mean_token_accuracy": 0.8784000873565674, + "num_tokens": 881530394.0, + "step": 23106 + }, + { + "epoch": 2.9394479073909174, + "ewc_loss": 0.051817964762449265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590898293419741e-05, + "grad_norm": 31.972185134887695, + "learning_rate": 1e-06, + "loss": 0.3728, + "mean_token_accuracy": 0.885529637336731, + "num_tokens": 881567286.0, + "step": 23107 + }, + { + "epoch": 2.9395751176695075, + "ewc_loss": 0.05201122537255287, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6005613108281977e-05, + "grad_norm": 32.244903564453125, + "learning_rate": 1e-06, + "loss": 0.4543, + "mean_token_accuracy": 0.8663866519927979, + "num_tokens": 881607017.0, + "step": 23108 + }, + { + "epoch": 2.939702327948098, + "ewc_loss": 0.05203218013048172, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.60160904872464e-05, + "grad_norm": 32.25001907348633, + "learning_rate": 1e-06, + "loss": 0.4108, + "mean_token_accuracy": 0.8747760057449341, + "num_tokens": 881645793.0, + "step": 23109 + }, + { + "epoch": 2.9398295382266886, + "ewc_loss": 0.05192960053682327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596480044303462e-05, + "grad_norm": 32.1329345703125, + "learning_rate": 1e-06, + "loss": 0.4155, + "mean_token_accuracy": 0.8754624128341675, + "num_tokens": 881683596.0, + "step": 23110 + }, + { + "epoch": 2.939956748505279, + "ewc_loss": 0.051848042756319046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5924020519596525e-05, + "grad_norm": 32.11881637573242, + "learning_rate": 1e-06, + "loss": 0.4702, + "mean_token_accuracy": 0.8569965362548828, + "num_tokens": 881717607.0, + "step": 23111 + }, + { + "epoch": 2.9400839587838696, + "ewc_loss": 0.051905252039432526, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5952625946956687e-05, + "grad_norm": 32.127498626708984, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8839418292045593, + "num_tokens": 881750884.0, + "step": 23112 + }, + { + "epoch": 2.94021116906246, + "ewc_loss": 0.05194627866148949, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5973138690460473e-05, + "grad_norm": 32.16101837158203, + "learning_rate": 1e-06, + "loss": 0.4373, + "mean_token_accuracy": 0.8659959435462952, + "num_tokens": 881789345.0, + "step": 23113 + }, + { + "epoch": 2.9403383793410507, + "ewc_loss": 0.05182882398366928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5914412617566995e-05, + "grad_norm": 32.116390228271484, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8681477308273315, + "num_tokens": 881828459.0, + "step": 23114 + }, + { + "epoch": 2.940465589619641, + "ewc_loss": 0.05189520865678787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5947603717213497e-05, + "grad_norm": 32.17585754394531, + "learning_rate": 1e-06, + "loss": 0.4431, + 
"mean_token_accuracy": 0.8658050298690796, + "num_tokens": 881865294.0, + "step": 23115 + }, + { + "epoch": 2.9405927998982317, + "ewc_loss": 0.05191711708903313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595855767140165e-05, + "grad_norm": 32.142269134521484, + "learning_rate": 1e-06, + "loss": 0.4267, + "mean_token_accuracy": 0.87505704164505, + "num_tokens": 881904936.0, + "step": 23116 + }, + { + "epoch": 2.9407200101768223, + "ewc_loss": 0.05192474648356438, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596237391117029e-05, + "grad_norm": 32.24057388305664, + "learning_rate": 1e-06, + "loss": 0.3832, + "mean_token_accuracy": 0.8824645280838013, + "num_tokens": 881936300.0, + "step": 23117 + }, + { + "epoch": 2.940847220455413, + "ewc_loss": 0.05189169943332672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594585021142848e-05, + "grad_norm": 32.24748229980469, + "learning_rate": 1e-06, + "loss": 0.3866, + "mean_token_accuracy": 0.8831915855407715, + "num_tokens": 881974092.0, + "step": 23118 + }, + { + "epoch": 2.9409744307340033, + "ewc_loss": 0.05180605128407478, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5903025743900798e-05, + "grad_norm": 32.180049896240234, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8728939294815063, + "num_tokens": 882019339.0, + "step": 23119 + }, + { + "epoch": 2.941101641012594, + "ewc_loss": 0.05175474286079407, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5877370717353188e-05, + "grad_norm": 32.10508346557617, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8752779364585876, + "num_tokens": 882052837.0, + "step": 23120 + }, + { + "epoch": 2.9412288512911844, + "ewc_loss": 0.05182354524731636, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591177326394245e-05, + "grad_norm": 32.19891357421875, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8769855499267578, + "num_tokens": 882088449.0, + "step": 23121 + }, + { + "epoch": 
2.941356061569775, + "ewc_loss": 0.051862768828868866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5931383788702078e-05, + "grad_norm": 32.20534133911133, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8629480600357056, + "num_tokens": 882124844.0, + "step": 23122 + }, + { + "epoch": 2.9414832718483654, + "ewc_loss": 0.05177539214491844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5887695301207714e-05, + "grad_norm": 32.19611358642578, + "learning_rate": 1e-06, + "loss": 0.4022, + "mean_token_accuracy": 0.8738589882850647, + "num_tokens": 882162246.0, + "step": 23123 + }, + { + "epoch": 2.941610482126956, + "ewc_loss": 0.05169021338224411, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5845107302302495e-05, + "grad_norm": 32.04643249511719, + "learning_rate": 1e-06, + "loss": 0.4974, + "mean_token_accuracy": 0.8610005378723145, + "num_tokens": 882202825.0, + "step": 23124 + }, + { + "epoch": 2.9417376924055465, + "ewc_loss": 0.05177924409508705, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.588962161098607e-05, + "grad_norm": 32.19272994995117, + "learning_rate": 1e-06, + "loss": 0.4565, + "mean_token_accuracy": 0.8582548499107361, + "num_tokens": 882237951.0, + "step": 23125 + }, + { + "epoch": 2.941864902684137, + "ewc_loss": 0.05182285234332085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5911425836966373e-05, + "grad_norm": 32.02389144897461, + "learning_rate": 1e-06, + "loss": 0.4234, + "mean_token_accuracy": 0.8710570931434631, + "num_tokens": 882279147.0, + "step": 23126 + }, + { + "epoch": 2.941992112962727, + "ewc_loss": 0.05173075571656227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586537812021561e-05, + "grad_norm": 32.20042037963867, + "learning_rate": 1e-06, + "loss": 0.3539, + "mean_token_accuracy": 0.8927885293960571, + "num_tokens": 882322777.0, + "step": 23127 + }, + { + "epoch": 2.942119323241318, + "ewc_loss": 0.051952045410871506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5976023607654497e-05, + "grad_norm": 32.071067810058594, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8762909173965454, + "num_tokens": 882360681.0, + "step": 23128 + }, + { + "epoch": 2.942246533519908, + "ewc_loss": 0.0517071895301342, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585359470685944e-05, + "grad_norm": 32.18550109863281, + "learning_rate": 1e-06, + "loss": 0.4755, + "mean_token_accuracy": 0.8520774245262146, + "num_tokens": 882403823.0, + "step": 23129 + }, + { + "epoch": 2.942373743798499, + "ewc_loss": 0.05183075740933418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5915378500940278e-05, + "grad_norm": 32.110931396484375, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8803562521934509, + "num_tokens": 882443723.0, + "step": 23130 + }, + { + "epoch": 2.9425009540770892, + "ewc_loss": 0.05170144513249397, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.585072252259124e-05, + "grad_norm": 32.02254104614258, + "learning_rate": 1e-06, + "loss": 0.4705, + "mean_token_accuracy": 0.8577971458435059, + "num_tokens": 882480610.0, + "step": 23131 + }, + { + "epoch": 2.94262816435568, + "ewc_loss": 0.051864977926015854, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5932489734259434e-05, + "grad_norm": 32.14441680908203, + "learning_rate": 1e-06, + "loss": 0.4126, + "mean_token_accuracy": 0.8751376867294312, + "num_tokens": 882518789.0, + "step": 23132 + }, + { + "epoch": 2.9427553746342703, + "ewc_loss": 0.051835767924785614, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591788324934896e-05, + "grad_norm": 32.229026794433594, + "learning_rate": 1e-06, + "loss": 0.4607, + "mean_token_accuracy": 0.8585749864578247, + "num_tokens": 882556598.0, + "step": 23133 + }, + { + "epoch": 2.942882584912861, + "ewc_loss": 0.051875099539756775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59375501627801e-05, + "grad_norm": 32.18352508544922, + "learning_rate": 1e-06, + "loss": 0.435, + 
"mean_token_accuracy": 0.8677658438682556, + "num_tokens": 882593788.0, + "step": 23134 + }, + { + "epoch": 2.9430097951914513, + "ewc_loss": 0.051762230694293976, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.588111601653509e-05, + "grad_norm": 32.10886001586914, + "learning_rate": 1e-06, + "loss": 0.4009, + "mean_token_accuracy": 0.8782370686531067, + "num_tokens": 882630861.0, + "step": 23135 + }, + { + "epoch": 2.943137005470042, + "ewc_loss": 0.051903028041124344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595151454443112e-05, + "grad_norm": 32.251346588134766, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8805486559867859, + "num_tokens": 882670950.0, + "step": 23136 + }, + { + "epoch": 2.9432642157486324, + "ewc_loss": 0.05177878215909004, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.588939059933182e-05, + "grad_norm": 32.2388916015625, + "learning_rate": 1e-06, + "loss": 0.451, + "mean_token_accuracy": 0.8630313873291016, + "num_tokens": 882705680.0, + "step": 23137 + }, + { + "epoch": 2.943391426027223, + "ewc_loss": 0.05171273648738861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5856368665699847e-05, + "grad_norm": 32.169437408447266, + "learning_rate": 1e-06, + "loss": 0.3914, + "mean_token_accuracy": 0.8826566338539124, + "num_tokens": 882737589.0, + "step": 23138 + }, + { + "epoch": 2.9435186363058135, + "ewc_loss": 0.051841363310813904, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5920680855051614e-05, + "grad_norm": 32.313472747802734, + "learning_rate": 1e-06, + "loss": 0.466, + "mean_token_accuracy": 0.8572981357574463, + "num_tokens": 882774502.0, + "step": 23139 + }, + { + "epoch": 2.943645846584404, + "ewc_loss": 0.051781002432107925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5890502001857385e-05, + "grad_norm": 32.188961029052734, + "learning_rate": 1e-06, + "loss": 0.4464, + "mean_token_accuracy": 0.8620486259460449, + "num_tokens": 882815184.0, + "step": 23140 + }, + { + "epoch": 
2.9437730568629945, + "ewc_loss": 0.05172022804617882, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5860113964881748e-05, + "grad_norm": 32.11545944213867, + "learning_rate": 1e-06, + "loss": 0.3814, + "mean_token_accuracy": 0.8846893310546875, + "num_tokens": 882856793.0, + "step": 23141 + }, + { + "epoch": 2.943900267141585, + "ewc_loss": 0.05184023827314377, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592011878732592e-05, + "grad_norm": 32.37245178222656, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8825864195823669, + "num_tokens": 882896128.0, + "step": 23142 + }, + { + "epoch": 2.9440274774201756, + "ewc_loss": 0.05169710889458656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5848554287222214e-05, + "grad_norm": 32.083580017089844, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8798856139183044, + "num_tokens": 882931253.0, + "step": 23143 + }, + { + "epoch": 2.944154687698766, + "ewc_loss": 0.05175233259797096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.587616654636804e-05, + "grad_norm": 32.285396575927734, + "learning_rate": 1e-06, + "loss": 0.3679, + "mean_token_accuracy": 0.8880738019943237, + "num_tokens": 882975072.0, + "step": 23144 + }, + { + "epoch": 2.9442818979773566, + "ewc_loss": 0.05182908475399017, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5914541765814647e-05, + "grad_norm": 32.1231575012207, + "learning_rate": 1e-06, + "loss": 0.3823, + "mean_token_accuracy": 0.8849624395370483, + "num_tokens": 883013576.0, + "step": 23145 + }, + { + "epoch": 2.944409108255947, + "ewc_loss": 0.051693323999643326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5846662538242526e-05, + "grad_norm": 32.17082977294922, + "learning_rate": 1e-06, + "loss": 0.4045, + "mean_token_accuracy": 0.8743225336074829, + "num_tokens": 883046052.0, + "step": 23146 + }, + { + "epoch": 2.9445363185345377, + "ewc_loss": 0.051762569695711136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5881285182549618e-05, + "grad_norm": 32.155147552490234, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8525575399398804, + "num_tokens": 883080398.0, + "step": 23147 + }, + { + "epoch": 2.944663528813128, + "ewc_loss": 0.05178695544600487, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5893477868521586e-05, + "grad_norm": 32.2401123046875, + "learning_rate": 1e-06, + "loss": 0.4414, + "mean_token_accuracy": 0.8633719682693481, + "num_tokens": 883120321.0, + "step": 23148 + }, + { + "epoch": 2.9447907390917187, + "ewc_loss": 0.051825735718011856, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5912868295563385e-05, + "grad_norm": 32.14997482299805, + "learning_rate": 1e-06, + "loss": 0.4002, + "mean_token_accuracy": 0.8791100382804871, + "num_tokens": 883161285.0, + "step": 23149 + }, + { + "epoch": 2.9449179493703093, + "ewc_loss": 0.05180969834327698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590484837128315e-05, + "grad_norm": 32.1702995300293, + "learning_rate": 1e-06, + "loss": 0.4207, + "mean_token_accuracy": 0.8719403147697449, + "num_tokens": 883199857.0, + "step": 23150 + }, + { + "epoch": 2.9450451596489, + "ewc_loss": 0.051790036261081696, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.589501855254639e-05, + "grad_norm": 32.172122955322266, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8790032863616943, + "num_tokens": 883238325.0, + "step": 23151 + }, + { + "epoch": 2.94517236992749, + "ewc_loss": 0.05182913318276405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5914567231666297e-05, + "grad_norm": 32.21537399291992, + "learning_rate": 1e-06, + "loss": 0.4407, + "mean_token_accuracy": 0.8641310930252075, + "num_tokens": 883281814.0, + "step": 23152 + }, + { + "epoch": 2.945299580206081, + "ewc_loss": 0.05169538035988808, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.584769026725553e-05, + "grad_norm": 32.06067657470703, + "learning_rate": 1e-06, + "loss": 0.3982, + 
"mean_token_accuracy": 0.8765419721603394, + "num_tokens": 883317385.0, + "step": 23153 + }, + { + "epoch": 2.945426790484671, + "ewc_loss": 0.05178464204072952, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.589232099126093e-05, + "grad_norm": 32.284141540527344, + "learning_rate": 1e-06, + "loss": 0.3911, + "mean_token_accuracy": 0.8802700042724609, + "num_tokens": 883353498.0, + "step": 23154 + }, + { + "epoch": 2.945554000763262, + "ewc_loss": 0.05185526981949806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592763485154137e-05, + "grad_norm": 32.0033073425293, + "learning_rate": 1e-06, + "loss": 0.4001, + "mean_token_accuracy": 0.8804614543914795, + "num_tokens": 883391709.0, + "step": 23155 + }, + { + "epoch": 2.945681211041852, + "ewc_loss": 0.051734842360019684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586742084531579e-05, + "grad_norm": 32.22167205810547, + "learning_rate": 1e-06, + "loss": 0.469, + "mean_token_accuracy": 0.8558364510536194, + "num_tokens": 883426329.0, + "step": 23156 + }, + { + "epoch": 2.945808421320443, + "ewc_loss": 0.051908962428569794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5954481316148303e-05, + "grad_norm": 32.135948181152344, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8702552318572998, + "num_tokens": 883465797.0, + "step": 23157 + }, + { + "epoch": 2.945935631599033, + "ewc_loss": 0.05178161337971687, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5890805773087777e-05, + "grad_norm": 32.1811637878418, + "learning_rate": 1e-06, + "loss": 0.411, + "mean_token_accuracy": 0.8740296959877014, + "num_tokens": 883501632.0, + "step": 23158 + }, + { + "epoch": 2.9460628418776236, + "ewc_loss": 0.05189836770296097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5949184419005178e-05, + "grad_norm": 32.043006896972656, + "learning_rate": 1e-06, + "loss": 0.387, + "mean_token_accuracy": 0.8820492029190063, + "num_tokens": 883537622.0, + "step": 23159 + }, + { + "epoch": 
2.946190052156214, + "ewc_loss": 0.051796603947877884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.589830182841979e-05, + "grad_norm": 32.146644592285156, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8737022876739502, + "num_tokens": 883576458.0, + "step": 23160 + }, + { + "epoch": 2.9463172624348046, + "ewc_loss": 0.05192432180047035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5962161089410074e-05, + "grad_norm": 32.175567626953125, + "learning_rate": 1e-06, + "loss": 0.4369, + "mean_token_accuracy": 0.8642274141311646, + "num_tokens": 883620929.0, + "step": 23161 + }, + { + "epoch": 2.946444472713395, + "ewc_loss": 0.051817335188388824, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5908668249030598e-05, + "grad_norm": 32.0009880065918, + "learning_rate": 1e-06, + "loss": 0.4994, + "mean_token_accuracy": 0.8485822081565857, + "num_tokens": 883652589.0, + "step": 23162 + }, + { + "epoch": 2.9465716829919857, + "ewc_loss": 0.05183206498622894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5916033337125555e-05, + "grad_norm": 32.14845657348633, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8732311129570007, + "num_tokens": 883691844.0, + "step": 23163 + }, + { + "epoch": 2.946698893270576, + "ewc_loss": 0.051941994577646255, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59709977399325e-05, + "grad_norm": 32.07450485229492, + "learning_rate": 1e-06, + "loss": 0.4812, + "mean_token_accuracy": 0.85268235206604, + "num_tokens": 883732838.0, + "step": 23164 + }, + { + "epoch": 2.9468261035491667, + "ewc_loss": 0.05181613564491272, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5908067982527427e-05, + "grad_norm": 32.1240119934082, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8699842691421509, + "num_tokens": 883771415.0, + "step": 23165 + }, + { + "epoch": 2.9469533138277573, + "ewc_loss": 0.05198357626795769, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.599178878881503e-05, + "grad_norm": 32.19289016723633, + "learning_rate": 1e-06, + "loss": 0.4657, + "mean_token_accuracy": 0.8578441143035889, + "num_tokens": 883808807.0, + "step": 23166 + }, + { + "epoch": 2.947080524106348, + "ewc_loss": 0.051794569939374924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5897285013343208e-05, + "grad_norm": 31.98910903930664, + "learning_rate": 1e-06, + "loss": 0.369, + "mean_token_accuracy": 0.886345624923706, + "num_tokens": 883844300.0, + "step": 23167 + }, + { + "epoch": 2.9472077343849383, + "ewc_loss": 0.051929112523794174, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5964556698454544e-05, + "grad_norm": 32.132896423339844, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8870124816894531, + "num_tokens": 883879337.0, + "step": 23168 + }, + { + "epoch": 2.947334944663529, + "ewc_loss": 0.05197902023792267, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5989509595092386e-05, + "grad_norm": 32.21357345581055, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8615893125534058, + "num_tokens": 883925810.0, + "step": 23169 + }, + { + "epoch": 2.9474621549421194, + "ewc_loss": 0.05187680572271347, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593840326881036e-05, + "grad_norm": 32.00450897216797, + "learning_rate": 1e-06, + "loss": 0.3995, + "mean_token_accuracy": 0.8815895318984985, + "num_tokens": 883964753.0, + "step": 23170 + }, + { + "epoch": 2.94758936522071, + "ewc_loss": 0.051965195685625076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5982597435358912e-05, + "grad_norm": 32.180946350097656, + "learning_rate": 1e-06, + "loss": 0.4502, + "mean_token_accuracy": 0.8647316098213196, + "num_tokens": 884005737.0, + "step": 23171 + }, + { + "epoch": 2.9477165754993004, + "ewc_loss": 0.05200605466961861, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6003026505350135e-05, + "grad_norm": 32.2230224609375, + "learning_rate": 1e-06, + "loss": 0.4366, + 
"mean_token_accuracy": 0.8731468319892883, + "num_tokens": 884043414.0, + "step": 23172 + }, + { + "epoch": 2.947843785777891, + "ewc_loss": 0.05199234187602997, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599617073428817e-05, + "grad_norm": 32.26138687133789, + "learning_rate": 1e-06, + "loss": 0.4042, + "mean_token_accuracy": 0.8779282569885254, + "num_tokens": 884077399.0, + "step": 23173 + }, + { + "epoch": 2.9479709960564815, + "ewc_loss": 0.05194087326526642, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5970437491196208e-05, + "grad_norm": 32.26272201538086, + "learning_rate": 1e-06, + "loss": 0.4243, + "mean_token_accuracy": 0.8708431124687195, + "num_tokens": 884116539.0, + "step": 23174 + }, + { + "epoch": 2.948098206335072, + "ewc_loss": 0.05194620415568352, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5973102310672402e-05, + "grad_norm": 32.189064025878906, + "learning_rate": 1e-06, + "loss": 0.3979, + "mean_token_accuracy": 0.8780543208122253, + "num_tokens": 884149756.0, + "step": 23175 + }, + { + "epoch": 2.9482254166136626, + "ewc_loss": 0.051910072565078735, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5955036107916385e-05, + "grad_norm": 32.33232879638672, + "learning_rate": 1e-06, + "loss": 0.3681, + "mean_token_accuracy": 0.8916653394699097, + "num_tokens": 884181960.0, + "step": 23176 + }, + { + "epoch": 2.9483526268922526, + "ewc_loss": 0.05181604623794556, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590802250779234e-05, + "grad_norm": 32.040550231933594, + "learning_rate": 1e-06, + "loss": 0.4297, + "mean_token_accuracy": 0.867779016494751, + "num_tokens": 884223562.0, + "step": 23177 + }, + { + "epoch": 2.9484798371708436, + "ewc_loss": 0.051841214299201965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592060627648607e-05, + "grad_norm": 32.30176544189453, + "learning_rate": 1e-06, + "loss": 0.3952, + "mean_token_accuracy": 0.8786308765411377, + "num_tokens": 884264541.0, + "step": 23178 + }, + { + "epoch": 
2.9486070474494337, + "ewc_loss": 0.051964592188596725, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5982295483117923e-05, + "grad_norm": 32.17918014526367, + "learning_rate": 1e-06, + "loss": 0.463, + "mean_token_accuracy": 0.8588002920150757, + "num_tokens": 884301171.0, + "step": 23179 + }, + { + "epoch": 2.9487342577280247, + "ewc_loss": 0.05178287252783775, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5891436962410808e-05, + "grad_norm": 32.26702117919922, + "learning_rate": 1e-06, + "loss": 0.4128, + "mean_token_accuracy": 0.8711392283439636, + "num_tokens": 884332481.0, + "step": 23180 + }, + { + "epoch": 2.9488614680066147, + "ewc_loss": 0.051936302334070206, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596815102151595e-05, + "grad_norm": 32.19093704223633, + "learning_rate": 1e-06, + "loss": 0.356, + "mean_token_accuracy": 0.8940412402153015, + "num_tokens": 884371616.0, + "step": 23181 + }, + { + "epoch": 2.9489886782852053, + "ewc_loss": 0.05179084837436676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.589542418718338e-05, + "grad_norm": 32.134281158447266, + "learning_rate": 1e-06, + "loss": 0.4493, + "mean_token_accuracy": 0.8616234064102173, + "num_tokens": 884411245.0, + "step": 23182 + }, + { + "epoch": 2.949115888563796, + "ewc_loss": 0.051930587738752365, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596529338916298e-05, + "grad_norm": 32.21510696411133, + "learning_rate": 1e-06, + "loss": 0.4316, + "mean_token_accuracy": 0.8698867559432983, + "num_tokens": 884445712.0, + "step": 23183 + }, + { + "epoch": 2.9492430988423863, + "ewc_loss": 0.05195628106594086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5978140911320224e-05, + "grad_norm": 32.195220947265625, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.8759769201278687, + "num_tokens": 884484935.0, + "step": 23184 + }, + { + "epoch": 2.949370309120977, + "ewc_loss": 0.05184481292963028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.592240707599558e-05, + "grad_norm": 32.169490814208984, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8728868961334229, + "num_tokens": 884518714.0, + "step": 23185 + }, + { + "epoch": 2.9494975193995674, + "ewc_loss": 0.05188560485839844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5942801585188136e-05, + "grad_norm": 32.266971588134766, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8643614053726196, + "num_tokens": 884560430.0, + "step": 23186 + }, + { + "epoch": 2.949624729678158, + "ewc_loss": 0.051858898252248764, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5929448383976705e-05, + "grad_norm": 32.108036041259766, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8653361797332764, + "num_tokens": 884595937.0, + "step": 23187 + }, + { + "epoch": 2.9497519399567484, + "ewc_loss": 0.05180349946022034, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5901750632328913e-05, + "grad_norm": 32.22833251953125, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8671653866767883, + "num_tokens": 884633201.0, + "step": 23188 + }, + { + "epoch": 2.949879150235339, + "ewc_loss": 0.05191439017653465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5957195248338394e-05, + "grad_norm": 32.2081298828125, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.873498797416687, + "num_tokens": 884672602.0, + "step": 23189 + }, + { + "epoch": 2.9500063605139295, + "ewc_loss": 0.05183742940425873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591871452750638e-05, + "grad_norm": 32.12759780883789, + "learning_rate": 1e-06, + "loss": 0.406, + "mean_token_accuracy": 0.8758440017700195, + "num_tokens": 884708378.0, + "step": 23190 + }, + { + "epoch": 2.95013357079252, + "ewc_loss": 0.05190236493945122, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595118166937027e-05, + "grad_norm": 32.14815139770508, + "learning_rate": 1e-06, + "loss": 0.4223, + 
"mean_token_accuracy": 0.8703754544258118, + "num_tokens": 884747527.0, + "step": 23191 + }, + { + "epoch": 2.9502607810711106, + "ewc_loss": 0.05184445157647133, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5922225177055225e-05, + "grad_norm": 32.14081954956055, + "learning_rate": 1e-06, + "loss": 0.3599, + "mean_token_accuracy": 0.8900308609008789, + "num_tokens": 884781738.0, + "step": 23192 + }, + { + "epoch": 2.950387991349701, + "ewc_loss": 0.05190419778227806, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595209844002966e-05, + "grad_norm": 32.19828414916992, + "learning_rate": 1e-06, + "loss": 0.4532, + "mean_token_accuracy": 0.8604282140731812, + "num_tokens": 884818884.0, + "step": 23193 + }, + { + "epoch": 2.9505152016282916, + "ewc_loss": 0.051874443888664246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.593722274468746e-05, + "grad_norm": 32.062923431396484, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8821895718574524, + "num_tokens": 884851751.0, + "step": 23194 + }, + { + "epoch": 2.950642411906882, + "ewc_loss": 0.05183270201086998, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5916351660271175e-05, + "grad_norm": 32.164676666259766, + "learning_rate": 1e-06, + "loss": 0.4066, + "mean_token_accuracy": 0.8768780827522278, + "num_tokens": 884892935.0, + "step": 23195 + }, + { + "epoch": 2.9507696221854727, + "ewc_loss": 0.051935307681560516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5967654437408783e-05, + "grad_norm": 31.987857818603516, + "learning_rate": 1e-06, + "loss": 0.4048, + "mean_token_accuracy": 0.8763041496276855, + "num_tokens": 884935820.0, + "step": 23196 + }, + { + "epoch": 2.950896832464063, + "ewc_loss": 0.051775287836790085, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5887644369504414e-05, + "grad_norm": 32.09782409667969, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8708833456039429, + "num_tokens": 884976042.0, + "step": 23197 + }, + { + "epoch": 
2.9510240427426537, + "ewc_loss": 0.05197620391845703, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598810169729404e-05, + "grad_norm": 32.0800895690918, + "learning_rate": 1e-06, + "loss": 0.3561, + "mean_token_accuracy": 0.8904126882553101, + "num_tokens": 885013511.0, + "step": 23198 + }, + { + "epoch": 2.9511512530212443, + "ewc_loss": 0.05189824104309082, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5949120754376054e-05, + "grad_norm": 32.15769958496094, + "learning_rate": 1e-06, + "loss": 0.4533, + "mean_token_accuracy": 0.8598229289054871, + "num_tokens": 885047291.0, + "step": 23199 + }, + { + "epoch": 2.951278463299835, + "ewc_loss": 0.051887474954128265, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594373836473096e-05, + "grad_norm": 32.11007308959961, + "learning_rate": 1e-06, + "loss": 0.433, + "mean_token_accuracy": 0.8691654205322266, + "num_tokens": 885083116.0, + "step": 23200 + }, + { + "epoch": 2.9514056735784253, + "ewc_loss": 0.0518655851483345, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5932793505489826e-05, + "grad_norm": 32.08578872680664, + "learning_rate": 1e-06, + "loss": 0.4285, + "mean_token_accuracy": 0.868057370185852, + "num_tokens": 885123282.0, + "step": 23201 + }, + { + "epoch": 2.9515328838570154, + "ewc_loss": 0.05198914185166359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599457002361305e-05, + "grad_norm": 32.1750602722168, + "learning_rate": 1e-06, + "loss": 0.4023, + "mean_token_accuracy": 0.8764477968215942, + "num_tokens": 885166277.0, + "step": 23202 + }, + { + "epoch": 2.9516600941356064, + "ewc_loss": 0.05192473530769348, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5962368454202078e-05, + "grad_norm": 32.127723693847656, + "learning_rate": 1e-06, + "loss": 0.376, + "mean_token_accuracy": 0.8842750191688538, + "num_tokens": 885206414.0, + "step": 23203 + }, + { + "epoch": 2.9517873044141965, + "ewc_loss": 0.05185650289058685, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5928251488949172e-05, + "grad_norm": 32.06399917602539, + "learning_rate": 1e-06, + "loss": 0.403, + "mean_token_accuracy": 0.8802359700202942, + "num_tokens": 885240977.0, + "step": 23204 + }, + { + "epoch": 2.9519145146927874, + "ewc_loss": 0.05195320397615433, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5976602046284825e-05, + "grad_norm": 32.297019958496094, + "learning_rate": 1e-06, + "loss": 0.3686, + "mean_token_accuracy": 0.8870747685432434, + "num_tokens": 885283190.0, + "step": 23205 + }, + { + "epoch": 2.9520417249713775, + "ewc_loss": 0.05186762288212776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5933812139555812e-05, + "grad_norm": 32.00138473510742, + "learning_rate": 1e-06, + "loss": 0.3801, + "mean_token_accuracy": 0.8864446878433228, + "num_tokens": 885321340.0, + "step": 23206 + }, + { + "epoch": 2.952168935249968, + "ewc_loss": 0.05185679346323013, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5928397008101456e-05, + "grad_norm": 32.34878921508789, + "learning_rate": 1e-06, + "loss": 0.3918, + "mean_token_accuracy": 0.8811694979667664, + "num_tokens": 885357524.0, + "step": 23207 + }, + { + "epoch": 2.9522961455285586, + "ewc_loss": 0.05192667990922928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5963339794543572e-05, + "grad_norm": 32.13035583496094, + "learning_rate": 1e-06, + "loss": 0.4862, + "mean_token_accuracy": 0.8486343622207642, + "num_tokens": 885389696.0, + "step": 23208 + }, + { + "epoch": 2.952423355807149, + "ewc_loss": 0.05164555460214615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5822777388384566e-05, + "grad_norm": 32.16957473754883, + "learning_rate": 1e-06, + "loss": 0.3699, + "mean_token_accuracy": 0.8882913589477539, + "num_tokens": 885429574.0, + "step": 23209 + }, + { + "epoch": 2.9525505660857396, + "ewc_loss": 0.05201556161046028, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6007781343651004e-05, + "grad_norm": 32.29826736450195, + "learning_rate": 1e-06, + "loss": 0.4043, + 
"mean_token_accuracy": 0.8765271902084351, + "num_tokens": 885459961.0, + "step": 23210 + }, + { + "epoch": 2.95267777636433, + "ewc_loss": 0.051846981048583984, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5923491193680093e-05, + "grad_norm": 32.23774337768555, + "learning_rate": 1e-06, + "loss": 0.4405, + "mean_token_accuracy": 0.8665059208869934, + "num_tokens": 885506725.0, + "step": 23211 + }, + { + "epoch": 2.9528049866429207, + "ewc_loss": 0.05192292109131813, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596146077848971e-05, + "grad_norm": 32.220733642578125, + "learning_rate": 1e-06, + "loss": 0.4086, + "mean_token_accuracy": 0.8772677183151245, + "num_tokens": 885549409.0, + "step": 23212 + }, + { + "epoch": 2.952932196921511, + "ewc_loss": 0.051845699548721313, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592284909042064e-05, + "grad_norm": 32.14076232910156, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8784559965133667, + "num_tokens": 885589272.0, + "step": 23213 + }, + { + "epoch": 2.9530594072001017, + "ewc_loss": 0.05180148407816887, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5900742912199348e-05, + "grad_norm": 32.1689338684082, + "learning_rate": 1e-06, + "loss": 0.4661, + "mean_token_accuracy": 0.8550374507904053, + "num_tokens": 885625951.0, + "step": 23214 + }, + { + "epoch": 2.9531866174786923, + "ewc_loss": 0.05196442827582359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5982213628594764e-05, + "grad_norm": 32.185447692871094, + "learning_rate": 1e-06, + "loss": 0.4094, + "mean_token_accuracy": 0.8752557635307312, + "num_tokens": 885665844.0, + "step": 23215 + }, + { + "epoch": 2.953313827757283, + "ewc_loss": 0.05188440904021263, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5942204956663772e-05, + "grad_norm": 32.19807052612305, + "learning_rate": 1e-06, + "loss": 0.4186, + "mean_token_accuracy": 0.8727385401725769, + "num_tokens": 885705616.0, + "step": 23216 + }, + { + "epoch": 
2.9534410380358733, + "ewc_loss": 0.05188833177089691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5944165827240795e-05, + "grad_norm": 32.18721008300781, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8844582438468933, + "num_tokens": 885752362.0, + "step": 23217 + }, + { + "epoch": 2.953568248314464, + "ewc_loss": 0.051805417984724045, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590270923974458e-05, + "grad_norm": 32.31224060058594, + "learning_rate": 1e-06, + "loss": 0.4354, + "mean_token_accuracy": 0.8695005774497986, + "num_tokens": 885792485.0, + "step": 23218 + }, + { + "epoch": 2.9536954585930544, + "ewc_loss": 0.05185569450259209, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5927847673301585e-05, + "grad_norm": 32.09969711303711, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8747777938842773, + "num_tokens": 885834809.0, + "step": 23219 + }, + { + "epoch": 2.953822668871645, + "ewc_loss": 0.05172542482614517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5862713300739415e-05, + "grad_norm": 32.17082214355469, + "learning_rate": 1e-06, + "loss": 0.3998, + "mean_token_accuracy": 0.8781369924545288, + "num_tokens": 885870555.0, + "step": 23220 + }, + { + "epoch": 2.9539498791502354, + "ewc_loss": 0.05183303728699684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59165190072963e-05, + "grad_norm": 32.11443328857422, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8748699426651001, + "num_tokens": 885911911.0, + "step": 23221 + }, + { + "epoch": 2.954077089428826, + "ewc_loss": 0.051741644740104675, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5870822355500422e-05, + "grad_norm": 32.2361946105957, + "learning_rate": 1e-06, + "loss": 0.3761, + "mean_token_accuracy": 0.8856166005134583, + "num_tokens": 885941094.0, + "step": 23222 + }, + { + "epoch": 2.9542042997074165, + "ewc_loss": 0.05182535573840141, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5912677301676013e-05, + "grad_norm": 32.02788543701172, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.871625542640686, + "num_tokens": 885978094.0, + "step": 23223 + }, + { + "epoch": 2.954331509986007, + "ewc_loss": 0.051699575036764145, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5849787562037818e-05, + "grad_norm": 32.020591735839844, + "learning_rate": 1e-06, + "loss": 0.3713, + "mean_token_accuracy": 0.8895654678344727, + "num_tokens": 886014859.0, + "step": 23224 + }, + { + "epoch": 2.954458720264597, + "ewc_loss": 0.051858969032764435, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5929484763764776e-05, + "grad_norm": 32.18600845336914, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8793911933898926, + "num_tokens": 886048317.0, + "step": 23225 + }, + { + "epoch": 2.954585930543188, + "ewc_loss": 0.05184575170278549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5922876375261694e-05, + "grad_norm": 32.20396423339844, + "learning_rate": 1e-06, + "loss": 0.4071, + "mean_token_accuracy": 0.8747702836990356, + "num_tokens": 886081797.0, + "step": 23226 + }, + { + "epoch": 2.954713140821778, + "ewc_loss": 0.05188410356640816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5942052161553875e-05, + "grad_norm": 32.012603759765625, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.8600799441337585, + "num_tokens": 886124111.0, + "step": 23227 + }, + { + "epoch": 2.954840351100369, + "ewc_loss": 0.051858555525541306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5929277398972772e-05, + "grad_norm": 32.063201904296875, + "learning_rate": 1e-06, + "loss": 0.3983, + "mean_token_accuracy": 0.8835936784744263, + "num_tokens": 886162823.0, + "step": 23228 + }, + { + "epoch": 2.954967561378959, + "ewc_loss": 0.05199691280722618, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5998457203968428e-05, + "grad_norm": 32.36687088012695, + "learning_rate": 1e-06, + "loss": 0.4123, + 
"mean_token_accuracy": 0.8755523562431335, + "num_tokens": 886204706.0, + "step": 23229 + }, + { + "epoch": 2.95509477165755, + "ewc_loss": 0.05197705328464508, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598852734081447e-05, + "grad_norm": 32.062862396240234, + "learning_rate": 1e-06, + "loss": 0.3938, + "mean_token_accuracy": 0.8813761472702026, + "num_tokens": 886241461.0, + "step": 23230 + }, + { + "epoch": 2.9552219819361403, + "ewc_loss": 0.051819827407598495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5909914256772026e-05, + "grad_norm": 32.202911376953125, + "learning_rate": 1e-06, + "loss": 0.4307, + "mean_token_accuracy": 0.8689427971839905, + "num_tokens": 886279276.0, + "step": 23231 + }, + { + "epoch": 2.955349192214731, + "ewc_loss": 0.05201869085431099, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6009345674538054e-05, + "grad_norm": 32.237728118896484, + "learning_rate": 1e-06, + "loss": 0.4078, + "mean_token_accuracy": 0.8752237558364868, + "num_tokens": 886317703.0, + "step": 23232 + }, + { + "epoch": 2.9554764024933213, + "ewc_loss": 0.051832422614097595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5916211598087102e-05, + "grad_norm": 32.12604522705078, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8795501589775085, + "num_tokens": 886355235.0, + "step": 23233 + }, + { + "epoch": 2.955603612771912, + "ewc_loss": 0.0519612617790699, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598063110781368e-05, + "grad_norm": 32.28154373168945, + "learning_rate": 1e-06, + "loss": 0.3431, + "mean_token_accuracy": 0.8971073031425476, + "num_tokens": 886387354.0, + "step": 23234 + }, + { + "epoch": 2.9557308230505024, + "ewc_loss": 0.05179180949926376, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5895904400385916e-05, + "grad_norm": 32.14155197143555, + "learning_rate": 1e-06, + "loss": 0.4201, + "mean_token_accuracy": 0.8713513612747192, + "num_tokens": 886419742.0, + "step": 23235 + }, + { + "epoch": 
2.955858033329093, + "ewc_loss": 0.05185465142130852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5927325623342767e-05, + "grad_norm": 32.2327995300293, + "learning_rate": 1e-06, + "loss": 0.4118, + "mean_token_accuracy": 0.8739075660705566, + "num_tokens": 886455218.0, + "step": 23236 + }, + { + "epoch": 2.9559852436076834, + "ewc_loss": 0.051916107535362244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5958053811336868e-05, + "grad_norm": 32.067710876464844, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8749809265136719, + "num_tokens": 886494454.0, + "step": 23237 + }, + { + "epoch": 2.956112453886274, + "ewc_loss": 0.051850058138370514, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592502823972609e-05, + "grad_norm": 32.210060119628906, + "learning_rate": 1e-06, + "loss": 0.4658, + "mean_token_accuracy": 0.8501526117324829, + "num_tokens": 886532854.0, + "step": 23238 + }, + { + "epoch": 2.9562396641648645, + "ewc_loss": 0.05184192582964897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5920962798409164e-05, + "grad_norm": 32.09354019165039, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8716039657592773, + "num_tokens": 886564288.0, + "step": 23239 + }, + { + "epoch": 2.956366874443455, + "ewc_loss": 0.05181972309947014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5909861506079324e-05, + "grad_norm": 32.20438766479492, + "learning_rate": 1e-06, + "loss": 0.4008, + "mean_token_accuracy": 0.8754564523696899, + "num_tokens": 886597155.0, + "step": 23240 + }, + { + "epoch": 2.9564940847220456, + "ewc_loss": 0.051878493279218674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5939247279893607e-05, + "grad_norm": 32.15428161621094, + "learning_rate": 1e-06, + "loss": 0.4016, + "mean_token_accuracy": 0.8759840726852417, + "num_tokens": 886632015.0, + "step": 23241 + }, + { + "epoch": 2.956621295000636, + "ewc_loss": 0.051813334226608276, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5906667360686697e-05, + "grad_norm": 32.098976135253906, + "learning_rate": 1e-06, + "loss": 0.3583, + "mean_token_accuracy": 0.8902014493942261, + "num_tokens": 886665736.0, + "step": 23242 + }, + { + "epoch": 2.9567485052792266, + "ewc_loss": 0.051872704178094864, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5936351448763162e-05, + "grad_norm": 32.18289566040039, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8735252618789673, + "num_tokens": 886702652.0, + "step": 23243 + }, + { + "epoch": 2.956875715557817, + "ewc_loss": 0.0520150400698185, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6007519409176894e-05, + "grad_norm": 32.249053955078125, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8702747225761414, + "num_tokens": 886743573.0, + "step": 23244 + }, + { + "epoch": 2.9570029258364077, + "ewc_loss": 0.05183608829975128, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5918043320416473e-05, + "grad_norm": 32.11848449707031, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8784933090209961, + "num_tokens": 886777932.0, + "step": 23245 + }, + { + "epoch": 2.957130136114998, + "ewc_loss": 0.05192546918988228, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596273407107219e-05, + "grad_norm": 32.09393310546875, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8626807928085327, + "num_tokens": 886813211.0, + "step": 23246 + }, + { + "epoch": 2.9572573463935887, + "ewc_loss": 0.05203902721405029, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6019513825303875e-05, + "grad_norm": 32.26075744628906, + "learning_rate": 1e-06, + "loss": 0.3978, + "mean_token_accuracy": 0.8784915208816528, + "num_tokens": 886849416.0, + "step": 23247 + }, + { + "epoch": 2.9573845566721793, + "ewc_loss": 0.05196184292435646, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5980922146118246e-05, + "grad_norm": 32.067420959472656, + "learning_rate": 1e-06, + "loss": 0.4663, + 
"mean_token_accuracy": 0.8587988615036011, + "num_tokens": 886887983.0, + "step": 23248 + }, + { + "epoch": 2.95751176695077, + "ewc_loss": 0.05198204144835472, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5991021175286733e-05, + "grad_norm": 32.24726867675781, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8574138879776001, + "num_tokens": 886930867.0, + "step": 23249 + }, + { + "epoch": 2.95763897722936, + "ewc_loss": 0.051892176270484924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5946088499040343e-05, + "grad_norm": 31.994712829589844, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8712503910064697, + "num_tokens": 886971862.0, + "step": 23250 + }, + { + "epoch": 2.957766187507951, + "ewc_loss": 0.05195578187704086, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.597789170977194e-05, + "grad_norm": 32.14022445678711, + "learning_rate": 1e-06, + "loss": 0.3893, + "mean_token_accuracy": 0.8797914981842041, + "num_tokens": 887005238.0, + "step": 23251 + }, + { + "epoch": 2.957893397786541, + "ewc_loss": 0.05204036459326744, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6020181394414976e-05, + "grad_norm": 32.24272918701172, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.8654458522796631, + "num_tokens": 887035588.0, + "step": 23252 + }, + { + "epoch": 2.958020608065132, + "ewc_loss": 0.05200054496526718, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6000272555393167e-05, + "grad_norm": 32.20658493041992, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8736917972564697, + "num_tokens": 887079962.0, + "step": 23253 + }, + { + "epoch": 2.958147818343722, + "ewc_loss": 0.05197349935770035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5986750188167207e-05, + "grad_norm": 32.16756057739258, + "learning_rate": 1e-06, + "loss": 0.3521, + "mean_token_accuracy": 0.8929766416549683, + "num_tokens": 887114572.0, + "step": 23254 + }, + { + "epoch": 
2.958275028622313, + "ewc_loss": 0.05193689465522766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596844751678873e-05, + "grad_norm": 32.18120193481445, + "learning_rate": 1e-06, + "loss": 0.4, + "mean_token_accuracy": 0.8779892325401306, + "num_tokens": 887149396.0, + "step": 23255 + }, + { + "epoch": 2.958402238900903, + "ewc_loss": 0.0519995354115963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599976687633898e-05, + "grad_norm": 32.28907775878906, + "learning_rate": 1e-06, + "loss": 0.4266, + "mean_token_accuracy": 0.8681302070617676, + "num_tokens": 887191916.0, + "step": 23256 + }, + { + "epoch": 2.9585294491794936, + "ewc_loss": 0.05190989375114441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595494697743561e-05, + "grad_norm": 32.0322151184082, + "learning_rate": 1e-06, + "loss": 0.3948, + "mean_token_accuracy": 0.88236004114151, + "num_tokens": 887228263.0, + "step": 23257 + }, + { + "epoch": 2.958656659458084, + "ewc_loss": 0.0519661009311676, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5983050363720395e-05, + "grad_norm": 32.38689041137695, + "learning_rate": 1e-06, + "loss": 0.3738, + "mean_token_accuracy": 0.8892644643783569, + "num_tokens": 887258688.0, + "step": 23258 + }, + { + "epoch": 2.9587838697366746, + "ewc_loss": 0.051943354308605194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5971676222980022e-05, + "grad_norm": 32.04961013793945, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8693407773971558, + "num_tokens": 887301565.0, + "step": 23259 + }, + { + "epoch": 2.958911080015265, + "ewc_loss": 0.051800437271595, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5900219043251127e-05, + "grad_norm": 32.210975646972656, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8706355690956116, + "num_tokens": 887337511.0, + "step": 23260 + }, + { + "epoch": 2.9590382902938557, + "ewc_loss": 0.05198180302977562, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59909011219861e-05, + 
"grad_norm": 32.18348693847656, + "learning_rate": 1e-06, + "loss": 0.3894, + "mean_token_accuracy": 0.8774675130844116, + "num_tokens": 887370694.0, + "step": 23261 + }, + { + "epoch": 2.959165500572446, + "ewc_loss": 0.05190582573413849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5952913347282447e-05, + "grad_norm": 32.21082305908203, + "learning_rate": 1e-06, + "loss": 0.4283, + "mean_token_accuracy": 0.8721647262573242, + "num_tokens": 887412131.0, + "step": 23262 + }, + { + "epoch": 2.9592927108510367, + "ewc_loss": 0.05193827673792839, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5969138732762076e-05, + "grad_norm": 32.1064338684082, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8685153722763062, + "num_tokens": 887447179.0, + "step": 23263 + }, + { + "epoch": 2.9594199211296273, + "ewc_loss": 0.05192699655890465, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596349804662168e-05, + "grad_norm": 32.23069381713867, + "learning_rate": 1e-06, + "loss": 0.4335, + "mean_token_accuracy": 0.8690340518951416, + "num_tokens": 887486079.0, + "step": 23264 + }, + { + "epoch": 2.959547131408218, + "ewc_loss": 0.051939696073532104, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596984813862946e-05, + "grad_norm": 32.14246368408203, + "learning_rate": 1e-06, + "loss": 0.3934, + "mean_token_accuracy": 0.8816421031951904, + "num_tokens": 887519092.0, + "step": 23265 + }, + { + "epoch": 2.9596743416868083, + "ewc_loss": 0.05180371552705765, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5901857952703722e-05, + "grad_norm": 32.15918731689453, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8750487565994263, + "num_tokens": 887552008.0, + "step": 23266 + }, + { + "epoch": 2.959801551965399, + "ewc_loss": 0.051850344985723495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592517193988897e-05, + "grad_norm": 32.05646896362305, + "learning_rate": 1e-06, + "loss": 0.4252, + "mean_token_accuracy": 0.8697707653045654, 
+ "num_tokens": 887585827.0, + "step": 23267 + }, + { + "epoch": 2.9599287622439894, + "ewc_loss": 0.051938317716121674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5969158741645515e-05, + "grad_norm": 32.095943450927734, + "learning_rate": 1e-06, + "loss": 0.4185, + "mean_token_accuracy": 0.8721867799758911, + "num_tokens": 887628041.0, + "step": 23268 + }, + { + "epoch": 2.96005597252258, + "ewc_loss": 0.05202188715338707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6010942747234367e-05, + "grad_norm": 32.15814971923828, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8777132034301758, + "num_tokens": 887667210.0, + "step": 23269 + }, + { + "epoch": 2.9601831828011704, + "ewc_loss": 0.05195257067680359, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5976285542128608e-05, + "grad_norm": 32.02721405029297, + "learning_rate": 1e-06, + "loss": 0.3889, + "mean_token_accuracy": 0.8800086975097656, + "num_tokens": 887705429.0, + "step": 23270 + }, + { + "epoch": 2.960310393079761, + "ewc_loss": 0.05205931514501572, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.602965832920745e-05, + "grad_norm": 32.324134826660156, + "learning_rate": 1e-06, + "loss": 0.4814, + "mean_token_accuracy": 0.8554122447967529, + "num_tokens": 887753188.0, + "step": 23271 + }, + { + "epoch": 2.9604376033583515, + "ewc_loss": 0.0520772822201252, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.603864049888216e-05, + "grad_norm": 32.18625259399414, + "learning_rate": 1e-06, + "loss": 0.4594, + "mean_token_accuracy": 0.8578965067863464, + "num_tokens": 887795907.0, + "step": 23272 + }, + { + "epoch": 2.960564813636942, + "ewc_loss": 0.051924556493759155, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59622775047319e-05, + "grad_norm": 32.22014236450195, + "learning_rate": 1e-06, + "loss": 0.4241, + "mean_token_accuracy": 0.8730769753456116, + "num_tokens": 887834859.0, + "step": 23273 + }, + { + "epoch": 2.9606920239155325, + "ewc_loss": 0.05201521888375282, 
+ "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6007608539657667e-05, + "grad_norm": 32.32304763793945, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8687297701835632, + "num_tokens": 887879041.0, + "step": 23274 + }, + { + "epoch": 2.9608192341941226, + "ewc_loss": 0.05180961638689041, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5904808353516273e-05, + "grad_norm": 32.1821403503418, + "learning_rate": 1e-06, + "loss": 0.4261, + "mean_token_accuracy": 0.8698620200157166, + "num_tokens": 887913752.0, + "step": 23275 + }, + { + "epoch": 2.9609464444727136, + "ewc_loss": 0.05188548192381859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594274155853782e-05, + "grad_norm": 32.11967468261719, + "learning_rate": 1e-06, + "loss": 0.394, + "mean_token_accuracy": 0.8795689344406128, + "num_tokens": 887950946.0, + "step": 23276 + }, + { + "epoch": 2.9610736547513037, + "ewc_loss": 0.05193600058555603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5968000045395456e-05, + "grad_norm": 32.322967529296875, + "learning_rate": 1e-06, + "loss": 0.4206, + "mean_token_accuracy": 0.8709803819656372, + "num_tokens": 887993155.0, + "step": 23277 + }, + { + "epoch": 2.9612008650298947, + "ewc_loss": 0.05190063640475273, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5950317649403587e-05, + "grad_norm": 32.24337387084961, + "learning_rate": 1e-06, + "loss": 0.4305, + "mean_token_accuracy": 0.8727017641067505, + "num_tokens": 888027831.0, + "step": 23278 + }, + { + "epoch": 2.9613280753084847, + "ewc_loss": 0.051908351480960846, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5954175725928508e-05, + "grad_norm": 32.354576110839844, + "learning_rate": 1e-06, + "loss": 0.3825, + "mean_token_accuracy": 0.8839737772941589, + "num_tokens": 888066941.0, + "step": 23279 + }, + { + "epoch": 2.9614552855870753, + "ewc_loss": 0.05184600129723549, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5923000066541135e-05, + "grad_norm": 32.16050338745117, + 
"learning_rate": 1e-06, + "loss": 0.3678, + "mean_token_accuracy": 0.887493908405304, + "num_tokens": 888099739.0, + "step": 23280 + }, + { + "epoch": 2.961582495865666, + "ewc_loss": 0.05179091915488243, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5895458747982047e-05, + "grad_norm": 32.30510330200195, + "learning_rate": 1e-06, + "loss": 0.4081, + "mean_token_accuracy": 0.8744954466819763, + "num_tokens": 888139675.0, + "step": 23281 + }, + { + "epoch": 2.9617097061442563, + "ewc_loss": 0.05184031277894974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5920156986103393e-05, + "grad_norm": 32.1772346496582, + "learning_rate": 1e-06, + "loss": 0.4408, + "mean_token_accuracy": 0.8601144552230835, + "num_tokens": 888179374.0, + "step": 23282 + }, + { + "epoch": 2.961836916422847, + "ewc_loss": 0.05176267772912979, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.588133793324232e-05, + "grad_norm": 32.215431213378906, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8599292039871216, + "num_tokens": 888214139.0, + "step": 23283 + }, + { + "epoch": 2.9619641267014374, + "ewc_loss": 0.05188107118010521, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5940535124391317e-05, + "grad_norm": 32.316715240478516, + "learning_rate": 1e-06, + "loss": 0.4171, + "mean_token_accuracy": 0.8716455698013306, + "num_tokens": 888246365.0, + "step": 23284 + }, + { + "epoch": 2.962091336980028, + "ewc_loss": 0.05179542303085327, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5897710656863637e-05, + "grad_norm": 32.0805778503418, + "learning_rate": 1e-06, + "loss": 0.439, + "mean_token_accuracy": 0.8676789402961731, + "num_tokens": 888290903.0, + "step": 23285 + }, + { + "epoch": 2.9622185472586184, + "ewc_loss": 0.051818620413541794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590931035229005e-05, + "grad_norm": 32.12910079956055, + "learning_rate": 1e-06, + "loss": 0.4273, + "mean_token_accuracy": 0.8706836700439453, + "num_tokens": 888331329.0, + 
"step": 23286 + }, + { + "epoch": 2.962345757537209, + "ewc_loss": 0.051856428384780884, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59282151091611e-05, + "grad_norm": 32.177589416503906, + "learning_rate": 1e-06, + "loss": 0.4319, + "mean_token_accuracy": 0.8668769001960754, + "num_tokens": 888372411.0, + "step": 23287 + }, + { + "epoch": 2.9624729678157995, + "ewc_loss": 0.051858652383089066, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5929326511686668e-05, + "grad_norm": 32.166107177734375, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.8730415105819702, + "num_tokens": 888409778.0, + "step": 23288 + }, + { + "epoch": 2.96260017809439, + "ewc_loss": 0.051722172647714615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5861087124212645e-05, + "grad_norm": 32.15678787231445, + "learning_rate": 1e-06, + "loss": 0.3964, + "mean_token_accuracy": 0.8796087503433228, + "num_tokens": 888453593.0, + "step": 23289 + }, + { + "epoch": 2.9627273883729806, + "ewc_loss": 0.05197171866893768, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598585888335947e-05, + "grad_norm": 32.27245330810547, + "learning_rate": 1e-06, + "loss": 0.3751, + "mean_token_accuracy": 0.8850700855255127, + "num_tokens": 888488245.0, + "step": 23290 + }, + { + "epoch": 2.962854598651571, + "ewc_loss": 0.051829420030117035, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5914709112839773e-05, + "grad_norm": 32.25381851196289, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.871627926826477, + "num_tokens": 888524306.0, + "step": 23291 + }, + { + "epoch": 2.9629818089301616, + "ewc_loss": 0.051850300282239914, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5925150112016127e-05, + "grad_norm": 32.12353515625, + "learning_rate": 1e-06, + "loss": 0.3815, + "mean_token_accuracy": 0.8855277299880981, + "num_tokens": 888563791.0, + "step": 23292 + }, + { + "epoch": 2.963109019208752, + "ewc_loss": 0.05176866054534912, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.5884330170811154e-05, + "grad_norm": 32.18626403808594, + "learning_rate": 1e-06, + "loss": 0.4036, + "mean_token_accuracy": 0.877846896648407, + "num_tokens": 888605021.0, + "step": 23293 + }, + { + "epoch": 2.9632362294873427, + "ewc_loss": 0.05187351256608963, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5936757083400153e-05, + "grad_norm": 32.127586364746094, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8745798468589783, + "num_tokens": 888647382.0, + "step": 23294 + }, + { + "epoch": 2.963363439765933, + "ewc_loss": 0.051839184015989304, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591959128039889e-05, + "grad_norm": 32.23218536376953, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8733059167861938, + "num_tokens": 888682174.0, + "step": 23295 + }, + { + "epoch": 2.9634906500445237, + "ewc_loss": 0.05186760053038597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5933799406629987e-05, + "grad_norm": 32.01713562011719, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8763651251792908, + "num_tokens": 888724401.0, + "step": 23296 + }, + { + "epoch": 2.9636178603231143, + "ewc_loss": 0.05186071619391441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5930357878678478e-05, + "grad_norm": 32.21366882324219, + "learning_rate": 1e-06, + "loss": 0.397, + "mean_token_accuracy": 0.8794636130332947, + "num_tokens": 888766230.0, + "step": 23297 + }, + { + "epoch": 2.963745070601705, + "ewc_loss": 0.05197151005268097, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5985755200963467e-05, + "grad_norm": 32.093719482421875, + "learning_rate": 1e-06, + "loss": 0.4798, + "mean_token_accuracy": 0.8517965078353882, + "num_tokens": 888811865.0, + "step": 23298 + }, + { + "epoch": 2.9638722808802953, + "ewc_loss": 0.051826026290655136, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.591301381471567e-05, + "grad_norm": 32.12251281738281, + "learning_rate": 1e-06, + "loss": 
0.4162, + "mean_token_accuracy": 0.8738993406295776, + "num_tokens": 888849549.0, + "step": 23299 + }, + { + "epoch": 2.9639994911588854, + "ewc_loss": 0.051920317113399506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596015838207677e-05, + "grad_norm": 32.26722717285156, + "learning_rate": 1e-06, + "loss": 0.4031, + "mean_token_accuracy": 0.8751437664031982, + "num_tokens": 888881305.0, + "step": 23300 + }, + { + "epoch": 2.9641267014374764, + "ewc_loss": 0.05192964896559715, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5964824089896865e-05, + "grad_norm": 32.13441848754883, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.877805769443512, + "num_tokens": 888917862.0, + "step": 23301 + }, + { + "epoch": 2.9642539117160664, + "ewc_loss": 0.051855962723493576, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5927982278517447e-05, + "grad_norm": 32.28627395629883, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8736371994018555, + "num_tokens": 888953883.0, + "step": 23302 + }, + { + "epoch": 2.9643811219946574, + "ewc_loss": 0.051923952996730804, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5961977371480316e-05, + "grad_norm": 32.280975341796875, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8904094696044922, + "num_tokens": 888991052.0, + "step": 23303 + }, + { + "epoch": 2.9645083322732475, + "ewc_loss": 0.051733434200286865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.586671689641662e-05, + "grad_norm": 32.11418533325195, + "learning_rate": 1e-06, + "loss": 0.3827, + "mean_token_accuracy": 0.8859456777572632, + "num_tokens": 889022523.0, + "step": 23304 + }, + { + "epoch": 2.964635542551838, + "ewc_loss": 0.051855847239494324, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5927924070856534e-05, + "grad_norm": 32.25907516479492, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8650733232498169, + "num_tokens": 889063470.0, + "step": 23305 + }, + { + 
"epoch": 2.9647627528304286, + "ewc_loss": 0.0517984963953495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5899247702909634e-05, + "grad_norm": 32.038665771484375, + "learning_rate": 1e-06, + "loss": 0.3882, + "mean_token_accuracy": 0.882132887840271, + "num_tokens": 889101160.0, + "step": 23306 + }, + { + "epoch": 2.964889963109019, + "ewc_loss": 0.051805272698402405, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.590263648016844e-05, + "grad_norm": 32.13906478881836, + "learning_rate": 1e-06, + "loss": 0.422, + "mean_token_accuracy": 0.8710237741470337, + "num_tokens": 889139128.0, + "step": 23307 + }, + { + "epoch": 2.9650171733876096, + "ewc_loss": 0.05197641998529434, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5988210836658254e-05, + "grad_norm": 32.18621063232422, + "learning_rate": 1e-06, + "loss": 0.4799, + "mean_token_accuracy": 0.8507605195045471, + "num_tokens": 889175574.0, + "step": 23308 + }, + { + "epoch": 2.9651443836662, + "ewc_loss": 0.05192521587014198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5962608560803346e-05, + "grad_norm": 32.349979400634766, + "learning_rate": 1e-06, + "loss": 0.4289, + "mean_token_accuracy": 0.8703292012214661, + "num_tokens": 889216520.0, + "step": 23309 + }, + { + "epoch": 2.9652715939447907, + "ewc_loss": 0.051914140582084656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595706973806955e-05, + "grad_norm": 32.09635543823242, + "learning_rate": 1e-06, + "loss": 0.3838, + "mean_token_accuracy": 0.8845807313919067, + "num_tokens": 889256746.0, + "step": 23310 + }, + { + "epoch": 2.965398804223381, + "ewc_loss": 0.05175754427909851, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5878771339193918e-05, + "grad_norm": 32.289188385009766, + "learning_rate": 1e-06, + "loss": 0.4348, + "mean_token_accuracy": 0.8700242042541504, + "num_tokens": 889298868.0, + "step": 23311 + }, + { + "epoch": 2.9655260145019717, + "ewc_loss": 0.05197887122631073, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.598943501652684e-05, + "grad_norm": 32.21934127807617, + "learning_rate": 1e-06, + "loss": 0.4286, + "mean_token_accuracy": 0.8683415651321411, + "num_tokens": 889337758.0, + "step": 23312 + }, + { + "epoch": 2.9656532247805623, + "ewc_loss": 0.051799431443214417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5899715183186345e-05, + "grad_norm": 32.196990966796875, + "learning_rate": 1e-06, + "loss": 0.4074, + "mean_token_accuracy": 0.8787263631820679, + "num_tokens": 889375308.0, + "step": 23313 + }, + { + "epoch": 2.965780435059153, + "ewc_loss": 0.05190957337617874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59547869063681e-05, + "grad_norm": 32.21577072143555, + "learning_rate": 1e-06, + "loss": 0.3869, + "mean_token_accuracy": 0.8814108371734619, + "num_tokens": 889413735.0, + "step": 23314 + }, + { + "epoch": 2.9659076453377433, + "ewc_loss": 0.05191444233059883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5957220714190044e-05, + "grad_norm": 32.2285270690918, + "learning_rate": 1e-06, + "loss": 0.4096, + "mean_token_accuracy": 0.8766372203826904, + "num_tokens": 889451585.0, + "step": 23315 + }, + { + "epoch": 2.966034855616334, + "ewc_loss": 0.051830366253852844, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59151838690741e-05, + "grad_norm": 32.252655029296875, + "learning_rate": 1e-06, + "loss": 0.4708, + "mean_token_accuracy": 0.857336163520813, + "num_tokens": 889487405.0, + "step": 23316 + }, + { + "epoch": 2.9661620658949244, + "ewc_loss": 0.05188695713877678, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5943478249246255e-05, + "grad_norm": 32.25832748413086, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8624318838119507, + "num_tokens": 889523051.0, + "step": 23317 + }, + { + "epoch": 2.966289276173515, + "ewc_loss": 0.051791734993457794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5895868020597845e-05, + "grad_norm": 32.178157806396484, + "learning_rate": 1e-06, + "loss": 0.4613, + 
"mean_token_accuracy": 0.8547366857528687, + "num_tokens": 889559822.0, + "step": 23318 + }, + { + "epoch": 2.9664164864521054, + "ewc_loss": 0.05176914855837822, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.588457391539123e-05, + "grad_norm": 32.168426513671875, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8732544779777527, + "num_tokens": 889591598.0, + "step": 23319 + }, + { + "epoch": 2.966543696730696, + "ewc_loss": 0.05181293189525604, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5906465452862903e-05, + "grad_norm": 32.21891784667969, + "learning_rate": 1e-06, + "loss": 0.381, + "mean_token_accuracy": 0.8834158182144165, + "num_tokens": 889629787.0, + "step": 23320 + }, + { + "epoch": 2.9666709070092865, + "ewc_loss": 0.05193765461444855, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596882768557407e-05, + "grad_norm": 32.20945739746094, + "learning_rate": 1e-06, + "loss": 0.3949, + "mean_token_accuracy": 0.8800470232963562, + "num_tokens": 889668730.0, + "step": 23321 + }, + { + "epoch": 2.966798117287877, + "ewc_loss": 0.051908042281866074, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5954021111829206e-05, + "grad_norm": 32.19081497192383, + "learning_rate": 1e-06, + "loss": 0.4216, + "mean_token_accuracy": 0.8723634481430054, + "num_tokens": 889705215.0, + "step": 23322 + }, + { + "epoch": 2.966925327566467, + "ewc_loss": 0.05189260467886925, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594630313978996e-05, + "grad_norm": 32.13037872314453, + "learning_rate": 1e-06, + "loss": 0.4158, + "mean_token_accuracy": 0.8753647208213806, + "num_tokens": 889738528.0, + "step": 23323 + }, + { + "epoch": 2.967052537845058, + "ewc_loss": 0.05192603915929794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5963019652408548e-05, + "grad_norm": 32.206233978271484, + "learning_rate": 1e-06, + "loss": 0.4113, + "mean_token_accuracy": 0.8770914077758789, + "num_tokens": 889771694.0, + "step": 23324 + }, + { + "epoch": 
2.967179748123648, + "ewc_loss": 0.0520610436797142, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6030522349174134e-05, + "grad_norm": 32.27558135986328, + "learning_rate": 1e-06, + "loss": 0.3836, + "mean_token_accuracy": 0.8864357471466064, + "num_tokens": 889806096.0, + "step": 23325 + }, + { + "epoch": 2.967306958402239, + "ewc_loss": 0.05199353024363518, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599676554382313e-05, + "grad_norm": 32.222103118896484, + "learning_rate": 1e-06, + "loss": 0.4287, + "mean_token_accuracy": 0.8730161786079407, + "num_tokens": 889848514.0, + "step": 23326 + }, + { + "epoch": 2.967434168680829, + "ewc_loss": 0.05198514461517334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5992572773247957e-05, + "grad_norm": 32.184722900390625, + "learning_rate": 1e-06, + "loss": 0.3589, + "mean_token_accuracy": 0.8935269117355347, + "num_tokens": 889889689.0, + "step": 23327 + }, + { + "epoch": 2.96756137895942, + "ewc_loss": 0.05197176709771156, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598588434921112e-05, + "grad_norm": 32.2882194519043, + "learning_rate": 1e-06, + "loss": 0.4061, + "mean_token_accuracy": 0.8764758706092834, + "num_tokens": 889925550.0, + "step": 23328 + }, + { + "epoch": 2.9676885892380103, + "ewc_loss": 0.05202287435531616, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.601143751235213e-05, + "grad_norm": 32.25947952270508, + "learning_rate": 1e-06, + "loss": 0.4488, + "mean_token_accuracy": 0.8615220785140991, + "num_tokens": 889961797.0, + "step": 23329 + }, + { + "epoch": 2.967815799516601, + "ewc_loss": 0.05202675983309746, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6013380193035118e-05, + "grad_norm": 32.207763671875, + "learning_rate": 1e-06, + "loss": 0.4007, + "mean_token_accuracy": 0.8784176707267761, + "num_tokens": 889997935.0, + "step": 23330 + }, + { + "epoch": 2.9679430097951913, + "ewc_loss": 0.05198018252849579, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599009167170152e-05, + 
"grad_norm": 32.172176361083984, + "learning_rate": 1e-06, + "loss": 0.3766, + "mean_token_accuracy": 0.8849586248397827, + "num_tokens": 890037900.0, + "step": 23331 + }, + { + "epoch": 2.968070220073782, + "ewc_loss": 0.05208871513605118, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6044357582577504e-05, + "grad_norm": 32.173858642578125, + "learning_rate": 1e-06, + "loss": 0.3976, + "mean_token_accuracy": 0.8797191381454468, + "num_tokens": 890074814.0, + "step": 23332 + }, + { + "epoch": 2.9681974303523724, + "ewc_loss": 0.0520591214299202, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.602956010377966e-05, + "grad_norm": 32.34912872314453, + "learning_rate": 1e-06, + "loss": 0.3829, + "mean_token_accuracy": 0.8866739273071289, + "num_tokens": 890107091.0, + "step": 23333 + }, + { + "epoch": 2.968324640630963, + "ewc_loss": 0.052001263946294785, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.600063271529507e-05, + "grad_norm": 32.24647521972656, + "learning_rate": 1e-06, + "loss": 0.3961, + "mean_token_accuracy": 0.8820704817771912, + "num_tokens": 890140558.0, + "step": 23334 + }, + { + "epoch": 2.9684518509095534, + "ewc_loss": 0.051988635212183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5994317184085958e-05, + "grad_norm": 32.384700775146484, + "learning_rate": 1e-06, + "loss": 0.4111, + "mean_token_accuracy": 0.8748018741607666, + "num_tokens": 890174563.0, + "step": 23335 + }, + { + "epoch": 2.968579061188144, + "ewc_loss": 0.05196787416934967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598393621155992e-05, + "grad_norm": 32.21897888183594, + "learning_rate": 1e-06, + "loss": 0.416, + "mean_token_accuracy": 0.8713375329971313, + "num_tokens": 890210584.0, + "step": 23336 + }, + { + "epoch": 2.9687062714667345, + "ewc_loss": 0.05191802978515625, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595901423774194e-05, + "grad_norm": 32.281227111816406, + "learning_rate": 1e-06, + "loss": 0.3765, + "mean_token_accuracy": 0.88544762134552, + 
"num_tokens": 890251955.0, + "step": 23337 + }, + { + "epoch": 2.968833481745325, + "ewc_loss": 0.051953017711639404, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5976509277825244e-05, + "grad_norm": 32.199581146240234, + "learning_rate": 1e-06, + "loss": 0.4199, + "mean_token_accuracy": 0.8698750734329224, + "num_tokens": 890289095.0, + "step": 23338 + }, + { + "epoch": 2.9689606920239155, + "ewc_loss": 0.051880981773138046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5940491468645632e-05, + "grad_norm": 32.15908432006836, + "learning_rate": 1e-06, + "loss": 0.4151, + "mean_token_accuracy": 0.8753052949905396, + "num_tokens": 890327864.0, + "step": 23339 + }, + { + "epoch": 2.969087902302506, + "ewc_loss": 0.051911771297454834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.595588557596784e-05, + "grad_norm": 32.343605041503906, + "learning_rate": 1e-06, + "loss": 0.4121, + "mean_token_accuracy": 0.873636782169342, + "num_tokens": 890364244.0, + "step": 23340 + }, + { + "epoch": 2.9692151125810966, + "ewc_loss": 0.052037324756383896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6018662538263015e-05, + "grad_norm": 32.27933120727539, + "learning_rate": 1e-06, + "loss": 0.4656, + "mean_token_accuracy": 0.8598147034645081, + "num_tokens": 890397652.0, + "step": 23341 + }, + { + "epoch": 2.969342322859687, + "ewc_loss": 0.05188746005296707, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5943729269783944e-05, + "grad_norm": 32.33978271484375, + "learning_rate": 1e-06, + "loss": 0.3989, + "mean_token_accuracy": 0.8788567781448364, + "num_tokens": 890435598.0, + "step": 23342 + }, + { + "epoch": 2.9694695331382777, + "ewc_loss": 0.052080318331718445, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604015935503412e-05, + "grad_norm": 32.309322357177734, + "learning_rate": 1e-06, + "loss": 0.4139, + "mean_token_accuracy": 0.8773356676101685, + "num_tokens": 890473445.0, + "step": 23343 + }, + { + "epoch": 2.969596743416868, + "ewc_loss": 
0.051848046481609344, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592402415757533e-05, + "grad_norm": 32.27585220336914, + "learning_rate": 1e-06, + "loss": 0.3972, + "mean_token_accuracy": 0.8777129650115967, + "num_tokens": 890514263.0, + "step": 23344 + }, + { + "epoch": 2.9697239536954587, + "ewc_loss": 0.051999419927597046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599971048766747e-05, + "grad_norm": 32.267147064208984, + "learning_rate": 1e-06, + "loss": 0.3887, + "mean_token_accuracy": 0.8798807263374329, + "num_tokens": 890554123.0, + "step": 23345 + }, + { + "epoch": 2.9698511639740492, + "ewc_loss": 0.05185285955667496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.592642886156682e-05, + "grad_norm": 32.25820541381836, + "learning_rate": 1e-06, + "loss": 0.4247, + "mean_token_accuracy": 0.8734601736068726, + "num_tokens": 890593938.0, + "step": 23346 + }, + { + "epoch": 2.9699783742526398, + "ewc_loss": 0.05193657800555229, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596828926471062e-05, + "grad_norm": 32.286773681640625, + "learning_rate": 1e-06, + "loss": 0.4466, + "mean_token_accuracy": 0.862595796585083, + "num_tokens": 890632260.0, + "step": 23347 + }, + { + "epoch": 2.97010558453123, + "ewc_loss": 0.05187876895070076, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5939383704098873e-05, + "grad_norm": 32.257991790771484, + "learning_rate": 1e-06, + "loss": 0.4172, + "mean_token_accuracy": 0.8709010481834412, + "num_tokens": 890666852.0, + "step": 23348 + }, + { + "epoch": 2.970232794809821, + "ewc_loss": 0.05197160691022873, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5985804313677363e-05, + "grad_norm": 32.40658950805664, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8693457841873169, + "num_tokens": 890707959.0, + "step": 23349 + }, + { + "epoch": 2.970360005088411, + "ewc_loss": 0.05194965377449989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5974826712626964e-05, + "grad_norm": 
32.212528228759766, + "learning_rate": 1e-06, + "loss": 0.4935, + "mean_token_accuracy": 0.8530930280685425, + "num_tokens": 890748729.0, + "step": 23350 + }, + { + "epoch": 2.970487215367002, + "ewc_loss": 0.05189487338066101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594743637018837e-05, + "grad_norm": 32.31269454956055, + "learning_rate": 1e-06, + "loss": 0.3748, + "mean_token_accuracy": 0.8869688510894775, + "num_tokens": 890782116.0, + "step": 23351 + }, + { + "epoch": 2.970614425645592, + "ewc_loss": 0.05192264914512634, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5961324354284443e-05, + "grad_norm": 32.278438568115234, + "learning_rate": 1e-06, + "loss": 0.453, + "mean_token_accuracy": 0.8619080781936646, + "num_tokens": 890827309.0, + "step": 23352 + }, + { + "epoch": 2.970741635924183, + "ewc_loss": 0.05184309184551239, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5921546694007702e-05, + "grad_norm": 32.208831787109375, + "learning_rate": 1e-06, + "loss": 0.458, + "mean_token_accuracy": 0.863717257976532, + "num_tokens": 890863879.0, + "step": 23353 + }, + { + "epoch": 2.970868846202773, + "ewc_loss": 0.051885075867176056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594253783172462e-05, + "grad_norm": 32.25025939941406, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8763796091079712, + "num_tokens": 890901184.0, + "step": 23354 + }, + { + "epoch": 2.9709960564813636, + "ewc_loss": 0.05193118378520012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596559170342516e-05, + "grad_norm": 32.05548095703125, + "learning_rate": 1e-06, + "loss": 0.3821, + "mean_token_accuracy": 0.8838001489639282, + "num_tokens": 890937779.0, + "step": 23355 + }, + { + "epoch": 2.971123266759954, + "ewc_loss": 0.05190529301762581, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5952645955840126e-05, + "grad_norm": 32.284584045410156, + "learning_rate": 1e-06, + "loss": 0.3695, + "mean_token_accuracy": 0.8863768577575684, + 
"num_tokens": 890978055.0, + "step": 23356 + }, + { + "epoch": 2.9712504770385446, + "ewc_loss": 0.0519961379468441, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5998069759225473e-05, + "grad_norm": 32.19729995727539, + "learning_rate": 1e-06, + "loss": 0.5408, + "mean_token_accuracy": 0.8437151908874512, + "num_tokens": 891010985.0, + "step": 23357 + }, + { + "epoch": 2.971377687317135, + "ewc_loss": 0.05186629295349121, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5933146389434114e-05, + "grad_norm": 32.141109466552734, + "learning_rate": 1e-06, + "loss": 0.4013, + "mean_token_accuracy": 0.8783504962921143, + "num_tokens": 891045904.0, + "step": 23358 + }, + { + "epoch": 2.9715048975957257, + "ewc_loss": 0.05201931297779083, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.600965672172606e-05, + "grad_norm": 32.337608337402344, + "learning_rate": 1e-06, + "loss": 0.3884, + "mean_token_accuracy": 0.8772546648979187, + "num_tokens": 891086240.0, + "step": 23359 + }, + { + "epoch": 2.971632107874316, + "ewc_loss": 0.05195244774222374, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5976223696488887e-05, + "grad_norm": 32.136497497558594, + "learning_rate": 1e-06, + "loss": 0.4226, + "mean_token_accuracy": 0.8713029026985168, + "num_tokens": 891126482.0, + "step": 23360 + }, + { + "epoch": 2.9717593181529067, + "ewc_loss": 0.05185576528310776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5927882234100252e-05, + "grad_norm": 32.1574821472168, + "learning_rate": 1e-06, + "loss": 0.444, + "mean_token_accuracy": 0.8640565872192383, + "num_tokens": 891165982.0, + "step": 23361 + }, + { + "epoch": 2.9718865284314973, + "ewc_loss": 0.05208612605929375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604306246212218e-05, + "grad_norm": 32.23005676269531, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.860201358795166, + "num_tokens": 891199101.0, + "step": 23362 + }, + { + "epoch": 2.972013738710088, + "ewc_loss": 0.052015066146850586, + 
"ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.600753214210272e-05, + "grad_norm": 32.217506408691406, + "learning_rate": 1e-06, + "loss": 0.4251, + "mean_token_accuracy": 0.8733063340187073, + "num_tokens": 891231511.0, + "step": 23363 + }, + { + "epoch": 2.9721409489886783, + "ewc_loss": 0.0519765242934227, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5988261768361554e-05, + "grad_norm": 32.19286346435547, + "learning_rate": 1e-06, + "loss": 0.4563, + "mean_token_accuracy": 0.862401008605957, + "num_tokens": 891262369.0, + "step": 23364 + }, + { + "epoch": 2.972268159267269, + "ewc_loss": 0.052046697586774826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.602334825496655e-05, + "grad_norm": 32.23704528808594, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8577420711517334, + "num_tokens": 891304206.0, + "step": 23365 + }, + { + "epoch": 2.9723953695458594, + "ewc_loss": 0.05202624201774597, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6013121896539815e-05, + "grad_norm": 32.153926849365234, + "learning_rate": 1e-06, + "loss": 0.4541, + "mean_token_accuracy": 0.8618544340133667, + "num_tokens": 891348006.0, + "step": 23366 + }, + { + "epoch": 2.97252257982445, + "ewc_loss": 0.051992468535900116, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5996234398917295e-05, + "grad_norm": 32.374412536621094, + "learning_rate": 1e-06, + "loss": 0.3883, + "mean_token_accuracy": 0.8808248043060303, + "num_tokens": 891386851.0, + "step": 23367 + }, + { + "epoch": 2.9726497901030404, + "ewc_loss": 0.05209125950932503, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6045629056170583e-05, + "grad_norm": 32.26078414916992, + "learning_rate": 1e-06, + "loss": 0.3885, + "mean_token_accuracy": 0.8829135894775391, + "num_tokens": 891423582.0, + "step": 23368 + }, + { + "epoch": 2.972777000381631, + "ewc_loss": 0.05199449136853218, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5997245757025667e-05, + "grad_norm": 32.19877624511719, + 
"learning_rate": 1e-06, + "loss": 0.3597, + "mean_token_accuracy": 0.8889064788818359, + "num_tokens": 891463773.0, + "step": 23369 + }, + { + "epoch": 2.9729042106602215, + "ewc_loss": 0.05197322368621826, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5986611944972537e-05, + "grad_norm": 32.24021530151367, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8846647143363953, + "num_tokens": 891498117.0, + "step": 23370 + }, + { + "epoch": 2.973031420938812, + "ewc_loss": 0.0520266555249691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6013327442342415e-05, + "grad_norm": 32.25830078125, + "learning_rate": 1e-06, + "loss": 0.3892, + "mean_token_accuracy": 0.881125271320343, + "num_tokens": 891529055.0, + "step": 23371 + }, + { + "epoch": 2.9731586312174025, + "ewc_loss": 0.051995694637298584, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599784784251824e-05, + "grad_norm": 32.253780364990234, + "learning_rate": 1e-06, + "loss": 0.3859, + "mean_token_accuracy": 0.8819588422775269, + "num_tokens": 891567370.0, + "step": 23372 + }, + { + "epoch": 2.9732858414959926, + "ewc_loss": 0.05200006812810898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6000034267781302e-05, + "grad_norm": 32.30695724487305, + "learning_rate": 1e-06, + "loss": 0.4052, + "mean_token_accuracy": 0.8755780458450317, + "num_tokens": 891602737.0, + "step": 23373 + }, + { + "epoch": 2.9734130517745836, + "ewc_loss": 0.052091099321842194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6045549020636827e-05, + "grad_norm": 32.39332962036133, + "learning_rate": 1e-06, + "loss": 0.4524, + "mean_token_accuracy": 0.8655202388763428, + "num_tokens": 891639713.0, + "step": 23374 + }, + { + "epoch": 2.9735402620531737, + "ewc_loss": 0.05196058750152588, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5980294594774023e-05, + "grad_norm": 32.26121139526367, + "learning_rate": 1e-06, + "loss": 0.3839, + "mean_token_accuracy": 0.8854309320449829, + "num_tokens": 891673931.0, + 
"step": 23375 + }, + { + "epoch": 2.9736674723317646, + "ewc_loss": 0.05197807773947716, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5989038476836868e-05, + "grad_norm": 32.309242248535156, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8725563287734985, + "num_tokens": 891708015.0, + "step": 23376 + }, + { + "epoch": 2.9737946826103547, + "ewc_loss": 0.05196071416139603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5980356440413743e-05, + "grad_norm": 32.14221954345703, + "learning_rate": 1e-06, + "loss": 0.4281, + "mean_token_accuracy": 0.8697142004966736, + "num_tokens": 891746023.0, + "step": 23377 + }, + { + "epoch": 2.9739218928889453, + "ewc_loss": 0.05192575231194496, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5962875952245668e-05, + "grad_norm": 32.22831344604492, + "learning_rate": 1e-06, + "loss": 0.4323, + "mean_token_accuracy": 0.871580183506012, + "num_tokens": 891777767.0, + "step": 23378 + }, + { + "epoch": 2.974049103167536, + "ewc_loss": 0.05210408195853233, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6052040993818082e-05, + "grad_norm": 32.24838638305664, + "learning_rate": 1e-06, + "loss": 0.4006, + "mean_token_accuracy": 0.8762319684028625, + "num_tokens": 891810816.0, + "step": 23379 + }, + { + "epoch": 2.9741763134461263, + "ewc_loss": 0.05190598592162132, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5952993382816203e-05, + "grad_norm": 32.14396286010742, + "learning_rate": 1e-06, + "loss": 0.4274, + "mean_token_accuracy": 0.8669144511222839, + "num_tokens": 891848810.0, + "step": 23380 + }, + { + "epoch": 2.974303523724717, + "ewc_loss": 0.05203899368643761, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6019497454399243e-05, + "grad_norm": 32.13581085205078, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8768911361694336, + "num_tokens": 891892553.0, + "step": 23381 + }, + { + "epoch": 2.9744307340033074, + "ewc_loss": 0.05204988643527031, + "ewc_loss_diag": 0.0, + 
"ewc_loss_parallel": 2.602494350867346e-05, + "grad_norm": 32.422508239746094, + "learning_rate": 1e-06, + "loss": 0.3782, + "mean_token_accuracy": 0.8844174146652222, + "num_tokens": 891927539.0, + "step": 23382 + }, + { + "epoch": 2.974557944281898, + "ewc_loss": 0.05204281583428383, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6021407393272966e-05, + "grad_norm": 32.103511810302734, + "learning_rate": 1e-06, + "loss": 0.3923, + "mean_token_accuracy": 0.8778303265571594, + "num_tokens": 891959577.0, + "step": 23383 + }, + { + "epoch": 2.9746851545604884, + "ewc_loss": 0.05194449797272682, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.597224920464214e-05, + "grad_norm": 32.43667984008789, + "learning_rate": 1e-06, + "loss": 0.4997, + "mean_token_accuracy": 0.8479021191596985, + "num_tokens": 892000799.0, + "step": 23384 + }, + { + "epoch": 2.974812364839079, + "ewc_loss": 0.052158135920763016, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6079067538375966e-05, + "grad_norm": 32.277767181396484, + "learning_rate": 1e-06, + "loss": 0.4357, + "mean_token_accuracy": 0.8673666715621948, + "num_tokens": 892042200.0, + "step": 23385 + }, + { + "epoch": 2.9749395751176695, + "ewc_loss": 0.051940735429525375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5970368369598873e-05, + "grad_norm": 32.2227783203125, + "learning_rate": 1e-06, + "loss": 0.4465, + "mean_token_accuracy": 0.864341676235199, + "num_tokens": 892084838.0, + "step": 23386 + }, + { + "epoch": 2.97506678539626, + "ewc_loss": 0.0520804189145565, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604021028673742e-05, + "grad_norm": 32.312442779541016, + "learning_rate": 1e-06, + "loss": 0.4632, + "mean_token_accuracy": 0.8571683168411255, + "num_tokens": 892126442.0, + "step": 23387 + }, + { + "epoch": 2.9751939956748505, + "ewc_loss": 0.05195029079914093, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5975145035772584e-05, + "grad_norm": 32.25371551513672, + "learning_rate": 1e-06, + "loss": 
0.4339, + "mean_token_accuracy": 0.8622905015945435, + "num_tokens": 892165019.0, + "step": 23388 + }, + { + "epoch": 2.975321205953441, + "ewc_loss": 0.05196778103709221, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5983890736824833e-05, + "grad_norm": 32.22422409057617, + "learning_rate": 1e-06, + "loss": 0.4129, + "mean_token_accuracy": 0.8754788637161255, + "num_tokens": 892202291.0, + "step": 23389 + }, + { + "epoch": 2.9754484162320316, + "ewc_loss": 0.05196436494588852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5982182705774903e-05, + "grad_norm": 32.23398208618164, + "learning_rate": 1e-06, + "loss": 0.4497, + "mean_token_accuracy": 0.8633570671081543, + "num_tokens": 892235159.0, + "step": 23390 + }, + { + "epoch": 2.975575626510622, + "ewc_loss": 0.052043020725250244, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6021511075668968e-05, + "grad_norm": 32.25905227661133, + "learning_rate": 1e-06, + "loss": 0.4586, + "mean_token_accuracy": 0.8609691262245178, + "num_tokens": 892271967.0, + "step": 23391 + }, + { + "epoch": 2.9757028367892127, + "ewc_loss": 0.05196071416139603, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5980356440413743e-05, + "grad_norm": 32.130767822265625, + "learning_rate": 1e-06, + "loss": 0.4664, + "mean_token_accuracy": 0.8589946031570435, + "num_tokens": 892306230.0, + "step": 23392 + }, + { + "epoch": 2.975830047067803, + "ewc_loss": 0.052084218710660934, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604210931167472e-05, + "grad_norm": 32.26741409301758, + "learning_rate": 1e-06, + "loss": 0.402, + "mean_token_accuracy": 0.8754458427429199, + "num_tokens": 892342118.0, + "step": 23393 + }, + { + "epoch": 2.9759572573463937, + "ewc_loss": 0.052128951996564865, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.606447560538072e-05, + "grad_norm": 32.22386932373047, + "learning_rate": 1e-06, + "loss": 0.4033, + "mean_token_accuracy": 0.8778746724128723, + "num_tokens": 892380764.0, + "step": 23394 + }, + { + "epoch": 
2.9760844676249842, + "ewc_loss": 0.05204623565077782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.60231172433123e-05, + "grad_norm": 32.341285705566406, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8700685501098633, + "num_tokens": 892421113.0, + "step": 23395 + }, + { + "epoch": 2.9762116779035748, + "ewc_loss": 0.05205647274851799, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6028235879493877e-05, + "grad_norm": 32.10287094116211, + "learning_rate": 1e-06, + "loss": 0.3969, + "mean_token_accuracy": 0.8797235488891602, + "num_tokens": 892457512.0, + "step": 23396 + }, + { + "epoch": 2.9763388881821653, + "ewc_loss": 0.05201231688261032, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.600615880510304e-05, + "grad_norm": 32.31229019165039, + "learning_rate": 1e-06, + "loss": 0.393, + "mean_token_accuracy": 0.8781337141990662, + "num_tokens": 892493599.0, + "step": 23397 + }, + { + "epoch": 2.9764660984607554, + "ewc_loss": 0.0521124042570591, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6056202841573395e-05, + "grad_norm": 32.16337585449219, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8642232418060303, + "num_tokens": 892528474.0, + "step": 23398 + }, + { + "epoch": 2.9765933087393464, + "ewc_loss": 0.05204140767455101, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6020703444373794e-05, + "grad_norm": 32.24403762817383, + "learning_rate": 1e-06, + "loss": 0.3912, + "mean_token_accuracy": 0.8799411058425903, + "num_tokens": 892569591.0, + "step": 23399 + }, + { + "epoch": 2.9767205190179364, + "ewc_loss": 0.0521649606525898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6082479962497018e-05, + "grad_norm": 32.24633026123047, + "learning_rate": 1e-06, + "loss": 0.3957, + "mean_token_accuracy": 0.8783921003341675, + "num_tokens": 892607185.0, + "step": 23400 + }, + { + "epoch": 2.9768477292965274, + "ewc_loss": 0.05197940766811371, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.5989704226958565e-05, + "grad_norm": 32.09286117553711, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8735889792442322, + "num_tokens": 892641453.0, + "step": 23401 + }, + { + "epoch": 2.9769749395751175, + "ewc_loss": 0.052087049931287766, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604352448543068e-05, + "grad_norm": 32.31070327758789, + "learning_rate": 1e-06, + "loss": 0.4372, + "mean_token_accuracy": 0.8676304221153259, + "num_tokens": 892676802.0, + "step": 23402 + }, + { + "epoch": 2.977102149853708, + "ewc_loss": 0.05210466682910919, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6052333851112053e-05, + "grad_norm": 32.17046356201172, + "learning_rate": 1e-06, + "loss": 0.391, + "mean_token_accuracy": 0.8853554725646973, + "num_tokens": 892711694.0, + "step": 23403 + }, + { + "epoch": 2.9772293601322986, + "ewc_loss": 0.052030049264431, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6015024559455924e-05, + "grad_norm": 32.16697311401367, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8640093207359314, + "num_tokens": 892752662.0, + "step": 23404 + }, + { + "epoch": 2.977356570410889, + "ewc_loss": 0.052130136638879776, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6065068595926277e-05, + "grad_norm": 32.14372253417969, + "learning_rate": 1e-06, + "loss": 0.4135, + "mean_token_accuracy": 0.8752407431602478, + "num_tokens": 892790170.0, + "step": 23405 + }, + { + "epoch": 2.9774837806894796, + "ewc_loss": 0.05199361965060234, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5996809199568816e-05, + "grad_norm": 32.131263732910156, + "learning_rate": 1e-06, + "loss": 0.4811, + "mean_token_accuracy": 0.855034351348877, + "num_tokens": 892831437.0, + "step": 23406 + }, + { + "epoch": 2.97761099096807, + "ewc_loss": 0.05203758552670479, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.601879350550007e-05, + "grad_norm": 32.11566925048828, + "learning_rate": 1e-06, + "loss": 0.3506, + 
"mean_token_accuracy": 0.8948734998703003, + "num_tokens": 892869022.0, + "step": 23407 + }, + { + "epoch": 2.9777382012466607, + "ewc_loss": 0.05212698504328728, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6063493351102807e-05, + "grad_norm": 32.20903396606445, + "learning_rate": 1e-06, + "loss": 0.415, + "mean_token_accuracy": 0.8749755620956421, + "num_tokens": 892910738.0, + "step": 23408 + }, + { + "epoch": 2.977865411525251, + "ewc_loss": 0.052061066031455994, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6030533263110556e-05, + "grad_norm": 32.17488098144531, + "learning_rate": 1e-06, + "loss": 0.3994, + "mean_token_accuracy": 0.8815615177154541, + "num_tokens": 892952055.0, + "step": 23409 + }, + { + "epoch": 2.9779926218038417, + "ewc_loss": 0.05202460661530495, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.601230335130822e-05, + "grad_norm": 32.20457458496094, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8659510612487793, + "num_tokens": 892985843.0, + "step": 23410 + }, + { + "epoch": 2.9781198320824323, + "ewc_loss": 0.05205070227384567, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6025350962299854e-05, + "grad_norm": 32.25009536743164, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8633637428283691, + "num_tokens": 893031197.0, + "step": 23411 + }, + { + "epoch": 2.978247042361023, + "ewc_loss": 0.05204307287931442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6021536541520618e-05, + "grad_norm": 32.047542572021484, + "learning_rate": 1e-06, + "loss": 0.3667, + "mean_token_accuracy": 0.8864444494247437, + "num_tokens": 893068113.0, + "step": 23412 + }, + { + "epoch": 2.9783742526396133, + "ewc_loss": 0.05219480022788048, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.60973993135849e-05, + "grad_norm": 32.30994415283203, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8835299015045166, + "num_tokens": 893109148.0, + "step": 23413 + }, + { + "epoch": 
2.978501462918204, + "ewc_loss": 0.05203573405742645, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6017867639893666e-05, + "grad_norm": 32.02373123168945, + "learning_rate": 1e-06, + "loss": 0.3988, + "mean_token_accuracy": 0.8783522248268127, + "num_tokens": 893151406.0, + "step": 23414 + }, + { + "epoch": 2.9786286731967944, + "ewc_loss": 0.051991868764162064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.599593426566571e-05, + "grad_norm": 32.181209564208984, + "learning_rate": 1e-06, + "loss": 0.3999, + "mean_token_accuracy": 0.8772604465484619, + "num_tokens": 893190209.0, + "step": 23415 + }, + { + "epoch": 2.978755883475385, + "ewc_loss": 0.052085548639297485, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604277506179642e-05, + "grad_norm": 32.04448318481445, + "learning_rate": 1e-06, + "loss": 0.4288, + "mean_token_accuracy": 0.8730636835098267, + "num_tokens": 893232949.0, + "step": 23416 + }, + { + "epoch": 2.9788830937539754, + "ewc_loss": 0.0520523302257061, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6026165869552642e-05, + "grad_norm": 32.18962097167969, + "learning_rate": 1e-06, + "loss": 0.4141, + "mean_token_accuracy": 0.8739393353462219, + "num_tokens": 893273931.0, + "step": 23417 + }, + { + "epoch": 2.979010304032566, + "ewc_loss": 0.0521356575191021, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6067828002851456e-05, + "grad_norm": 32.24994659423828, + "learning_rate": 1e-06, + "loss": 0.4745, + "mean_token_accuracy": 0.8540287017822266, + "num_tokens": 893317465.0, + "step": 23418 + }, + { + "epoch": 2.9791375143111565, + "ewc_loss": 0.05201488360762596, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.600744119263254e-05, + "grad_norm": 32.229183197021484, + "learning_rate": 1e-06, + "loss": 0.4683, + "mean_token_accuracy": 0.858349084854126, + "num_tokens": 893354089.0, + "step": 23419 + }, + { + "epoch": 2.979264724589747, + "ewc_loss": 0.05202453210949898, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.6012265152530745e-05, + "grad_norm": 32.187957763671875, + "learning_rate": 1e-06, + "loss": 0.4192, + "mean_token_accuracy": 0.8707017302513123, + "num_tokens": 893388051.0, + "step": 23420 + }, + { + "epoch": 2.979391934868337, + "ewc_loss": 0.05204404518008232, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6022022211691365e-05, + "grad_norm": 32.265743255615234, + "learning_rate": 1e-06, + "loss": 0.4379, + "mean_token_accuracy": 0.8682702779769897, + "num_tokens": 893426191.0, + "step": 23421 + }, + { + "epoch": 2.979519145146928, + "ewc_loss": 0.052028853446245193, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6014426111942157e-05, + "grad_norm": 32.355220794677734, + "learning_rate": 1e-06, + "loss": 0.4375, + "mean_token_accuracy": 0.8663591146469116, + "num_tokens": 893462134.0, + "step": 23422 + }, + { + "epoch": 2.979646355425518, + "ewc_loss": 0.051925092935562134, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5962546715163626e-05, + "grad_norm": 32.095035552978516, + "learning_rate": 1e-06, + "loss": 0.4484, + "mean_token_accuracy": 0.8680131435394287, + "num_tokens": 893503159.0, + "step": 23423 + }, + { + "epoch": 2.979773565704109, + "ewc_loss": 0.05197595804929733, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5987979825004004e-05, + "grad_norm": 32.319862365722656, + "learning_rate": 1e-06, + "loss": 0.4852, + "mean_token_accuracy": 0.8554346561431885, + "num_tokens": 893538379.0, + "step": 23424 + }, + { + "epoch": 2.979900775982699, + "ewc_loss": 0.05207958444952965, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6039791919174604e-05, + "grad_norm": 32.23672866821289, + "learning_rate": 1e-06, + "loss": 0.4498, + "mean_token_accuracy": 0.8606696128845215, + "num_tokens": 893580431.0, + "step": 23425 + }, + { + "epoch": 2.98002798626129, + "ewc_loss": 0.05203614383935928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6018071366706863e-05, + "grad_norm": 32.368080139160156, + "learning_rate": 1e-06, + "loss": 0.4504, + 
"mean_token_accuracy": 0.8663902282714844, + "num_tokens": 893622412.0, + "step": 23426 + }, + { + "epoch": 2.9801551965398803, + "ewc_loss": 0.0519702285528183, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598511491669342e-05, + "grad_norm": 32.26592254638672, + "learning_rate": 1e-06, + "loss": 0.4197, + "mean_token_accuracy": 0.876049816608429, + "num_tokens": 893658575.0, + "step": 23427 + }, + { + "epoch": 2.980282406818471, + "ewc_loss": 0.05185672268271446, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5928360628313385e-05, + "grad_norm": 32.218387603759766, + "learning_rate": 1e-06, + "loss": 0.3919, + "mean_token_accuracy": 0.8797985911369324, + "num_tokens": 893702812.0, + "step": 23428 + }, + { + "epoch": 2.9804096170970613, + "ewc_loss": 0.051991552114486694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.59957760135876e-05, + "grad_norm": 32.349281311035156, + "learning_rate": 1e-06, + "loss": 0.4386, + "mean_token_accuracy": 0.8661202788352966, + "num_tokens": 893742785.0, + "step": 23429 + }, + { + "epoch": 2.980536827375652, + "ewc_loss": 0.051881104707717896, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594055149529595e-05, + "grad_norm": 32.16527557373047, + "learning_rate": 1e-06, + "loss": 0.4021, + "mean_token_accuracy": 0.8783916234970093, + "num_tokens": 893782310.0, + "step": 23430 + }, + { + "epoch": 2.9806640376542424, + "ewc_loss": 0.05189470946788788, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.594735451566521e-05, + "grad_norm": 32.088985443115234, + "learning_rate": 1e-06, + "loss": 0.3579, + "mean_token_accuracy": 0.8893370628356934, + "num_tokens": 893822794.0, + "step": 23431 + }, + { + "epoch": 2.980791247932833, + "ewc_loss": 0.052011724561452866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6005862309830263e-05, + "grad_norm": 32.26300048828125, + "learning_rate": 1e-06, + "loss": 0.4124, + "mean_token_accuracy": 0.8779771327972412, + "num_tokens": 893870462.0, + "step": 23432 + }, + { + "epoch": 
2.9809184582114234, + "ewc_loss": 0.052097387611866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6048694053315558e-05, + "grad_norm": 32.190673828125, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8727242946624756, + "num_tokens": 893910654.0, + "step": 23433 + }, + { + "epoch": 2.981045668490014, + "ewc_loss": 0.05194460600614548, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5972303774324246e-05, + "grad_norm": 32.197879791259766, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8698605895042419, + "num_tokens": 893948819.0, + "step": 23434 + }, + { + "epoch": 2.9811728787686045, + "ewc_loss": 0.05195661261677742, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5978306439355947e-05, + "grad_norm": 32.2091064453125, + "learning_rate": 1e-06, + "loss": 0.3935, + "mean_token_accuracy": 0.8793139457702637, + "num_tokens": 893983325.0, + "step": 23435 + }, + { + "epoch": 2.981300089047195, + "ewc_loss": 0.052020683884620667, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6010342480731197e-05, + "grad_norm": 32.19729995727539, + "learning_rate": 1e-06, + "loss": 0.4156, + "mean_token_accuracy": 0.8715088367462158, + "num_tokens": 894023712.0, + "step": 23436 + }, + { + "epoch": 2.9814272993257855, + "ewc_loss": 0.05195415019989014, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5977074983529747e-05, + "grad_norm": 32.18553161621094, + "learning_rate": 1e-06, + "loss": 0.3846, + "mean_token_accuracy": 0.8824367523193359, + "num_tokens": 894059719.0, + "step": 23437 + }, + { + "epoch": 2.981554509604376, + "ewc_loss": 0.05203322321176529, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.601661253720522e-05, + "grad_norm": 32.391578674316406, + "learning_rate": 1e-06, + "loss": 0.4131, + "mean_token_accuracy": 0.8743994235992432, + "num_tokens": 894100150.0, + "step": 23438 + }, + { + "epoch": 2.9816817198829666, + "ewc_loss": 0.051956307142972946, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.597815364424605e-05, + "grad_norm": 32.185943603515625, + "learning_rate": 1e-06, + "loss": 0.5001, + "mean_token_accuracy": 0.8527994155883789, + "num_tokens": 894134336.0, + "step": 23439 + }, + { + "epoch": 2.981808930161557, + "ewc_loss": 0.05183156579732895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5915782316587865e-05, + "grad_norm": 32.24364471435547, + "learning_rate": 1e-06, + "loss": 0.461, + "mean_token_accuracy": 0.8585600256919861, + "num_tokens": 894168972.0, + "step": 23440 + }, + { + "epoch": 2.9819361404401477, + "ewc_loss": 0.052100006490945816, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.605000372568611e-05, + "grad_norm": 32.32477951049805, + "learning_rate": 1e-06, + "loss": 0.4232, + "mean_token_accuracy": 0.8684986233711243, + "num_tokens": 894204641.0, + "step": 23441 + }, + { + "epoch": 2.982063350718738, + "ewc_loss": 0.0519639328122139, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598196624603588e-05, + "grad_norm": 32.19864273071289, + "learning_rate": 1e-06, + "loss": 0.4205, + "mean_token_accuracy": 0.8761692047119141, + "num_tokens": 894248403.0, + "step": 23442 + }, + { + "epoch": 2.9821905609973287, + "ewc_loss": 0.05195757374167442, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5978786652558483e-05, + "grad_norm": 32.188724517822266, + "learning_rate": 1e-06, + "loss": 0.3568, + "mean_token_accuracy": 0.890338659286499, + "num_tokens": 894284035.0, + "step": 23443 + }, + { + "epoch": 2.9823177712759192, + "ewc_loss": 0.05194918066263199, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5974590244004503e-05, + "grad_norm": 32.2845458984375, + "learning_rate": 1e-06, + "loss": 0.3967, + "mean_token_accuracy": 0.8763402700424194, + "num_tokens": 894321804.0, + "step": 23444 + }, + { + "epoch": 2.9824449815545098, + "ewc_loss": 0.052080441266298294, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604022120067384e-05, + "grad_norm": 32.23983383178711, + "learning_rate": 1e-06, + "loss": 0.3886, + 
"mean_token_accuracy": 0.8808457255363464, + "num_tokens": 894355648.0, + "step": 23445 + }, + { + "epoch": 2.9825721918331, + "ewc_loss": 0.05199265480041504, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5996327167376876e-05, + "grad_norm": 32.23243713378906, + "learning_rate": 1e-06, + "loss": 0.4095, + "mean_token_accuracy": 0.8731353282928467, + "num_tokens": 894392947.0, + "step": 23446 + }, + { + "epoch": 2.982699402111691, + "ewc_loss": 0.052108876407146454, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6054438421851955e-05, + "grad_norm": 32.2928352355957, + "learning_rate": 1e-06, + "loss": 0.4496, + "mean_token_accuracy": 0.8639761209487915, + "num_tokens": 894433437.0, + "step": 23447 + }, + { + "epoch": 2.982826612390281, + "ewc_loss": 0.051976561546325684, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598827995825559e-05, + "grad_norm": 32.18914031982422, + "learning_rate": 1e-06, + "loss": 0.404, + "mean_token_accuracy": 0.8782252669334412, + "num_tokens": 894481428.0, + "step": 23448 + }, + { + "epoch": 2.982953822668872, + "ewc_loss": 0.051999542862176895, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5999772333307192e-05, + "grad_norm": 32.24958419799805, + "learning_rate": 1e-06, + "loss": 0.4175, + "mean_token_accuracy": 0.8727028369903564, + "num_tokens": 894518055.0, + "step": 23449 + }, + { + "epoch": 2.983081032947462, + "ewc_loss": 0.052022214978933334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.601110827527009e-05, + "grad_norm": 32.19413757324219, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8682594299316406, + "num_tokens": 894562693.0, + "step": 23450 + }, + { + "epoch": 2.983208243226053, + "ewc_loss": 0.052065517753362656, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6032759706140496e-05, + "grad_norm": 32.228790283203125, + "learning_rate": 1e-06, + "loss": 0.4219, + "mean_token_accuracy": 0.8723436594009399, + "num_tokens": 894604973.0, + "step": 23451 + }, + { + "epoch": 
2.983335453504643, + "ewc_loss": 0.05201878771185875, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6009392968262546e-05, + "grad_norm": 32.244842529296875, + "learning_rate": 1e-06, + "loss": 0.3946, + "mean_token_accuracy": 0.8794956803321838, + "num_tokens": 894645672.0, + "step": 23452 + }, + { + "epoch": 2.9834626637832335, + "ewc_loss": 0.05194652080535889, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.597326056275051e-05, + "grad_norm": 32.11751937866211, + "learning_rate": 1e-06, + "loss": 0.4249, + "mean_token_accuracy": 0.8713023662567139, + "num_tokens": 894686702.0, + "step": 23453 + }, + { + "epoch": 2.983589874061824, + "ewc_loss": 0.052018485963344574, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6009243811131455e-05, + "grad_norm": 32.163909912109375, + "learning_rate": 1e-06, + "loss": 0.3931, + "mean_token_accuracy": 0.8816555142402649, + "num_tokens": 894722768.0, + "step": 23454 + }, + { + "epoch": 2.9837170843404146, + "ewc_loss": 0.052166204899549484, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.608310205687303e-05, + "grad_norm": 32.227352142333984, + "learning_rate": 1e-06, + "loss": 0.3852, + "mean_token_accuracy": 0.8859075307846069, + "num_tokens": 894759216.0, + "step": 23455 + }, + { + "epoch": 2.983844294619005, + "ewc_loss": 0.05208555981516838, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604278051876463e-05, + "grad_norm": 32.15637969970703, + "learning_rate": 1e-06, + "loss": 0.3974, + "mean_token_accuracy": 0.8779241442680359, + "num_tokens": 894799691.0, + "step": 23456 + }, + { + "epoch": 2.9839715048975957, + "ewc_loss": 0.05209510773420334, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6047553546959534e-05, + "grad_norm": 32.30504608154297, + "learning_rate": 1e-06, + "loss": 0.4352, + "mean_token_accuracy": 0.8702244162559509, + "num_tokens": 894835869.0, + "step": 23457 + }, + { + "epoch": 2.984098715176186, + "ewc_loss": 0.052172642201185226, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.6086321668117307e-05, + "grad_norm": 32.283180236816406, + "learning_rate": 1e-06, + "loss": 0.4774, + "mean_token_accuracy": 0.8518654108047485, + "num_tokens": 894875706.0, + "step": 23458 + }, + { + "epoch": 2.9842259254547767, + "ewc_loss": 0.051987480372190475, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5993740564445034e-05, + "grad_norm": 32.078617095947266, + "learning_rate": 1e-06, + "loss": 0.4067, + "mean_token_accuracy": 0.8785855770111084, + "num_tokens": 894920135.0, + "step": 23459 + }, + { + "epoch": 2.9843531357333672, + "ewc_loss": 0.05211078003048897, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.605538975331001e-05, + "grad_norm": 32.372154235839844, + "learning_rate": 1e-06, + "loss": 0.4075, + "mean_token_accuracy": 0.8751403093338013, + "num_tokens": 894960825.0, + "step": 23460 + }, + { + "epoch": 2.9844803460119578, + "ewc_loss": 0.05219113081693649, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6095565772266127e-05, + "grad_norm": 32.17148971557617, + "learning_rate": 1e-06, + "loss": 0.4161, + "mean_token_accuracy": 0.8750718832015991, + "num_tokens": 895001935.0, + "step": 23461 + }, + { + "epoch": 2.9846075562905483, + "ewc_loss": 0.05193891376256943, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5969457055907696e-05, + "grad_norm": 32.23757553100586, + "learning_rate": 1e-06, + "loss": 0.3798, + "mean_token_accuracy": 0.8817309141159058, + "num_tokens": 895033559.0, + "step": 23462 + }, + { + "epoch": 2.984734766569139, + "ewc_loss": 0.05205244570970535, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6026222258224152e-05, + "grad_norm": 32.237815856933594, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8723697066307068, + "num_tokens": 895070958.0, + "step": 23463 + }, + { + "epoch": 2.9848619768477294, + "ewc_loss": 0.052030064165592194, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6015031835413538e-05, + "grad_norm": 32.46387481689453, + "learning_rate": 1e-06, + "loss": 0.4207, + 
"mean_token_accuracy": 0.871719479560852, + "num_tokens": 895111533.0, + "step": 23464 + }, + { + "epoch": 2.98498918712632, + "ewc_loss": 0.05205328390002251, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.602664244477637e-05, + "grad_norm": 32.19585037231445, + "learning_rate": 1e-06, + "loss": 0.4263, + "mean_token_accuracy": 0.8690001964569092, + "num_tokens": 895149861.0, + "step": 23465 + }, + { + "epoch": 2.9851163974049104, + "ewc_loss": 0.051922012120485306, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5961006031138822e-05, + "grad_norm": 32.19925308227539, + "learning_rate": 1e-06, + "loss": 0.3962, + "mean_token_accuracy": 0.8798511624336243, + "num_tokens": 895192958.0, + "step": 23466 + }, + { + "epoch": 2.985243607683501, + "ewc_loss": 0.051998939365148544, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.5999470381066203e-05, + "grad_norm": 32.20631790161133, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.883608341217041, + "num_tokens": 895226710.0, + "step": 23467 + }, + { + "epoch": 2.9853708179620915, + "ewc_loss": 0.05200009420514107, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6000047000707127e-05, + "grad_norm": 32.28522872924805, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8777843117713928, + "num_tokens": 895267695.0, + "step": 23468 + }, + { + "epoch": 2.985498028240682, + "ewc_loss": 0.05218628793954849, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6093144697370008e-05, + "grad_norm": 32.223899841308594, + "learning_rate": 1e-06, + "loss": 0.3731, + "mean_token_accuracy": 0.8890178799629211, + "num_tokens": 895303101.0, + "step": 23469 + }, + { + "epoch": 2.9856252385192725, + "ewc_loss": 0.05202440544962883, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6012203306891024e-05, + "grad_norm": 32.251338958740234, + "learning_rate": 1e-06, + "loss": 0.395, + "mean_token_accuracy": 0.8786808848381042, + "num_tokens": 895340437.0, + "step": 23470 + }, + { + "epoch": 
2.9857524487978626, + "ewc_loss": 0.05207373574376106, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6036866984213702e-05, + "grad_norm": 32.180110931396484, + "learning_rate": 1e-06, + "loss": 0.377, + "mean_token_accuracy": 0.8873653411865234, + "num_tokens": 895377695.0, + "step": 23471 + }, + { + "epoch": 2.9858796590764536, + "ewc_loss": 0.05200330540537834, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6001653168350458e-05, + "grad_norm": 32.19447708129883, + "learning_rate": 1e-06, + "loss": 0.4445, + "mean_token_accuracy": 0.8638882637023926, + "num_tokens": 895418599.0, + "step": 23472 + }, + { + "epoch": 2.9860068693550437, + "ewc_loss": 0.05213438346982002, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6067191356560215e-05, + "grad_norm": 32.33442306518555, + "learning_rate": 1e-06, + "loss": 0.4157, + "mean_token_accuracy": 0.8729936480522156, + "num_tokens": 895454832.0, + "step": 23473 + }, + { + "epoch": 2.9861340796336346, + "ewc_loss": 0.05213097482919693, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6065486963489093e-05, + "grad_norm": 32.20207977294922, + "learning_rate": 1e-06, + "loss": 0.4788, + "mean_token_accuracy": 0.8559253215789795, + "num_tokens": 895490607.0, + "step": 23474 + }, + { + "epoch": 2.9862612899122247, + "ewc_loss": 0.05204736441373825, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6023682949016802e-05, + "grad_norm": 32.239253997802734, + "learning_rate": 1e-06, + "loss": 0.4299, + "mean_token_accuracy": 0.866513729095459, + "num_tokens": 895531340.0, + "step": 23475 + }, + { + "epoch": 2.9863885001908153, + "ewc_loss": 0.05210879445075989, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6054396585095674e-05, + "grad_norm": 32.198089599609375, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8743072748184204, + "num_tokens": 895568688.0, + "step": 23476 + }, + { + "epoch": 2.986515710469406, + "ewc_loss": 0.05215189605951309, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.6075947971548885e-05, + "grad_norm": 32.29819869995117, + "learning_rate": 1e-06, + "loss": 0.4459, + "mean_token_accuracy": 0.8633912205696106, + "num_tokens": 895607316.0, + "step": 23477 + }, + { + "epoch": 2.9866429207479963, + "ewc_loss": 0.0521194264292717, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.605971349112224e-05, + "grad_norm": 32.197288513183594, + "learning_rate": 1e-06, + "loss": 0.3639, + "mean_token_accuracy": 0.8904385566711426, + "num_tokens": 895648694.0, + "step": 23478 + }, + { + "epoch": 2.986770131026587, + "ewc_loss": 0.052060339599847794, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6030169465229847e-05, + "grad_norm": 32.293731689453125, + "learning_rate": 1e-06, + "loss": 0.432, + "mean_token_accuracy": 0.8693419694900513, + "num_tokens": 895687996.0, + "step": 23479 + }, + { + "epoch": 2.9868973413051774, + "ewc_loss": 0.05211351066827774, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6056755814352073e-05, + "grad_norm": 32.2352180480957, + "learning_rate": 1e-06, + "loss": 0.4027, + "mean_token_accuracy": 0.8773935437202454, + "num_tokens": 895726727.0, + "step": 23480 + }, + { + "epoch": 2.987024551583768, + "ewc_loss": 0.05203346908092499, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6016734409495257e-05, + "grad_norm": 32.2585334777832, + "learning_rate": 1e-06, + "loss": 0.3928, + "mean_token_accuracy": 0.8771215677261353, + "num_tokens": 895760688.0, + "step": 23481 + }, + { + "epoch": 2.9871517618623584, + "ewc_loss": 0.05206244811415672, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6031224479083903e-05, + "grad_norm": 32.33063507080078, + "learning_rate": 1e-06, + "loss": 0.4733, + "mean_token_accuracy": 0.8530353903770447, + "num_tokens": 895797303.0, + "step": 23482 + }, + { + "epoch": 2.987278972140949, + "ewc_loss": 0.05205194279551506, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6025971237686463e-05, + "grad_norm": 32.27227783203125, + "learning_rate": 1e-06, + "loss": 0.3973, + 
"mean_token_accuracy": 0.8796718120574951, + "num_tokens": 895834078.0, + "step": 23483 + }, + { + "epoch": 2.9874061824195395, + "ewc_loss": 0.05202285572886467, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6011428417405114e-05, + "grad_norm": 32.260597229003906, + "learning_rate": 1e-06, + "loss": 0.4527, + "mean_token_accuracy": 0.8633959293365479, + "num_tokens": 895872640.0, + "step": 23484 + }, + { + "epoch": 2.98753339269813, + "ewc_loss": 0.052037984132766724, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6018991775345057e-05, + "grad_norm": 32.232460021972656, + "learning_rate": 1e-06, + "loss": 0.3791, + "mean_token_accuracy": 0.8872211575508118, + "num_tokens": 895907646.0, + "step": 23485 + }, + { + "epoch": 2.9876606029767205, + "ewc_loss": 0.05208484083414078, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6042420358862728e-05, + "grad_norm": 32.210487365722656, + "learning_rate": 1e-06, + "loss": 0.3896, + "mean_token_accuracy": 0.8821783065795898, + "num_tokens": 895944183.0, + "step": 23486 + }, + { + "epoch": 2.987787813255311, + "ewc_loss": 0.05209466442465782, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.60473316302523e-05, + "grad_norm": 32.268367767333984, + "learning_rate": 1e-06, + "loss": 0.4026, + "mean_token_accuracy": 0.8774635791778564, + "num_tokens": 895983987.0, + "step": 23487 + }, + { + "epoch": 2.9879150235339016, + "ewc_loss": 0.05197449028491974, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598724495328497e-05, + "grad_norm": 32.17512893676758, + "learning_rate": 1e-06, + "loss": 0.447, + "mean_token_accuracy": 0.8618829846382141, + "num_tokens": 896020928.0, + "step": 23488 + }, + { + "epoch": 2.988042233812492, + "ewc_loss": 0.0520622693002224, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.603113534860313e-05, + "grad_norm": 32.444854736328125, + "learning_rate": 1e-06, + "loss": 0.3864, + "mean_token_accuracy": 0.880365788936615, + "num_tokens": 896057459.0, + "step": 23489 + }, + { + "epoch": 
2.9881694440910826, + "ewc_loss": 0.052142899483442307, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6071449610753916e-05, + "grad_norm": 32.25301742553711, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8703399300575256, + "num_tokens": 896095049.0, + "step": 23490 + }, + { + "epoch": 2.988296654369673, + "ewc_loss": 0.05196306109428406, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.598152968857903e-05, + "grad_norm": 32.20844650268555, + "learning_rate": 1e-06, + "loss": 0.384, + "mean_token_accuracy": 0.8844385147094727, + "num_tokens": 896127433.0, + "step": 23491 + }, + { + "epoch": 2.9884238646482637, + "ewc_loss": 0.05210977792739868, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6054889531224035e-05, + "grad_norm": 32.325504302978516, + "learning_rate": 1e-06, + "loss": 0.4003, + "mean_token_accuracy": 0.8791404962539673, + "num_tokens": 896165662.0, + "step": 23492 + }, + { + "epoch": 2.9885510749268542, + "ewc_loss": 0.05211726576089859, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6058633011416532e-05, + "grad_norm": 32.36406707763672, + "learning_rate": 1e-06, + "loss": 0.4217, + "mean_token_accuracy": 0.8723287582397461, + "num_tokens": 896207753.0, + "step": 23493 + }, + { + "epoch": 2.9886782852054448, + "ewc_loss": 0.052035242319107056, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6017620257334784e-05, + "grad_norm": 32.17866134643555, + "learning_rate": 1e-06, + "loss": 0.4169, + "mean_token_accuracy": 0.8730611801147461, + "num_tokens": 896249105.0, + "step": 23494 + }, + { + "epoch": 2.9888054954840353, + "ewc_loss": 0.05203957483172417, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6019786673714407e-05, + "grad_norm": 32.3055534362793, + "learning_rate": 1e-06, + "loss": 0.4122, + "mean_token_accuracy": 0.873803436756134, + "num_tokens": 896291728.0, + "step": 23495 + }, + { + "epoch": 2.9889327057626254, + "ewc_loss": 0.05204688385128975, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.602344102342613e-05, + "grad_norm": 32.11046600341797, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8631690144538879, + "num_tokens": 896331729.0, + "step": 23496 + }, + { + "epoch": 2.9890599160412163, + "ewc_loss": 0.05208921059966087, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6044604965136386e-05, + "grad_norm": 32.368309020996094, + "learning_rate": 1e-06, + "loss": 0.4447, + "mean_token_accuracy": 0.8627581596374512, + "num_tokens": 896371804.0, + "step": 23497 + }, + { + "epoch": 2.9891871263198064, + "ewc_loss": 0.0522179938852787, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.610899719002191e-05, + "grad_norm": 32.14678192138672, + "learning_rate": 1e-06, + "loss": 0.4149, + "mean_token_accuracy": 0.870606541633606, + "num_tokens": 896408398.0, + "step": 23498 + }, + { + "epoch": 2.9893143365983974, + "ewc_loss": 0.05193718150258064, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.596859121695161e-05, + "grad_norm": 32.250572204589844, + "learning_rate": 1e-06, + "loss": 0.3812, + "mean_token_accuracy": 0.8886379599571228, + "num_tokens": 896440496.0, + "step": 23499 + }, + { + "epoch": 2.9894415468769875, + "ewc_loss": 0.052179936319589615, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6089968741871417e-05, + "grad_norm": 32.20158386230469, + "learning_rate": 1e-06, + "loss": 0.4163, + "mean_token_accuracy": 0.8753985166549683, + "num_tokens": 896475212.0, + "step": 23500 + }, + { + "epoch": 2.989568757155578, + "ewc_loss": 0.052137210965156555, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.606860471132677e-05, + "grad_norm": 32.250267028808594, + "learning_rate": 1e-06, + "loss": 0.423, + "mean_token_accuracy": 0.8706140518188477, + "num_tokens": 896517122.0, + "step": 23501 + }, + { + "epoch": 2.9896959674341685, + "ewc_loss": 0.05217255651950836, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6086278012371622e-05, + "grad_norm": 32.28527069091797, + "learning_rate": 1e-06, + "loss": 0.3934, + 
"mean_token_accuracy": 0.8813241720199585, + "num_tokens": 896557259.0, + "step": 23502 + }, + { + "epoch": 2.989823177712759, + "ewc_loss": 0.052050165832042694, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6025083570857532e-05, + "grad_norm": 32.21268081665039, + "learning_rate": 1e-06, + "loss": 0.443, + "mean_token_accuracy": 0.8647962212562561, + "num_tokens": 896600237.0, + "step": 23503 + }, + { + "epoch": 2.9899503879913496, + "ewc_loss": 0.05207321420311928, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6036606868728995e-05, + "grad_norm": 32.304786682128906, + "learning_rate": 1e-06, + "loss": 0.3799, + "mean_token_accuracy": 0.8853801488876343, + "num_tokens": 896640564.0, + "step": 23504 + }, + { + "epoch": 2.99007759826994, + "ewc_loss": 0.052126266062259674, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6063133191200905e-05, + "grad_norm": 32.18496322631836, + "learning_rate": 1e-06, + "loss": 0.3735, + "mean_token_accuracy": 0.8839811682701111, + "num_tokens": 896674164.0, + "step": 23505 + }, + { + "epoch": 2.9902048085485307, + "ewc_loss": 0.05220050737261772, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6100253307959065e-05, + "grad_norm": 32.28342056274414, + "learning_rate": 1e-06, + "loss": 0.4417, + "mean_token_accuracy": 0.8685294389724731, + "num_tokens": 896716613.0, + "step": 23506 + }, + { + "epoch": 2.990332018827121, + "ewc_loss": 0.05223296955227852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6116484150406905e-05, + "grad_norm": 32.229305267333984, + "learning_rate": 1e-06, + "loss": 0.4471, + "mean_token_accuracy": 0.8629493117332458, + "num_tokens": 896755586.0, + "step": 23507 + }, + { + "epoch": 2.9904592291057117, + "ewc_loss": 0.0521526038646698, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6076302674482577e-05, + "grad_norm": 32.21003723144531, + "learning_rate": 1e-06, + "loss": 0.4165, + "mean_token_accuracy": 0.8719578981399536, + "num_tokens": 896797641.0, + "step": 23508 + }, + { + "epoch": 
2.9905864393843022, + "ewc_loss": 0.0521235466003418, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6061772587127052e-05, + "grad_norm": 32.15367889404297, + "learning_rate": 1e-06, + "loss": 0.4256, + "mean_token_accuracy": 0.87018883228302, + "num_tokens": 896835113.0, + "step": 23509 + }, + { + "epoch": 2.9907136496628928, + "ewc_loss": 0.052131474018096924, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.606573616503738e-05, + "grad_norm": 32.213768005371094, + "learning_rate": 1e-06, + "loss": 0.4084, + "mean_token_accuracy": 0.873466432094574, + "num_tokens": 896866496.0, + "step": 23510 + }, + { + "epoch": 2.9908408599414833, + "ewc_loss": 0.05218967795372009, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.609483817650471e-05, + "grad_norm": 32.22004318237305, + "learning_rate": 1e-06, + "loss": 0.4253, + "mean_token_accuracy": 0.8707818984985352, + "num_tokens": 896907535.0, + "step": 23511 + }, + { + "epoch": 2.990968070220074, + "ewc_loss": 0.05205776169896126, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6028881620732136e-05, + "grad_norm": 32.16407012939453, + "learning_rate": 1e-06, + "loss": 0.3602, + "mean_token_accuracy": 0.8902624249458313, + "num_tokens": 896948574.0, + "step": 23512 + }, + { + "epoch": 2.9910952804986644, + "ewc_loss": 0.052190691232681274, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6095345674548298e-05, + "grad_norm": 32.344451904296875, + "learning_rate": 1e-06, + "loss": 0.4361, + "mean_token_accuracy": 0.867233157157898, + "num_tokens": 896992377.0, + "step": 23513 + }, + { + "epoch": 2.991222490777255, + "ewc_loss": 0.05217193439602852, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6085966965183616e-05, + "grad_norm": 32.24325180053711, + "learning_rate": 1e-06, + "loss": 0.4313, + "mean_token_accuracy": 0.8684852719306946, + "num_tokens": 897035411.0, + "step": 23514 + }, + { + "epoch": 2.9913497010558454, + "ewc_loss": 0.052067168056964874, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.6033583708340302e-05, + "grad_norm": 32.32417297363281, + "learning_rate": 1e-06, + "loss": 0.4174, + "mean_token_accuracy": 0.875778317451477, + "num_tokens": 897074661.0, + "step": 23515 + }, + { + "epoch": 2.991476911334436, + "ewc_loss": 0.052111588418483734, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6055793568957597e-05, + "grad_norm": 32.19991683959961, + "learning_rate": 1e-06, + "loss": 0.4167, + "mean_token_accuracy": 0.8753790855407715, + "num_tokens": 897115550.0, + "step": 23516 + }, + { + "epoch": 2.9916041216130265, + "ewc_loss": 0.05204306170344353, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6021531084552407e-05, + "grad_norm": 32.21583938598633, + "learning_rate": 1e-06, + "loss": 0.4344, + "mean_token_accuracy": 0.8669149875640869, + "num_tokens": 897155091.0, + "step": 23517 + }, + { + "epoch": 2.991731331891617, + "ewc_loss": 0.05208528786897659, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6042644094559364e-05, + "grad_norm": 32.25606918334961, + "learning_rate": 1e-06, + "loss": 0.4282, + "mean_token_accuracy": 0.8678820133209229, + "num_tokens": 897196551.0, + "step": 23518 + }, + { + "epoch": 2.991858542170207, + "ewc_loss": 0.052160121500492096, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6080060706590302e-05, + "grad_norm": 32.27079772949219, + "learning_rate": 1e-06, + "loss": 0.3775, + "mean_token_accuracy": 0.8832156658172607, + "num_tokens": 897231885.0, + "step": 23519 + }, + { + "epoch": 2.991985752448798, + "ewc_loss": 0.05213455483317375, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6067276849062182e-05, + "grad_norm": 32.22468948364258, + "learning_rate": 1e-06, + "loss": 0.4182, + "mean_token_accuracy": 0.8730301260948181, + "num_tokens": 897267944.0, + "step": 23520 + }, + { + "epoch": 2.992112962727388, + "ewc_loss": 0.052078571170568466, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.603928624012042e-05, + "grad_norm": 32.18292236328125, + "learning_rate": 1e-06, + "loss": 0.4088, + 
"mean_token_accuracy": 0.8763628005981445, + "num_tokens": 897307330.0, + "step": 23521 + }, + { + "epoch": 2.992240173005979, + "ewc_loss": 0.052138976752758026, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6069488740176894e-05, + "grad_norm": 32.22271728515625, + "learning_rate": 1e-06, + "loss": 0.4101, + "mean_token_accuracy": 0.8750259876251221, + "num_tokens": 897344028.0, + "step": 23522 + }, + { + "epoch": 2.992367383284569, + "ewc_loss": 0.052124638110399246, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6062318283948116e-05, + "grad_norm": 32.116004943847656, + "learning_rate": 1e-06, + "loss": 0.4399, + "mean_token_accuracy": 0.8657447099685669, + "num_tokens": 897391213.0, + "step": 23523 + }, + { + "epoch": 2.99249459356316, + "ewc_loss": 0.05209352821111679, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6046764105558395e-05, + "grad_norm": 32.18471145629883, + "learning_rate": 1e-06, + "loss": 0.4383, + "mean_token_accuracy": 0.8669829964637756, + "num_tokens": 897428222.0, + "step": 23524 + }, + { + "epoch": 2.9926218038417502, + "ewc_loss": 0.05205843597650528, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6029218133771792e-05, + "grad_norm": 32.077964782714844, + "learning_rate": 1e-06, + "loss": 0.4631, + "mean_token_accuracy": 0.8624435663223267, + "num_tokens": 897457737.0, + "step": 23525 + }, + { + "epoch": 2.992749014120341, + "ewc_loss": 0.052213456481695175, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6106728910235688e-05, + "grad_norm": 32.28591537475586, + "learning_rate": 1e-06, + "loss": 0.3905, + "mean_token_accuracy": 0.8802721500396729, + "num_tokens": 897498925.0, + "step": 23526 + }, + { + "epoch": 2.9928762243989313, + "ewc_loss": 0.05226746201515198, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6133731807931326e-05, + "grad_norm": 32.12717056274414, + "learning_rate": 1e-06, + "loss": 0.4327, + "mean_token_accuracy": 0.8688048720359802, + "num_tokens": 897533157.0, + "step": 23527 + }, + { + "epoch": 
2.993003434677522, + "ewc_loss": 0.05215168744325638, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6075844289152883e-05, + "grad_norm": 32.30357360839844, + "learning_rate": 1e-06, + "loss": 0.4089, + "mean_token_accuracy": 0.8755100965499878, + "num_tokens": 897574159.0, + "step": 23528 + }, + { + "epoch": 2.9931306449561124, + "ewc_loss": 0.052310001105070114, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6155001251026988e-05, + "grad_norm": 32.20611572265625, + "learning_rate": 1e-06, + "loss": 0.4336, + "mean_token_accuracy": 0.865034818649292, + "num_tokens": 897606970.0, + "step": 23529 + }, + { + "epoch": 2.993257855234703, + "ewc_loss": 0.05220695585012436, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.610347837617155e-05, + "grad_norm": 32.30268478393555, + "learning_rate": 1e-06, + "loss": 0.49, + "mean_token_accuracy": 0.8506700396537781, + "num_tokens": 897647725.0, + "step": 23530 + }, + { + "epoch": 2.9933850655132934, + "ewc_loss": 0.05219724401831627, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6098621674464084e-05, + "grad_norm": 32.165218353271484, + "learning_rate": 1e-06, + "loss": 0.4056, + "mean_token_accuracy": 0.8770066499710083, + "num_tokens": 897687109.0, + "step": 23531 + }, + { + "epoch": 2.993512275791884, + "ewc_loss": 0.05211571604013443, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.605785812193062e-05, + "grad_norm": 32.275787353515625, + "learning_rate": 1e-06, + "loss": 0.3872, + "mean_token_accuracy": 0.8843461275100708, + "num_tokens": 897720994.0, + "step": 23532 + }, + { + "epoch": 2.9936394860704745, + "ewc_loss": 0.052261050790548325, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6130524929612875e-05, + "grad_norm": 32.294471740722656, + "learning_rate": 1e-06, + "loss": 0.3655, + "mean_token_accuracy": 0.8888452053070068, + "num_tokens": 897749855.0, + "step": 23533 + }, + { + "epoch": 2.993766696349065, + "ewc_loss": 0.052124202251434326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.6062101824209094e-05, + "grad_norm": 32.259498596191406, + "learning_rate": 1e-06, + "loss": 0.4098, + "mean_token_accuracy": 0.8723053932189941, + "num_tokens": 897792846.0, + "step": 23534 + }, + { + "epoch": 2.9938939066276555, + "ewc_loss": 0.052058640867471695, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.602931999717839e-05, + "grad_norm": 32.133766174316406, + "learning_rate": 1e-06, + "loss": 0.4059, + "mean_token_accuracy": 0.8756394386291504, + "num_tokens": 897836765.0, + "step": 23535 + }, + { + "epoch": 2.994021116906246, + "ewc_loss": 0.052230797708034515, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6115398213732988e-05, + "grad_norm": 32.29556655883789, + "learning_rate": 1e-06, + "loss": 0.4143, + "mean_token_accuracy": 0.8738855719566345, + "num_tokens": 897880918.0, + "step": 23536 + }, + { + "epoch": 2.9941483271848366, + "ewc_loss": 0.05220818519592285, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.610409319458995e-05, + "grad_norm": 32.18134689331055, + "learning_rate": 1e-06, + "loss": 0.4189, + "mean_token_accuracy": 0.8712510466575623, + "num_tokens": 897916676.0, + "step": 23537 + }, + { + "epoch": 2.994275537463427, + "ewc_loss": 0.05207040533423424, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6035202608909458e-05, + "grad_norm": 32.196510314941406, + "learning_rate": 1e-06, + "loss": 0.3973, + "mean_token_accuracy": 0.8795353770256042, + "num_tokens": 897953968.0, + "step": 23538 + }, + { + "epoch": 2.9944027477420176, + "ewc_loss": 0.05227254703640938, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.613627293612808e-05, + "grad_norm": 32.34541320800781, + "learning_rate": 1e-06, + "loss": 0.431, + "mean_token_accuracy": 0.8728954792022705, + "num_tokens": 897993055.0, + "step": 23539 + }, + { + "epoch": 2.994529958020608, + "ewc_loss": 0.052203524857759476, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6101763069164008e-05, + "grad_norm": 32.30402374267578, + "learning_rate": 1e-06, + "loss": 0.3865, + 
"mean_token_accuracy": 0.8809653520584106, + "num_tokens": 898030202.0, + "step": 23540 + }, + { + "epoch": 2.9946571682991987, + "ewc_loss": 0.05213179811835289, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6065899874083698e-05, + "grad_norm": 32.21889877319336, + "learning_rate": 1e-06, + "loss": 0.4215, + "mean_token_accuracy": 0.8697847127914429, + "num_tokens": 898067234.0, + "step": 23541 + }, + { + "epoch": 2.9947843785777892, + "ewc_loss": 0.05213247239589691, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6066236387123354e-05, + "grad_norm": 32.246192932128906, + "learning_rate": 1e-06, + "loss": 0.3982, + "mean_token_accuracy": 0.8781290650367737, + "num_tokens": 898105423.0, + "step": 23542 + }, + { + "epoch": 2.9949115888563798, + "ewc_loss": 0.05213300511240959, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.606650195957627e-05, + "grad_norm": 32.141273498535156, + "learning_rate": 1e-06, + "loss": 0.3503, + "mean_token_accuracy": 0.8927841186523438, + "num_tokens": 898144550.0, + "step": 23543 + }, + { + "epoch": 2.99503879913497, + "ewc_loss": 0.05220906436443329, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6104531571036205e-05, + "grad_norm": 32.22050094604492, + "learning_rate": 1e-06, + "loss": 0.4093, + "mean_token_accuracy": 0.8762569427490234, + "num_tokens": 898179277.0, + "step": 23544 + }, + { + "epoch": 2.995166009413561, + "ewc_loss": 0.052197325974702835, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6098663511220366e-05, + "grad_norm": 32.23544692993164, + "learning_rate": 1e-06, + "loss": 0.409, + "mean_token_accuracy": 0.8718379735946655, + "num_tokens": 898218518.0, + "step": 23545 + }, + { + "epoch": 2.995293219692151, + "ewc_loss": 0.052205637097358704, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6102818083018064e-05, + "grad_norm": 32.16144561767578, + "learning_rate": 1e-06, + "loss": 0.4485, + "mean_token_accuracy": 0.8615512251853943, + "num_tokens": 898259266.0, + "step": 23546 + }, + { + "epoch": 
2.995420429970742, + "ewc_loss": 0.052080024033784866, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6040012016892433e-05, + "grad_norm": 32.20201110839844, + "learning_rate": 1e-06, + "loss": 0.4453, + "mean_token_accuracy": 0.8723430633544922, + "num_tokens": 898292648.0, + "step": 23547 + }, + { + "epoch": 2.995547640249332, + "ewc_loss": 0.05217926949262619, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6089634047821164e-05, + "grad_norm": 32.23299026489258, + "learning_rate": 1e-06, + "loss": 0.4127, + "mean_token_accuracy": 0.8776604533195496, + "num_tokens": 898328413.0, + "step": 23548 + }, + { + "epoch": 2.995674850527923, + "ewc_loss": 0.0522594228386879, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.612971184134949e-05, + "grad_norm": 32.18426513671875, + "learning_rate": 1e-06, + "loss": 0.4144, + "mean_token_accuracy": 0.8736592531204224, + "num_tokens": 898376963.0, + "step": 23549 + }, + { + "epoch": 2.995802060806513, + "ewc_loss": 0.05221276357769966, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.610638148325961e-05, + "grad_norm": 32.21852493286133, + "learning_rate": 1e-06, + "loss": 0.3757, + "mean_token_accuracy": 0.8849142789840698, + "num_tokens": 898407256.0, + "step": 23550 + }, + { + "epoch": 2.9959292710851035, + "ewc_loss": 0.05233072489500046, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6165362214669585e-05, + "grad_norm": 32.240116119384766, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8771468997001648, + "num_tokens": 898448800.0, + "step": 23551 + }, + { + "epoch": 2.996056481363694, + "ewc_loss": 0.05222085490822792, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6110426915693097e-05, + "grad_norm": 32.213138580322266, + "learning_rate": 1e-06, + "loss": 0.4302, + "mean_token_accuracy": 0.8705441951751709, + "num_tokens": 898488912.0, + "step": 23552 + }, + { + "epoch": 2.9961836916422846, + "ewc_loss": 0.052293822169303894, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.6146910386160016e-05, + "grad_norm": 32.19334030151367, + "learning_rate": 1e-06, + "loss": 0.4358, + "mean_token_accuracy": 0.8678778409957886, + "num_tokens": 898530434.0, + "step": 23553 + }, + { + "epoch": 2.996310901920875, + "ewc_loss": 0.052096106112003326, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.604805376904551e-05, + "grad_norm": 32.071163177490234, + "learning_rate": 1e-06, + "loss": 0.4102, + "mean_token_accuracy": 0.8741632103919983, + "num_tokens": 898565044.0, + "step": 23554 + }, + { + "epoch": 2.9964381121994657, + "ewc_loss": 0.05227048322558403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.613524156913627e-05, + "grad_norm": 32.2554931640625, + "learning_rate": 1e-06, + "loss": 0.3868, + "mean_token_accuracy": 0.8836734890937805, + "num_tokens": 898601798.0, + "step": 23555 + }, + { + "epoch": 2.996565322478056, + "ewc_loss": 0.05227392911911011, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6136964152101427e-05, + "grad_norm": 32.149600982666016, + "learning_rate": 1e-06, + "loss": 0.4435, + "mean_token_accuracy": 0.8641501665115356, + "num_tokens": 898644789.0, + "step": 23556 + }, + { + "epoch": 2.9966925327566467, + "ewc_loss": 0.05217290297150612, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.608645081636496e-05, + "grad_norm": 32.21914291381836, + "learning_rate": 1e-06, + "loss": 0.4168, + "mean_token_accuracy": 0.8723968267440796, + "num_tokens": 898683928.0, + "step": 23557 + }, + { + "epoch": 2.9968197430352372, + "ewc_loss": 0.05225047096610069, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6125235308427364e-05, + "grad_norm": 32.140228271484375, + "learning_rate": 1e-06, + "loss": 0.4277, + "mean_token_accuracy": 0.8689184188842773, + "num_tokens": 898726217.0, + "step": 23558 + }, + { + "epoch": 2.9969469533138278, + "ewc_loss": 0.05219300836324692, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6096504370798357e-05, + "grad_norm": 32.23646545410156, + "learning_rate": 1e-06, + "loss": 0.3844, + 
"mean_token_accuracy": 0.8828902244567871, + "num_tokens": 898760575.0, + "step": 23559 + }, + { + "epoch": 2.9970741635924183, + "ewc_loss": 0.05229995772242546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6149979021283798e-05, + "grad_norm": 32.183040618896484, + "learning_rate": 1e-06, + "loss": 0.4225, + "mean_token_accuracy": 0.8710298538208008, + "num_tokens": 898801232.0, + "step": 23560 + }, + { + "epoch": 2.997201373871009, + "ewc_loss": 0.05231941118836403, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6159705157624558e-05, + "grad_norm": 32.346282958984375, + "learning_rate": 1e-06, + "loss": 0.3895, + "mean_token_accuracy": 0.8793646097183228, + "num_tokens": 898837174.0, + "step": 23561 + }, + { + "epoch": 2.9973285841495994, + "ewc_loss": 0.05227558687329292, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6137793611269444e-05, + "grad_norm": 32.2276611328125, + "learning_rate": 1e-06, + "loss": 0.4512, + "mean_token_accuracy": 0.8651054501533508, + "num_tokens": 898875849.0, + "step": 23562 + }, + { + "epoch": 2.99745579442819, + "ewc_loss": 0.05222581326961517, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.611290619825013e-05, + "grad_norm": 32.32252502441406, + "learning_rate": 1e-06, + "loss": 0.3915, + "mean_token_accuracy": 0.8817803263664246, + "num_tokens": 898909975.0, + "step": 23563 + }, + { + "epoch": 2.9975830047067804, + "ewc_loss": 0.05233190581202507, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6165953386225738e-05, + "grad_norm": 32.246891021728516, + "learning_rate": 1e-06, + "loss": 0.4356, + "mean_token_accuracy": 0.8671665191650391, + "num_tokens": 898948250.0, + "step": 23564 + }, + { + "epoch": 2.997710214985371, + "ewc_loss": 0.052148886024951935, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6074443667312153e-05, + "grad_norm": 32.23457717895508, + "learning_rate": 1e-06, + "loss": 0.3778, + "mean_token_accuracy": 0.8905433416366577, + "num_tokens": 898989122.0, + "step": 23565 + }, + { + "epoch": 
2.9978374252639615, + "ewc_loss": 0.05226069316267967, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6130346668651327e-05, + "grad_norm": 32.23649215698242, + "learning_rate": 1e-06, + "loss": 0.427, + "mean_token_accuracy": 0.8687469959259033, + "num_tokens": 899028715.0, + "step": 23566 + }, + { + "epoch": 2.997964635542552, + "ewc_loss": 0.052256543189287186, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.612827120174188e-05, + "grad_norm": 32.268184661865234, + "learning_rate": 1e-06, + "loss": 0.4015, + "mean_token_accuracy": 0.8769299983978271, + "num_tokens": 899064101.0, + "step": 23567 + }, + { + "epoch": 2.9980918458211425, + "ewc_loss": 0.05215095356106758, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6075476853293367e-05, + "grad_norm": 32.184791564941406, + "learning_rate": 1e-06, + "loss": 0.4105, + "mean_token_accuracy": 0.8770792484283447, + "num_tokens": 899101759.0, + "step": 23568 + }, + { + "epoch": 2.9982190560997326, + "ewc_loss": 0.0522506907582283, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6125344447791576e-05, + "grad_norm": 32.25908279418945, + "learning_rate": 1e-06, + "loss": 0.434, + "mean_token_accuracy": 0.8676742315292358, + "num_tokens": 899142273.0, + "step": 23569 + }, + { + "epoch": 2.9983462663783236, + "ewc_loss": 0.052260030061006546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6130015612579882e-05, + "grad_norm": 32.31482696533203, + "learning_rate": 1e-06, + "loss": 0.4104, + "mean_token_accuracy": 0.8745002150535583, + "num_tokens": 899175878.0, + "step": 23570 + }, + { + "epoch": 2.9984734766569137, + "ewc_loss": 0.05219325050711632, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.609662442409899e-05, + "grad_norm": 32.33396530151367, + "learning_rate": 1e-06, + "loss": 0.3732, + "mean_token_accuracy": 0.8847329616546631, + "num_tokens": 899208937.0, + "step": 23571 + }, + { + "epoch": 2.9986006869355046, + "ewc_loss": 0.05216629430651665, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 
2.608314753160812e-05, + "grad_norm": 32.26712417602539, + "learning_rate": 1e-06, + "loss": 0.4159, + "mean_token_accuracy": 0.8736299872398376, + "num_tokens": 899244088.0, + "step": 23572 + }, + { + "epoch": 2.9987278972140947, + "ewc_loss": 0.05216323584318161, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6081617761519738e-05, + "grad_norm": 32.319881439208984, + "learning_rate": 1e-06, + "loss": 0.4368, + "mean_token_accuracy": 0.86916184425354, + "num_tokens": 899280111.0, + "step": 23573 + }, + { + "epoch": 2.9988551074926852, + "ewc_loss": 0.05224088951945305, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6120444090338424e-05, + "grad_norm": 32.39230728149414, + "learning_rate": 1e-06, + "loss": 0.4181, + "mean_token_accuracy": 0.8731918931007385, + "num_tokens": 899310882.0, + "step": 23574 + }, + { + "epoch": 2.9989823177712758, + "ewc_loss": 0.052211176604032516, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6105588403879665e-05, + "grad_norm": 32.15555191040039, + "learning_rate": 1e-06, + "loss": 0.4687, + "mean_token_accuracy": 0.8607510328292847, + "num_tokens": 899346820.0, + "step": 23575 + }, + { + "epoch": 2.9991095280498663, + "ewc_loss": 0.05223304033279419, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6116520530194975e-05, + "grad_norm": 32.259254455566406, + "learning_rate": 1e-06, + "loss": 0.373, + "mean_token_accuracy": 0.8872630596160889, + "num_tokens": 899382044.0, + "step": 23576 + }, + { + "epoch": 2.999236738328457, + "ewc_loss": 0.05230811610817909, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6154057195526548e-05, + "grad_norm": 32.42197036743164, + "learning_rate": 1e-06, + "loss": 0.4053, + "mean_token_accuracy": 0.8744838237762451, + "num_tokens": 899418389.0, + "step": 23577 + }, + { + "epoch": 2.9993639486070474, + "ewc_loss": 0.05223635956645012, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.611817944853101e-05, + "grad_norm": 32.259971618652344, + "learning_rate": 1e-06, + "loss": 0.4292, + 
"mean_token_accuracy": 0.8722593188285828, + "num_tokens": 899458486.0, + "step": 23578 + }, + { + "epoch": 2.999491158885638, + "ewc_loss": 0.052200205624103546, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.610010233183857e-05, + "grad_norm": 32.34835433959961, + "learning_rate": 1e-06, + "loss": 0.4214, + "mean_token_accuracy": 0.8723591566085815, + "num_tokens": 899498168.0, + "step": 23579 + }, + { + "epoch": 2.9996183691642284, + "ewc_loss": 0.05216682702302933, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6083413104061037e-05, + "grad_norm": 32.08125305175781, + "learning_rate": 1e-06, + "loss": 0.3648, + "mean_token_accuracy": 0.8902066946029663, + "num_tokens": 899538655.0, + "step": 23580 + }, + { + "epoch": 2.999745579442819, + "ewc_loss": 0.05223885923624039, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6119429094251245e-05, + "grad_norm": 32.37930679321289, + "learning_rate": 1e-06, + "loss": 0.4481, + "mean_token_accuracy": 0.8627644777297974, + "num_tokens": 899581693.0, + "step": 23581 + }, + { + "epoch": 2.9998727897214095, + "ewc_loss": 0.05229523032903671, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.614761433505919e-05, + "grad_norm": 32.28519821166992, + "learning_rate": 1e-06, + "loss": 0.4258, + "mean_token_accuracy": 0.8705236315727234, + "num_tokens": 899623817.0, + "step": 23582 + }, + { + "epoch": 3.0, + "ewc_loss": 0.052088044583797455, + "ewc_loss_diag": 0.0, + "ewc_loss_parallel": 2.6044022888527252e-05, + "grad_norm": 32.264102935791016, + "learning_rate": 1e-06, + "loss": 0.4051, + "mean_token_accuracy": 0.8773747086524963, + "num_tokens": 899664226.0, + "step": 23583 + }, + { + "epoch": 3.0, + "step": 23583, + "total_flos": 5.62815163329864e+19, + "train_loss": 0.0, + "train_runtime": 852.4232, + "train_samples_per_second": 442.632, + "train_steps_per_second": 27.666 + } + ], + "logging_steps": 1, + "max_steps": 23583, + "num_input_tokens_seen": 0, + "num_train_epochs": 3, + "save_steps": 11792, + 
"stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.62815163329864e+19, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +} diff --git a/training_args.bin b/training_args.bin new file mode 100644 index 0000000..e375938 --- /dev/null +++ b/training_args.bin @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a7cfd3d765fcfb20d71d623709fffb59c309c5cd769214b074c10653f5ef86f +size 13393